Skip to content

Commit

Permalink
feat: Initial support for specific ingredients parsing (#6243)
Browse files Browse the repository at this point in the history
* initial support for specific ingredients parsing #6242

* fix utf8 issue

* use specific ingredients for fruits/vegetables in nutriscore computation

* update test

* use Modern::Perl

* add support for origins in specific ingredients

* fix 'teneur en légumes de 100%'

* update test

* make specific_ingredients an array, display them in details
  • Loading branch information
stephanegigandet authored Jan 6, 2022
1 parent c0605a4 commit f69e9a9
Show file tree
Hide file tree
Showing 46 changed files with 1,635 additions and 415 deletions.
66 changes: 59 additions & 7 deletions lib/ProductOpener/Display.pm
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ BEGIN
&add_tag_prefix_to_link
&display_taxonomy_api
&display_ingredient_analysis
&display_nested_list_of_ingredients
&display_ingredients_analysis_details
&display_ingredients_analysis
&display_possible_improvement_description
Expand Down Expand Up @@ -10815,7 +10815,7 @@ sub display_icon {
}


=head2 display_ingredient_analysis ( $ingredients_ref, $ingredients_text_ref, $ingredients_list_ref )
=head2 display_nested_list_of_ingredients ( $ingredients_ref, $ingredients_text_ref, $ingredients_list_ref )
Recursive function to display how the ingredients were analyzed.
This function calls itself to display sub-ingredients of ingredients.
Expand All @@ -10832,11 +10832,11 @@ Reference to a list of ingredients in text format that we will reconstruct from
=head4 $ingredients_list_ref (output)
Reference to a list of ingredients in ordered nested list format that corresponds to the ingredients array.
Reference to an HTML list of ingredients in ordered nested list format that corresponds to the ingredients array.
=cut

sub display_ingredient_analysis($$$) {
sub display_nested_list_of_ingredients($$$) {

my $ingredients_ref = shift;
my $ingredients_text_ref = shift;
Expand Down Expand Up @@ -10874,7 +10874,7 @@ sub display_ingredient_analysis($$$) {

if (defined $ingredient_ref->{ingredients}) {
${$ingredients_text_ref} .= " (";
display_ingredient_analysis($ingredient_ref->{ingredients}, $ingredients_text_ref, $ingredients_list_ref);
display_nested_list_of_ingredients($ingredient_ref->{ingredients}, $ingredients_text_ref, $ingredients_list_ref);
${$ingredients_text_ref} .= ")";
}

Expand All @@ -10887,6 +10887,55 @@ sub display_ingredient_analysis($$$) {
}


=head2 display_list_of_specific_ingredients ( $product_ref )

Generate HTML to display how the specific ingredients (e.g. mentions like "Total milk content: 90%")
were analyzed.

=head3 Parameters

=head4 $product_ref

Reference to the product hash. Only its specific_ingredients field is read: an array of
hashes with the keys text, ingredient and id, plus optional properties
(origin, labels, vegan, vegetarian, from_palm_oil, percent_min, percent, percent_max).

=head3 Return value

Empty string if the product has no specific_ingredients field, or an HTML unordered list
with one item per specific ingredient. Ingredients whose id is not found in the ingredients
taxonomy are wrapped in a span with the class "unknown_ingredient".

All values taken from the product are HTML escaped, as they come from the ingredients text
and can contain characters like < (e.g. "Teneur en lactose < 0.01%").

=cut

sub display_list_of_specific_ingredients($) {

    my $product_ref = shift;

    if (not defined $product_ref->{specific_ingredients}) {
        return "";
    }

    my $html = "<ul id=\"specific_ingredients_list\">\n";

    foreach my $ingredient_ref (@{$product_ref->{specific_ingredients}}) {

        # Mark ingredients that are not in the ingredients taxonomy, so that
        # the caller can detect and style them through the unknown_ingredient class
        my $ingredients_exists = exists_taxonomy_tag("ingredients", $ingredient_ref->{id});
        my $class = '';
        if (not $ingredients_exists) {
            $class = ' class="unknown_ingredient"';
        }

        # Original mention, then the extracted ingredient name and its canonical id
        $html .= "<li>" . _escape_specific_ingredient_html($ingredient_ref->{text}) . "<br>"
            . "<span$class>" . _escape_specific_ingredient_html($ingredient_ref->{ingredient}) . "</span>"
            . " -> " . _escape_specific_ingredient_html($ingredient_ref->{id});

        # Properties are listed in a fixed order, and only when they are set
        foreach my $property (qw(origin labels vegan vegetarian from_palm_oil percent_min percent percent_max)) {
            if (defined $ingredient_ref->{$property}) {
                $html .= " - " . $property . ":&nbsp;" . _escape_specific_ingredient_html($ingredient_ref->{$property});
            }
        }

        $html .= "</li>\n";
    }

    $html .= "</ul>\n";

    return $html;
}

# Escape the characters that have a special meaning in HTML (& < > ")
# in a value that is inserted in the specific ingredients list.
# An undefined value is turned into an empty string.
sub _escape_specific_ingredient_html {

    my $value = shift;

    return "" if not defined $value;

    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );

    $value =~ s/([&<>"])/$entity_for{$1}/g;

    return $value;
}


=head2 display_ingredients_analysis_details ( $product_ref )
Expand All @@ -10912,12 +10961,14 @@ sub display_ingredients_analysis_details($) {
my $ingredients_text = "";
my $ingredients_list = "";

display_ingredient_analysis($product_ref->{ingredients}, \$ingredients_text, \$ingredients_list);
display_nested_list_of_ingredients($product_ref->{ingredients}, \$ingredients_text, \$ingredients_list);

my $specific_ingredients = display_list_of_specific_ingredients($product_ref);

my $unknown_ingredients_html = '';
my $unknown_ingredients_help_html = '';

if ($ingredients_text =~ /unknown_ingredient/) {
if ($ingredients_text . $specific_ingredients =~ /unknown_ingredient/) {
$template_data_ref->{ingredients_text_comp} = 'unknown_ingredient';

$styles .= <<CSS
Expand All @@ -10930,6 +10981,7 @@ CSS

$template_data_ref->{ingredients_text} = $ingredients_text;
$template_data_ref->{ingredients_list} = $ingredients_list;
$template_data_ref->{specific_ingredients} = $specific_ingredients;

my $html;

Expand Down
201 changes: 185 additions & 16 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -852,17 +852,154 @@ my %min_regexp = (

# Words that can be ignored after a percent
# e.g. 50% du poids total, 30% of the total weight
# groups need to be non-capturing: prefixed with (?:

my %ignore_strings_after_percent = (
en => "of (the )?(?:total weight|grain is wholegrain rye)",
es => "(en el chocolate( con leche)?)",
en => "of (?:the )?(?:total weight|grain is wholegrain rye)",
es => "(?:en el chocolate(?: con leche)?)",
fi => "jauhojen määrästä",
fr => "(dans le chocolat( (blanc|noir|au lait))?)|(du poids total|du poids)",
fr => "(?:dans le chocolat(?: (?:blanc|noir|au lait))?)|(?:du poids total|du poids)",
sv => "fetthalt",
);



=head2 parse_specific_ingredients_text ( $product_ref, $text, $percent_regexp )

Lists of ingredients sometime include extra mentions for specific ingredients
at the end of the ingredients list. e.g. "Prepared with 50g of fruits for 100g of finished product".

This function extracts those mentions and adds them to a special specific_ingredients structure.

Mentions are currently recognized for products in English (en) and French (fr) only.

=head3 Parameters

=head4 $product_ref

Reference to the product hash. The lc field selects the language of the patterns,
and the specific_ingredients field is overwritten.

=head4 $text

Ingredients text to parse.

=head4 $percent_regexp

Regular expression (as a string) that matches a percent or a quantity in grams.
It must contain exactly one capturing group, which captures the number.

=head3 Return values

=head4 specific_ingredients structure

Array of specific ingredients, stored in $product_ref->{specific_ingredients}.
Each entry is a hash with the keys id (canonical ingredient id), ingredient (name as written),
text (the matched mention), and either percent or origin.
The field is deleted if no specific ingredient was found.

=head4 remaining text

The function returns the ingredients text without the mentions that were extracted.
Each extracted mention is replaced by one space.

=cut

sub parse_specific_ingredients_text($$$) {

    my $product_ref = shift;
    my $text = shift;
    my $percent_regexp = shift;

    my $product_lc = $product_ref->{lc};

    $product_ref->{specific_ingredients} = [];

    # Patterns for each language, tried in order: the first one that matches wins.
    # The ingredient / percent / origin values are the numbers of the capturing groups.
    #
    # Note: in regular expressions below, use non-capturing groups (starting with (?: )
    # for all groups, except groups that capture actual data: ingredient name, percent, origins
    #
    # Regexps should match until we reach a . ; or the end of the text
    #
    # Note: the origin regexps do not currently match multiple origins with commas
    # (e.g. "Origins of milk: UK, UE") in order to not overmatch something like
    # "Origin of milk: UK, some other mention."
    # In the future, we could try to be smarter and match more if we can recognize
    # the next words exist in the origins taxonomy.

    my %patterns_for = (
        en => [
            # Total Milk Content 73%.
            {
                regexp => qr/\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i,
                ingredient => 1,
                percent => 2,
            },
            # Origin of the milk: United Kingdom
            {
                regexp => qr/\s*(?:origin of (?:the )?)([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i,
                ingredient => 1,
                origin => 2,
            },
        ],
        fr => [
            # Préparée avec 50 g de fruits pour 100 g de produit fini.
            {
                regexp => qr/\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i,
                percent => 1,
                ingredient => 2,
            },
            # Teneur en lait 25% minimum.
            # Teneur en lactose < 0,01 g/100 g.
            # Teneur totale en sucres : 60 g pour 100 g de produit fini.
            # Teneur en citron de 100%
            {
                regexp => qr/\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i,
                ingredient => 1,
                percent => 2,
            },
            # Origine du Cacao: Pérou
            {
                regexp => qr/\s*(?:origine (?:de |du |de la |des |de l'))([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i,
                ingredient => 1,
                origin => 2,
            },
        ],
    );

    my $patterns_ref = [];
    if ((defined $product_lc) and (defined $patterns_for{$product_lc})) {
        $patterns_ref = $patterns_for{$product_lc};
    }

    # Go through the ingredient list multiple times, as long as we have one match.
    # The loop is driven by an explicit flag and not by the truth of the captured
    # ingredient, so that an ingredient like "0" does not stop the parsing.
    my $found_mention = ((defined $text) && (scalar @{$patterns_ref} > 0));

    while ($found_mention) {

        $found_mention = 0;

        foreach my $pattern_ref (@{$patterns_ref}) {

            my @captures = ($text =~ $pattern_ref->{regexp});
            next if not @captures;

            # Offsets of the whole match, read before $text is modified
            my ($match_start, $match_end) = ($-[0], $+[0]);
            my $matched_text = substr($text, $match_start, $match_end - $match_start);

            # Remove the matched text
            $text = substr($text, 0, $match_start) . ' ' . substr($text, $match_end);

            $matched_text =~ s/^\s+//;

            my $ingredient = $captures[$pattern_ref->{ingredient} - 1];

            my $specific_ingredients_ref = {
                id => canonicalize_taxonomy_tag($product_lc, "ingredients", $ingredient),
                ingredient => $ingredient,
                text => $matched_text,
            };

            if (defined $pattern_ref->{percent}) {
                my $percent = $captures[$pattern_ref->{percent} - 1];
                if (defined $percent) {
                    # The percent is used as a number later on: turn a decimal comma (50,5) into a dot
                    $percent =~ tr/,/./;
                    $specific_ingredients_ref->{percent} = $percent;
                }
            }

            if (defined $pattern_ref->{origin}) {
                my $origin = $captures[$pattern_ref->{origin} - 1];
                if (defined $origin) {
                    $specific_ingredients_ref->{origin}
                        = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origin));
                }
            }

            push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;

            # Start again from the first pattern on the shortened text
            $found_mention = 1;
            last;
        }
    }

    # Delete specific ingredients if empty
    if (scalar @{$product_ref->{specific_ingredients}} == 0) {
        delete $product_ref->{specific_ingredients};
    }

    return $text;
}


=head2 parse_ingredients_text ( product_ref )
Parse the ingredients_text field to extract individual ingredients.
Expand Down Expand Up @@ -918,14 +1055,6 @@ sub parse_ingredients_text($) {

my $level = 0;

# Farine de blé 56 g* ; beurre concentré 25 g* (soit 30 g* en beurre reconstitué); sucre 22 g* ; œufs frais 2 g
# 56 g -> 56%
$text =~ s/(\d| )g(\*)/$1g/ig;

# transform 0,2% into 0.2%
$text =~ s/(\d),(\d+)( )?(\%|g\b)/$1.$2\%/ig;
$text =~ s//-/g;

# assume commas between numbers are part of the name
# e.g. en:2-Bromo-2-Nitropropane-1,3-Diol, Bronopol
# replace by a lower comma ‚
Expand All @@ -943,7 +1072,10 @@ sub parse_ingredients_text($) {
$ignore_strings_after_percent = $ignore_strings_after_percent{$product_lc};
}

my $percent_regexp = '(<|' . $min_regexp . '|\s|\.|:)*(\d+((\,|\.)\d+)?)\s*(\%|g)\s*(' . $min_regexp . '|' . $ignore_strings_after_percent . '|\s|\)|\]|\}|\*)*';
my $percent_regexp = '(?:<|' . $min_regexp . '|\s|\.|:)*(\d+(?:(?:\,|\.)\d+)?)\s*(?:\%|g)\s*(?:' . $min_regexp . '|' . $ignore_strings_after_percent . '|\s|\)|\]|\}|\*)*';

# Extract phrases related to specific ingredients at the end of the ingredients list
$text = parse_specific_ingredients_text($product_ref, $text, $percent_regexp);

my $analyze_ingredients_function = sub($$$$) {

Expand Down Expand Up @@ -1022,7 +1154,7 @@ sub parse_ingredients_text($) {

if (($between =~ $separators) and ($` =~ /^$percent_regexp$/i)) {

$percent = $2;
$percent = $1;
# remove what is before the first separator
$between =~ s/(.*?)$separators//;
$debug_ingredients and $log->debug("separator found after percent", { between => $between, percent => $percent }) if $log->is_debug();
Expand All @@ -1048,7 +1180,7 @@ sub parse_ingredients_text($) {

if ($between =~ /^$percent_regexp$/i) {

$percent = $2;
$percent = $1;
$debug_ingredients and $log->debug("between is a percent", { between => $between, percent => $percent }) if $log->is_debug();
$between = '';
}
Expand Down Expand Up @@ -1146,7 +1278,7 @@ sub parse_ingredients_text($) {
}

if ($after =~ /^$percent_regexp($separators|$)/i) {
$percent = $2;
$percent = $1;
$after = $';
$debug_ingredients and $log->debug("after started with a percent", { after => $after, percent => $percent }) if $log->is_debug();
}
Expand Down Expand Up @@ -1243,7 +1375,7 @@ sub parse_ingredients_text($) {

# Strawberry 10.3%
if ($ingredient =~ /\s$percent_regexp$/i) {
$percent = $2;
$percent = $1;
$debug_ingredients and $log->debug("percent found after", { ingredient => $ingredient, percent => $percent, new_ingredient => $`}) if $log->is_debug();
$ingredient = $`;
}
Expand Down Expand Up @@ -4298,6 +4430,20 @@ sub preparse_ingredients_text($$) {
# turn & to and
$text =~ s/ \& /$and/g;

# number + gr / grams -> g
$text =~ s/(\d\s*)(gr|gram|grams)\b/$1g/ig;
if ($product_lc eq 'fr') {
$text =~ s/(\d\s*)(gramme|grammes)\b/$1g/ig;
}

# Farine de blé 56 g* ; beurre concentré 25 g* (soit 30 g* en beurre reconstitué); sucre 22 g* ; œufs frais 2 g
# 56 g -> 56%
$text =~ s/(\d| )g(\*)/$1g/ig;

# transform 0,2% into 0.2%
$text =~ s/(\d),(\d+)( )?(\%|g\b)/$1.$2\%/ig;
$text =~ s//-/g;

# abbreviations, replace language specific abbreviations first
foreach my $abbreviations_lc ($product_lc, "all") {
if (defined $abbreviations{$abbreviations_lc}) {
Expand Down Expand Up @@ -5614,6 +5760,29 @@ sub estimate_nutriscore_fruits_vegetables_nuts_value_from_ingredients($) {
(defined $product_ref->{nutriments}) or $product_ref->{nutriments} = {};

$product_ref->{nutriments}{"fruits-vegetables-nuts-estimate-from-ingredients_100g"} = add_fruits($product_ref->{ingredients});
}

# If we have specific ingredients, check if we have a higher fruits / vegetables content
if (defined $product_ref->{specific_ingredients}) {
my $fruits = 0;
foreach my $ingredient_ref (@{$product_ref->{specific_ingredients}}) {
my $ingredient_id = $ingredient_ref->{id};
if (defined $ingredient_ref->{percent}) {
my $nutriscore_fruits_vegetables_nuts = get_inherited_property("ingredients", $ingredient_id, "nutriscore_fruits_vegetables_nuts:en");

if ((defined $nutriscore_fruits_vegetables_nuts) and ($nutriscore_fruits_vegetables_nuts eq "yes")) {
$fruits += $ingredient_ref->{percent};
}
}
}

if (($fruits > 0) and ((not defined $product_ref->{nutriments}{"fruits-vegetables-nuts-estimate-from-ingredients_100g"})
or ($fruits > $product_ref->{nutriments}{"fruits-vegetables-nuts-estimate-from-ingredients_100g"}))) {
$product_ref->{nutriments}{"fruits-vegetables-nuts-estimate-from-ingredients_100g"} = $fruits;
}
}

if (defined $product_ref->{nutriments}{"fruits-vegetables-nuts-estimate-from-ingredients_100g"}) {
$product_ref->{nutriments}{"fruits-vegetables-nuts-estimate-from-ingredients_serving"} = $product_ref->{nutriments}{"fruits-vegetables-nuts-estimate-from-ingredients_100g"};
}

Expand Down
Loading

0 comments on commit f69e9a9

Please sign in to comment.