Skip to content

Commit

Permalink
initial support for specific ingredients parsing #6242
Browse files Browse the repository at this point in the history
  • Loading branch information
stephanegigandet committed Jan 4, 2022
1 parent 1116cad commit ddfed6a
Show file tree
Hide file tree
Showing 7 changed files with 716 additions and 25 deletions.
144 changes: 128 additions & 16 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -852,17 +852,120 @@ my %min_regexp = (

# Words that can be ignored after a percent
# e.g. 50% du poids total, 30% of the total weight
#
# Each value is a regular expression fragment, keyed by language code.
# Groups need to be non-capturing (prefixed with (?: ) because the fragment
# is interpolated into the percent regexp, whose only capturing group must
# remain the number itself.
# Each language code must appear only once: a later duplicate key would
# silently override an earlier one.

my %ignore_strings_after_percent = (
en => "of (?:the )?(?:total weight|grain is wholegrain rye)",
es => "(?:en el chocolate(?: con leche)?)",
fi => "jauhojen määrästä",
fr => "(?:dans le chocolat(?: (?:blanc|noir|au lait))?)|(?:du poids total|du poids)",
sv => "fetthalt",
);



=head2 parse_specific_ingredients_text ( $product_ref, $text, $percent_regexp )

Lists of ingredients sometimes include extra mentions for specific ingredients
at the end of the ingredients list. e.g. "Prepared with 50g of fruits for 100g of finished product".

This function extracts those mentions, removes them from the text,
and records them in $product_ref->{specific_ingredients}.

=head3 Arguments

=head4 $product_ref

Reference to the product hash. $product_ref->{lc} selects the language specific
patterns (patterns currently exist for "en" and "fr" only).
$product_ref->{specific_ingredients} is reset at each call.

=head4 $text

Text of the ingredients list.

=head4 $percent_regexp

Source (string) of a regular expression that matches a quantity like "50%" or "50 g".
It must contain exactly one capturing group, which captures the number:
the patterns below rely on the numbering of the capturing groups.

=head3 Return values

=head4 text

The text with each matched mention replaced by a single space.

=head4 specific_ingredients structure

$product_ref->{specific_ingredients} is a hash keyed by the canonical ingredient id
returned by canonicalize_taxonomy_tag(). Each value is a hash with the fields:

- ingredient: the ingredient name as written in the text
- text: the mention that was matched and removed from the text
- percent: the quantity captured by $percent_regexp (only set if defined)

The specific_ingredients key is deleted if no mention was found.

=cut

sub parse_specific_ingredients_text($$$) {

    my $product_ref = shift;
    my $text = shift;
    my $percent_regexp = shift;

    # A product without a language code has no patterns to apply;
    # default to an empty string to avoid "uninitialized value" warnings in eq
    my $product_lc = defined $product_ref->{lc} ? $product_ref->{lc} : '';

    $product_ref->{specific_ingredients} = {};

    # Patterns for the language of the product, tried in order: the first one that matches wins.
    # ingredient_capture and percent_capture are the numbers of the capturing groups
    # ($1, $2) that hold the ingredient name and the quantity.

    # Note: in regular expressions below, use non-capturing groups (starting with (?: )
    # for all groups, except groups that capture actual data: ingredient name, percent, origins

    # Regexps should match until we reach a . ; or the end of the text

    my @patterns = ();

    if ($product_lc eq "en") {
        @patterns = (
            # examples:
            # Total Milk Content 73%.
            {
                regexp => qr/\s*(?:total |min |minimum )?([^,.;-]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i,
                ingredient_capture => 1,
                percent_capture => 2,
            },
        );
    }
    elsif ($product_lc eq "fr") {
        @patterns = (
            # examples:
            # Préparée avec 50 g de fruits pour 100 g de produit fini.
            {
                regexp => qr/\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;-]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i,
                ingredient_capture => 2,
                percent_capture => 1,
            },
            # examples:
            # Teneur en lait 25% minimum.
            # Teneur en lactose < 0,01 g/100 g.
            # Teneur totale en sucres : 60 g pour 100 g de produit fini.
            {
                regexp => qr/\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;-]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g|gr|grammes)(?:[^,.;-]*?))?(?:;|\.| - |$)/i,
                ingredient_capture => 1,
                percent_capture => 2,
            },
        );
    }

    # Go through the ingredients list multiple times, as long as we have one match.
    # An explicit flag is used so that an ingredient name that evaluates
    # to false (e.g. "0") does not stop the loop.
    my $found_match = 1;

    while ($found_match) {

        $found_match = 0;

        foreach my $pattern_ref (@patterns) {

            # In list context, a successful match returns the captured groups ($1, $2, ...)
            my @captures = ($text =~ $pattern_ref->{regexp});
            next if scalar @captures == 0;

            # Offsets of the whole match: copy them before any other regexp is run
            my ($match_start, $match_end) = ($-[0], $+[0]);

            my $ingredient = $captures[$pattern_ref->{ingredient_capture} - 1];
            my $percent = $captures[$pattern_ref->{percent_capture} - 1];
            my $matched_text = substr($text, $match_start, $match_end - $match_start);

            # Remove the matched text
            $text = substr($text, 0, $match_start) . ' ' . substr($text, $match_end);

            # If we found an ingredient, save it in specific_ingredients
            if (defined $ingredient) {
                my $ingredient_id = canonicalize_taxonomy_tag($product_lc, "ingredients", $ingredient);

                # We might have an ingredient specified multiple times (e.g. once for percent, another for origins or labels)
                my $specific_ingredient_ref = ($product_ref->{specific_ingredients}{$ingredient_id} ||= {});
                $specific_ingredient_ref->{ingredient} = $ingredient;
                $specific_ingredient_ref->{text} = $matched_text;

                defined $percent and $specific_ingredient_ref->{percent} = $percent;
            }

            # The text changed: start again from the first pattern
            $found_match = 1;
            last;
        }
    }

    # Delete specific ingredients if empty
    if (scalar keys %{$product_ref->{specific_ingredients}} == 0) {
        delete $product_ref->{specific_ingredients};
    }

    return $text;
}


=head2 parse_ingredients_text ( product_ref )
Parse the ingredients_text field to extract individual ingredients.
Expand Down Expand Up @@ -918,14 +1021,6 @@ sub parse_ingredients_text($) {

my $level = 0;

# Farine de blé 56 g* ; beurre concentré 25 g* (soit 30 g* en beurre reconstitué); sucre 22 g* ; œufs frais 2 g
# 56 g -> 56%
$text =~ s/(\d| )g(\*)/$1g/ig;

# transform 0,2% into 0.2%
$text =~ s/(\d),(\d+)( )?(\%|g\b)/$1.$2\%/ig;
$text =~ s//-/g;

# assume commas between numbers are part of the name
# e.g. en:2-Bromo-2-Nitropropane-1,3-Diol, Bronopol
# replace by a lower comma ‚
Expand All @@ -943,7 +1038,10 @@ sub parse_ingredients_text($) {
$ignore_strings_after_percent = $ignore_strings_after_percent{$product_lc};
}

my $percent_regexp = '(<|' . $min_regexp . '|\s|\.|:)*(\d+((\,|\.)\d+)?)\s*(\%|g)\s*(' . $min_regexp . '|' . $ignore_strings_after_percent . '|\s|\)|\]|\}|\*)*';
my $percent_regexp = '(?:<|' . $min_regexp . '|\s|\.|:)*(\d+(?:(?:\,|\.)\d+)?)\s*(?:\%|g)\s*(?:' . $min_regexp . '|' . $ignore_strings_after_percent . '|\s|\)|\]|\}|\*)*';

# Extract phrases related to specific ingredients at the end of the ingredients list
$text = parse_specific_ingredients_text($product_ref, $text, $percent_regexp);

my $analyze_ingredients_function = sub($$$$) {

Expand Down Expand Up @@ -1022,7 +1120,7 @@ sub parse_ingredients_text($) {

if (($between =~ $separators) and ($` =~ /^$percent_regexp$/i)) {

$percent = $2;
$percent = $1;
# remove what is before the first separator
$between =~ s/(.*?)$separators//;
$debug_ingredients and $log->debug("separator found after percent", { between => $between, percent => $percent }) if $log->is_debug();
Expand All @@ -1048,7 +1146,7 @@ sub parse_ingredients_text($) {

if ($between =~ /^$percent_regexp$/i) {

$percent = $2;
$percent = $1;
$debug_ingredients and $log->debug("between is a percent", { between => $between, percent => $percent }) if $log->is_debug();
$between = '';
}
Expand Down Expand Up @@ -1146,7 +1244,7 @@ sub parse_ingredients_text($) {
}

if ($after =~ /^$percent_regexp($separators|$)/i) {
$percent = $2;
$percent = $1;
$after = $';
$debug_ingredients and $log->debug("after started with a percent", { after => $after, percent => $percent }) if $log->is_debug();
}
Expand Down Expand Up @@ -1243,7 +1341,7 @@ sub parse_ingredients_text($) {

# Strawberry 10.3%
if ($ingredient =~ /\s$percent_regexp$/i) {
$percent = $2;
$percent = $1;
$debug_ingredients and $log->debug("percent found after", { ingredient => $ingredient, percent => $percent, new_ingredient => $`}) if $log->is_debug();
$ingredient = $`;
}
Expand Down Expand Up @@ -4298,6 +4396,20 @@ sub preparse_ingredients_text($$) {
# turn & to and
$text =~ s/ \& /$and/g;

# number + gr / grams -> g
$text =~ s/(\d\s*)(gr|gram|grams)\b/$1g/ig;
if ($product_lc eq 'fr') {
$text =~ s/(\d\s*)(gramme|grammes)\b/$1g/ig;
}

# Farine de blé 56 g* ; beurre concentré 25 g* (soit 30 g* en beurre reconstitué); sucre 22 g* ; œufs frais 2 g
# 56 g -> 56%
$text =~ s/(\d| )g(\*)/$1g/ig;

# transform 0,2% into 0.2%
$text =~ s/(\d),(\d+)( )?(\%|g\b)/$1.$2\%/ig;
$text =~ s//-/g;

# abbreviations, replace language specific abbreviations first
foreach my $abbreviations_lc ($product_lc, "all") {
if (defined $abbreviations{$abbreviations_lc}) {
Expand Down
87 changes: 87 additions & 0 deletions t/expected_test_results/ingredients/en-specific-ingredients.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"ingredients" : [
{
"id" : "en:milk",
"percent_estimate" : 66.6666666666667,
"percent_max" : 100,
"percent_min" : 33.3333333333333,
"text" : "Milk",
"vegan" : "no",
"vegetarian" : "yes"
},
{
"id" : "en:cream",
"percent_estimate" : 16.6666666666667,
"percent_max" : 50,
"percent_min" : 0,
"text" : "cream",
"vegan" : "no",
"vegetarian" : "yes"
},
{
"id" : "en:sugar",
"percent_estimate" : 16.6666666666667,
"percent_max" : 33.3333333333333,
"percent_min" : 0,
"text" : "sugar",
"vegan" : "yes",
"vegetarian" : "yes"
}
],
"ingredients_analysis_tags" : [
"en:palm-oil-free",
"en:non-vegan",
"en:vegetarian"
],
"ingredients_hierarchy" : [
"en:milk",
"en:dairy",
"en:cream",
"en:sugar",
"en:added-sugar",
"en:disaccharide"
],
"ingredients_n" : 3,
"ingredients_n_tags" : [
"3",
"1-10"
],
"ingredients_original_tags" : [
"en:milk",
"en:cream",
"en:sugar"
],
"ingredients_percent_analysis" : 1,
"ingredients_tags" : [
"en:milk",
"en:dairy",
"en:cream",
"en:sugar",
"en:added-sugar",
"en:disaccharide"
],
"ingredients_text" : "Milk, cream, sugar. Sugar content: 3 %. Total milk content: 75.2g",
"ingredients_with_specified_percent_n" : 0,
"ingredients_with_specified_percent_sum" : 0,
"ingredients_with_unspecified_percent_n" : 3,
"ingredients_with_unspecified_percent_sum" : 100,
"known_ingredients_n" : 6,
"lc" : "en",
"nutriments" : {
"fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0,
"fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0
},
"specific_ingredients" : {
"en:milk" : {
"ingredient" : "milk",
"percent" : "75.2",
"text" : " Total milk content: 75.2g"
},
"en:sugar" : {
"ingredient" : "Sugar",
"percent" : "3",
"text" : " Sugar content: 3 %."
}
},
"unknown_ingredients_n" : 0
}
Loading

0 comments on commit ddfed6a

Please sign in to comment.