diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm index 088822ede3243..c0fdd1d160e7e 100644 --- a/lib/ProductOpener/Ingredients.pm +++ b/lib/ProductOpener/Ingredients.pm @@ -517,7 +517,9 @@ my @labels = ( "en:vegetarian", "nl:beter-leven-1-ster", "nl:beter-leven-2-ster", "nl:beter-leven-3-ster", "en:halal", "en:kosher", - "en:fed-without-gmos", + "en:fed-without-gmos", "fr:crc", + "en:without-gluten", "en:sustainable-farming", + "en:krav", ); my %labels_regexps = (); @@ -536,6 +538,9 @@ sub init_labels_regexps() { foreach my $labelid (@labels) { + # Canonicalize the label ids in case the normalized id changed + $labelid = canonicalize_taxonomy_tag("en", "labels", $labelid); + foreach my $label_lc (keys %{$translations_to{labels}{$labelid}}) { # the synonyms below also contain the main translation as the first entry @@ -2254,20 +2259,42 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) { {ingredient => $ingredient, labelid => $labelid, regexp => $regexp}) if $log->is_trace(); if ((defined $regexp) and ($ingredient =~ /\b($regexp)\b/i)) { + + my $label = $1; + if (defined $labels) { $labels .= ", " . $labelid; } else { $labels = $labelid; } - $ingredient = $` . ' ' . $'; + + # Remove stopwords after or before the label + # e.g. "Abricots from sustainable farming" -> "Abricots" + "from" + "sustainable farming" -> "Abricots" + my $before_the_label = $`; + my $after_the_label = $'; + + $before_the_label = remove_stopwords_from_start_or_end_of_string("labels", $ingredients_lc, + $before_the_label); + + # Don't remove stopwords on $after_the_label, as it can remove words we want to keep + # e.g. "Cacao issu de l'agriculture biologique de Madagascar": need to keep "de" in "Cacao de Madagascar" + + $ingredient = $before_the_label . ' ' . $after_the_label; $ingredient =~ s/\s+/ /g; - # If the ingredient is just the label + sub ingredients (e.g. "vegan (orange juice)") - # then we replace the now empty ingredient by the sub ingredients - if (($ingredient =~ /^\s*$/) and (defined $between) and ($between ne "")) { - $ingredient = $between; - $between = ''; + # If we matched a label, but no ingredient + if ($ingredient =~ /^\s*$/) { + # If the ingredient is just the label + sub ingredients (e.g. "vegan (orange juice)") + # then we replace the now empty ingredient by the sub ingredients + if ((defined $between) and ($between !~ /^\s*$/)) { + $ingredient = $between; + $between = ''; + } + else { + # Otherwise we leave the label in place, so that it can be parsed as a non-ingredient specific label + $ingredient = $label; + } } $debug_ingredients and $log->debug("found label", {ingredient => $ingredient, labelid => $labelid}) diff --git a/lib/ProductOpener/Tags.pm b/lib/ProductOpener/Tags.pm index c9db84b36dc4e..a8391f39e58c1 100644 --- a/lib/ProductOpener/Tags.pm +++ b/lib/ProductOpener/Tags.pm @@ -851,8 +851,8 @@ sub remove_stopwords_from_start_or_end_of_string ($tagtype, $lc, $string) { my $regexp = $stopwords_regexps{$tagtype . '.' . $lc . '.strings'}; - $string =~ s/^(\b($regexp)\s)+//ig; - $string =~ s/(\s($regexp)\b)+$//ig; + $string =~ s/^(\s*($regexp)\s*\b)+//ig; + $string =~ s/(\b\s*($regexp)\s*)+$//ig; } return $string; } diff --git a/taxonomies/ingredients.txt b/taxonomies/ingredients.txt index cef1a9391e531..38c13372554ed 100644 --- a/taxonomies/ingredients.txt +++ b/taxonomies/ingredients.txt @@ -29,6 +29,10 @@ synonyms:fr:semi-complet, demi-complet synonyms:fr:semi-complète, demi-complète +synonyms:fr:complet, intégral + +synonyms:fr:complète, intégrale + synonyms:lt:ląsteliena, skaidulos synonyms:lt:pilno grūdo, viso grūdo @@ -22553,6 +22557,15 @@ az:Manna yarması pl:kasza manna, kasza manna z pszenicy sr:griz + +# Soft wheat + + "fr", ingredients_text => "Viande de boeuf issue d'animaux nourris sans OGM", } - ] + ], + # French ingredient + [ + "fr-oignon-francais-tomate-francaise", + { + lc => "fr", + ingredients_text => "Oignon français, tomate française", + } + ], + [ + 'fr-legumes-issus-de-l-agriculture-durable', + { + lc => "fr", + ingredients_text => "Légumes issus de l'agriculture durable", + } + ], + [ + "fr-farines-labels-and-processes", + { + lc => "fr", + ingredients_text => + "Farine de blé CRC, farine de maïs fermentée, farine sans gluten, farine de petit épeautre fortifiée", + } + ], + # Label in a list of ingredients: the product should have labels organic and gluten-free. + [ + "en-wheat-flour-organic-gluten-free", + { + lc => "en", + ingredients_text => "wheat flour. MSC (fish). organic. gluten-free", + } + ], + # Removing a label with stopwords without removing the stopwords in origins + [ + "fr-cacao-issu-de-l-agriculture-biologique-de-madagascar", + { + lc => "fr", + ingredients_text => "cacao issu de l'agriculture biologique de Madagascar", + } + ], ); foreach my $test_ref (@tests) {