Skip to content

Commit

Permalink
feat: improvements to parsing of ingredient with labels (#9330)
Browse files Browse the repository at this point in the history
  • Loading branch information
stephanegigandet authored Nov 18, 2023
1 parent 0c9912d commit ba6739f
Show file tree
Hide file tree
Showing 12 changed files with 498 additions and 27 deletions.
41 changes: 34 additions & 7 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,9 @@ my @labels = (
"en:vegetarian", "nl:beter-leven-1-ster",
"nl:beter-leven-2-ster", "nl:beter-leven-3-ster",
"en:halal", "en:kosher",
"en:fed-without-gmos",
"en:fed-without-gmos", "fr:crc",
"en:without-gluten", "en:sustainable-farming",
"en:krav",
);
my %labels_regexps = ();

Expand All @@ -536,6 +538,9 @@ sub init_labels_regexps() {

foreach my $labelid (@labels) {

# Canonicalize the label ids in case the normalized id changed
$labelid = canonicalize_taxonomy_tag("en", "labels", $labelid);

foreach my $label_lc (keys %{$translations_to{labels}{$labelid}}) {

# the synonyms below also contain the main translation as the first entry
Expand Down Expand Up @@ -2254,20 +2259,42 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) {
{ingredient => $ingredient, labelid => $labelid, regexp => $regexp})
if $log->is_trace();
if ((defined $regexp) and ($ingredient =~ /\b($regexp)\b/i)) {

my $label = $1;

if (defined $labels) {
$labels .= ", " . $labelid;
}
else {
$labels = $labelid;
}
$ingredient = $` . ' ' . $';

# Remove stopwords after or before the label
# e.g. "Apricots from sustainable farming" -> "Apricots" + "from" + "sustainable farming" -> "Apricots"
my $before_the_label = $`;
my $after_the_label = $';

$before_the_label = remove_stopwords_from_start_or_end_of_string("labels", $ingredients_lc,
$before_the_label);

# Don't remove stopwords on $after_the_label, as it can remove words we want to keep
# e.g. "Cacao issu de l'agriculture biologique de Madagascar": need to keep "de" in "Cacao de Madagascar"

$ingredient = $before_the_label . ' ' . $after_the_label;
$ingredient =~ s/\s+/ /g;

# If the ingredient is just the label + sub ingredients (e.g. "vegan (orange juice)")
# then we replace the now empty ingredient by the sub ingredients
if (($ingredient =~ /^\s*$/) and (defined $between) and ($between ne "")) {
$ingredient = $between;
$between = '';
# If we matched a label, but no ingredient
if ($ingredient =~ /^\s*$/) {
# If the ingredient is just the label + sub ingredients (e.g. "vegan (orange juice)")
# then we replace the now empty ingredient by the sub ingredients
if ((defined $between) and ($between !~ /^\s*$/)) {
$ingredient = $between;
$between = '';
}
else {
# Otherwise we leave the label in place, so that it can be parsed as a non-ingredient specific label
$ingredient = $label;
}
}
$debug_ingredients
and $log->debug("found label", {ingredient => $ingredient, labelid => $labelid})
Expand Down
4 changes: 2 additions & 2 deletions lib/ProductOpener/Tags.pm
Original file line number Diff line number Diff line change
Expand Up @@ -851,8 +851,8 @@ sub remove_stopwords_from_start_or_end_of_string ($tagtype, $lc, $string) {

my $regexp = $stopwords_regexps{$tagtype . '.' . $lc . '.strings'};

$string =~ s/^(\b($regexp)\s)+//ig;
$string =~ s/(\s($regexp)\b)+$//ig;
$string =~ s/^(\s*($regexp)\s*\b)+//ig;
$string =~ s/(\b\s*($regexp)\s*)+$//ig;
}
return $string;
}
Expand Down
25 changes: 20 additions & 5 deletions taxonomies/ingredients.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ synonyms:fr:semi-complet, demi-complet

synonyms:fr:semi-complète, demi-complète

synonyms:fr:complet, intégral

synonyms:fr:complète, intégrale

synonyms:lt:ląsteliena, skaidulos

synonyms:lt:pilno grūdo, viso grūdo
Expand Down Expand Up @@ -22553,6 +22557,15 @@ az:Manna yarması
pl:kasza manna, kasza manna z pszenicy
sr:griz


# Soft wheat

<en:wheat
en:soft wheat, tender wheat
es:trigo blando
fr:blé tendre
it:grano tenero

##################################################################################
#
# Durum wheat
Expand All @@ -22561,7 +22574,7 @@ sr:griz
# description:en:Durum wheat (Triticum durum or Triticum turgidum subsp. durum) is a tetraploid species of wheat.

<en:wheat
en:durum wheat
en:durum wheat, hard wheat
af:durum
ar:قمح صلب
bg:твърда пшеница
Expand Down Expand Up @@ -25703,13 +25716,14 @@ nl:biologische tarwemeel
en:wheat flour type 0, flour type 0
bg:Бяло брашно тип 0
de:Weizenmehl Type 0
fr:Farine blanche type 0
fr:farine de blé tendre type 0, farine de blé type 0, Farine blanche type 0
hr:pšenično brašno tip "0"
it:farina di grano tenero tipo 0, farina di grano tipo 0

<en:wheat flour
en:wheat flour type 00, flour type 00
de:Weichweizenmehl Typ 00, Typ 00 Weichweizenmehl, Weizenmehl Typ 00
fr:farine de blé tendre type 00, farine de blé type 00
hr:brašno tip 00
it:farina di grano tenero tipo 00, farina di grano tipo 00
pl:mąka pszenna typ 00
Expand Down Expand Up @@ -25855,24 +25869,25 @@ en:fortified british wheat flour
de:angereichertes britisches Weizenmehl

<en:wheat flour
en:soft wheat flour, tender wheat flour
de:Weichweizenmehl
fi:hieno vehnäjauho
fr:farine de blé tendre
fr:farine de blé tendre, farine de blés tendres
it:farina di grano tenero
ciqual_food_code:en:9410
ciqual_food_name:en:Wheat flour, type 110
ciqual_food_name:fr:Farine de blé tendre ou froment T110

<en:wheat flour
en:Wheat flour type 150
fr:Farine de blé tendre T150
fr:Farine de blé tendre T150, farine de blé T150
ciqual_food_code:en:9415
ciqual_food_name:en:Wheat flour, type 150
ciqual_food_name:fr:Farine de blé tendre ou froment T150

<en:wheat flour
en:Wheat flour type 110
fr:Farine de blé tendre T110
fr:Farine de blé tendre T110, farine de blé t110
ciqual_food_code:en:9410
ciqual_food_name:en:Wheat flour, type 110
ciqual_food_name:fr:Farine de blé tendre ou froment T110
Expand Down
16 changes: 10 additions & 6 deletions taxonomies/ingredients_processing.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1601,12 +1601,15 @@ en:enriched
de:angereichert
es:enriquecida
#de:comment:Can also appear in combinations: de:Eiweißangereichertes
fr:enrichi, enrichie, enrichis
fr:enrichi, enrichie, enrichis, enrichies
hr:obogačena
nl:verrijkt
pl:wzbogacany, wzbogacana, wzbogacane, wzbogacanej, wzbogacanego, wzbogacanych, fortyfikowany, fortyfikowana, fortyfikowane
pt:enriquecido, enriquecida, enriquecidos, enriquecidas, enriquecido com, enriquecida com, enriquecidos com, enriquecidas com, enriquecido em, enriquecida em, enriquecidos em, enriquecidas em

en:fortified
fr:fortifié, fortifiée, fortifiés, fortifiées

#<en:unadded
en:unenriched

Expand Down Expand Up @@ -1863,6 +1866,7 @@ en:fermented
bg:ферментирало, ферментирал, ферментирала, ферментирали
ca:fermentat
es:fermentado
fr:fermenté, fermentée, fermentés, fermentées
hr:fermentirano, fermentirana
hu:fermentált
it:fermentato, fermentata, fermentati, fermentate
Expand Down Expand Up @@ -2075,11 +2079,11 @@ ro:nehidrogenate
sv:ohärdad

#en:description:Breakdown into simpler chemical constituents by appropriate treatment with water and possibly either enzymes or acid/alkali
#en:hydrolysed, hydrolized
#fr:Hydrolisé, hydrolisée
#hr:hidrolizat, hidrolizirane, hidrolizirano
#nl:gehydrolyseerd, gehydrolyseerde
#pl:hydrolizat, hydrolizowane
en:hydrolysed, hydrolized
fr:Hydrolisé, hydrolisée, hydrolisés, hydrolisées
hr:hidrolizat, hidrolizirane, hidrolizirano
nl:gehydrolyseerd, gehydrolyseerde
pl:hydrolizat, hydrolizowane

#en:description:Transformation of unsaturated glycerides into saturated glycerides (of oils and fats)
en:hydrogenated, hardened, partially hardened
Expand Down
21 changes: 15 additions & 6 deletions taxonomies/labels.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1786,7 +1786,7 @@ wikidata:en:Q1076110

<en:Kosher
en:cRc Pareve, Chicago Rabbinical Council Pareve, Chicago Rabbinical Council, cRc
fr:cRc Pareve, cRc
fr:cRc Pareve

<en:Kosher
en:Kof-K
Expand Down Expand Up @@ -17613,16 +17613,18 @@ nb:Debio Ø-merke, Debio økologisk
image:nb:debio-ø-merke.90x90.svg
country:en:Norway

<en:Organic
en:KRAV
ca:Categoria Extra
de:Klasse “Extra”, Klasse Extra
en:“Extra” Class, extra class
es:Categoría “Extra”
fi:"Ekstra"-luokka
fr:Catégorie “Extra”, catégorie supérieure, catégorie extra
it:Classe &quot;Extra&quot;
sv:KRAV

# Should not be associated with "extra class", "superior quality" etc.
<en:Organic
sv:KRAV, KRAV-certifierad ekologisk ingrediens
xx:KRAV
image:en:krav.136x90.svg
country:en:Sweden

Expand Down Expand Up @@ -18636,14 +18638,14 @@ sl:zajamčena tradicionalna posebnost, ZTP
sv:garanterad traditionell specialitet, GTS
wikidata:en:Q2751813

en:Sustainable farming
en:Sustainable farming, sustainable agriculture
bg:Устойчиво земеделие
ca:Agricultura Sostenible
cs:Udržitelné zemědělství
de:Nachhaltige Agrikultur
es:Agricultura sustentable
fi:Kestävä maatalous
fr:Agriculture durable
fr:Agriculture durable, agriculture soutenable, agriculture raisonnée, culture durable, culture soutenable, culture raisonnée
he:חקלאות בת־קיימא
hr:održiva poljoprivreda
hu:Fenntartható gazdálkodás, Fenntartható gazdálkodásból
Expand All @@ -18652,6 +18654,13 @@ nl:Duurzame landbouw
pl:Zrównoważone rolnictwo
pt:Agricultura sustentável

<en:Sustainable farming
fr:CRC, Culture raisonnée contrôlée, filière CRC
origins:en: en:france
ingredients:en: en:wheat

# https://agriculture.gouv.fr/cereales-francaises-la-filiere-crc-culture-raisonnee-controlee

<en:Sustainable farming
en:UTZ Certified, UTZ
ca:Certificat UTZ
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"ingredients" : [
{
"id" : "en:wheat-flour",
"percent_estimate" : 75,
"percent_max" : 100,
"percent_min" : 50,
"text" : "wheat flour",
"vegan" : "yes",
"vegetarian" : "yes"
},
{
"id" : "en:fish",
"labels" : "en:sustainable-seafood-msc",
"percent_estimate" : 25,
"percent_max" : 50,
"percent_min" : 0,
"text" : "fish",
"vegan" : "no",
"vegetarian" : "no"
}
],
"ingredients_analysis" : {
"en:non-vegan" : [
"en:fish"
],
"en:non-vegetarian" : [
"en:fish"
]
},
"ingredients_analysis_tags" : [
"en:palm-oil-free",
"en:non-vegan",
"en:non-vegetarian"
],
"ingredients_hierarchy" : [
"en:wheat-flour",
"en:cereal",
"en:flour",
"en:wheat",
"en:cereal-flour",
"en:fish"
],
"ingredients_n" : 2,
"ingredients_n_tags" : [
"2",
"1-10"
],
"ingredients_original_tags" : [
"en:wheat-flour",
"en:fish"
],
"ingredients_percent_analysis" : 1,
"ingredients_tags" : [
"en:wheat-flour",
"en:cereal",
"en:flour",
"en:wheat",
"en:cereal-flour",
"en:fish"
],
"ingredients_text" : "wheat flour. MSC (fish). organic. gluten-free",
"ingredients_with_specified_percent_n" : 0,
"ingredients_with_specified_percent_sum" : 0,
"ingredients_with_unspecified_percent_n" : 2,
"ingredients_with_unspecified_percent_sum" : 100,
"ingredients_without_ciqual_codes" : [
"en:fish",
"en:wheat-flour"
],
"ingredients_without_ciqual_codes_n" : 2,
"known_ingredients_n" : 6,
"labels" : "Organic, en:no-gluten",
"labels_hierarchy" : [
"en:no-gluten",
"en:organic"
],
"labels_lc" : "en",
"labels_tags" : [
"en:no-gluten",
"en:organic"
],
"lc" : "en",
"nutriments" : {
"fruits-vegetables-legumes-estimate-from-ingredients_100g" : 0,
"fruits-vegetables-legumes-estimate-from-ingredients_serving" : 0,
"fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0,
"fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0
},
"unknown_ingredients_n" : 0
}
Loading

0 comments on commit ba6739f

Please sign in to comment.