Skip to content

Commit

Permalink
feat: add_hr_develop_ingredients_categories_and_types (#9128)
Browse files Browse the repository at this point in the history
add_hr_develop_ingredients_categories_and_types
  • Loading branch information
benbenben2 authored Oct 16, 2023
1 parent 292d365 commit d2ffc70
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 19 deletions.
113 changes: 96 additions & 17 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3693,16 +3693,46 @@ sub normalize_fr_a_de_b ($a, $b) {
}
}

# English: oil, olive -> olive oil
# French: huile, olive -> huile d'olive
# Russian: масло растительное, пальмовое -> масло растительное оливковое
=head2 normalize_a_of_b ($lc, $a, $b, $of_bool)
This function is called by normalize_enumeration()
Given a category ($a) and a type ($b), it will return the ingredient that results from the combination of these two.
English: oil, olive -> olive oil
Croatian: ječmeni, slad -> ječmeni slad
French: huile, olive -> huile d'olive
Russian: масло растительное, пальмовое -> масло растительное пальмовое
=head3 Arguments
=head4 lc
language abbreviation (en for English, for example)
=head4 $a
string, category as defined in %ingredients_categories_and_types, example: 'oil' for 'oil (sunflower, olive and palm)'
=head4 $b
string, type as defined in %ingredients_categories_and_types, example: 'sunflower' or 'olive' or 'palm' for 'oil (sunflower, olive and palm)'
=head3 Return value
=head4 combined $a and $b (or $b and $a, depending on the language), that is expected to be an ingredient
string, category and type combined into a single ingredient name in the word order of the language, example: 'palm vegetal oil' or 'sunflower vegetal oil' or 'olive vegetal oil'
=cut

sub normalize_a_of_b ($lc, $a, $b, $of_bool) {

$a =~ s/\s+$//;
$b =~ s/^\s+//;

if ($lc eq "en") {
if (($lc eq "en") or ($lc eq "hr")) {
return $b . " " . $a;
}
elsif ($lc eq "es") {
Expand All @@ -3721,29 +3751,60 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool) {
return $a . " " . $b;
}
}
elsif (($lc eq "ru") or ($lc eq "pl")) {
elsif (($lc eq "pl") or ($lc eq "ru")) {
return $a . " " . $b;
}
}

# Vegetal oil (palm, sunflower and olive)
# -> palm vegetal oil, sunflower vegetal oil, olive vegetal oil
=head2 normalize_enumeration ($lc, $category, $types, $of_bool)

This function is called by develop_ingredients_categories_and_types()

Some ingredients are specified by an ingredient "category" (e.g. "oil") and a "types" string (e.g. "sunflower, palm").

This function combines the category with each element of the types string.

For English, $category = "vegetal oil" and $types = "palm, sunflower and olive"
will return
"palm vegetal oil, sunflower vegetal oil, olive vegetal oil"

=head3 Arguments

=head4 lc

language abbreviation (en for English, for example)

=head4 category

string, as defined in %ingredients_categories_and_types, example: 'Vegetal oil' for 'Vegetal oil (sunflower, olive and palm)'

=head4 types

string, as defined in %ingredients_categories_and_types, example: 'sunflower, olive and palm' for 'Vegetal oil (sunflower, olive and palm)'

=head4 of_bool

passed through unchanged to normalize_a_of_b() for each element of the types string

=head3 Return value

=head4 Transformed ingredients list text

string, one category + type combination per element of the types string, joined with ", ".
If $types ended with whitespace, a single trailing space is appended to the result.
Example: 'sunflower vegetal oil, olive vegetal oil, palm vegetal oil'

=cut

sub normalize_enumeration ($lc, $category, $types, $of_bool) {
	$log->debug("normalize_enumeration", {category => $category, types => $types}) if $log->is_debug();

	# If there is a trailing space, save it and output it,
	# so that the text following the replaced match stays separated from it
	my $trailing_space = "";
	if ($types =~ /\s+$/) {
		$trailing_space = " ";
	}

	# do not match anything if we don't have a translation for "and"
	my $and = $and{$lc} || " will not match ";

	# Split the types string on brackets, slashes, dashes, commas and the localized "and"
	my @list = split(/$obrackets|$cbrackets|\/| \/ | $dashes |$commas |$commas|$and/i, $types);

	# Combine the category with each type, in the word order of the language
	return join(", ", map {normalize_a_of_b($lc, $category, $_, $of_bool)} @list) . $trailing_space;
}

# iodure et hydroxide de potassium
Expand Down Expand Up @@ -4825,6 +4886,16 @@ my %ingredients_categories_and_types = (
[["piment", "poivron"], ["vert", "jaune", "rouge",], 0,],
],

hr => [
# malts
[
# categories
["slad",],
# types
["ječmeni", "pšenični",]
],
],

pl => [
# oils and fats
[
Expand Down Expand Up @@ -4893,11 +4964,11 @@ my @symbols = ('\*\*\*', '\*\*', '\*', '°°°', '°°', '°', '\(1\)', '\(2\)',
my $symbols_regexp = join('|', @symbols);

sub develop_ingredients_categories_and_types ($ingredients_lc, $text) {
$log->debug("develop_ingredients_categories_and_types: start with>$text<") if $log->is_debug();

if (defined $ingredients_categories_and_types{$ingredients_lc}) {

foreach my $categories_and_types_ref (@{$ingredients_categories_and_types{$ingredients_lc}}) {

my $category_regexp = "";
foreach my $category (@{$categories_and_types_ref->[0]}) {
$category_regexp .= '|' . $category . '|' . $category . 's';
Expand Down Expand Up @@ -4956,8 +5027,11 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) {
$and_or = $and_or{$ingredients_lc};
}

if (($ingredients_lc eq "en") or ($ingredients_lc eq "ru") or ($ingredients_lc eq "pl")) {

if ( ($ingredients_lc eq "en")
or ($ingredients_lc eq "hr")
or ($ingredients_lc eq "ru")
or ($ingredients_lc eq "pl"))
{
# vegetable oil (palm, sunflower and olive)
$text
=~ s/($category_regexp)(?::|\(|\[| | $of )+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))?/normalize_enumeration($ingredients_lc,$1,$2,$of_bool)/ieg;
Expand All @@ -4967,7 +5041,11 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) {
=~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool)/ieg;
# vegetable oil: palm
$text
=~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool)/ieg;
=~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool)/ieg;

# ječmeni i pšenični slad (barley and wheat malt)
$text
=~ s/((?:(?:$type_regexp)(?: |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+(?:$type_regexp))\s*($category_regexp)/normalize_enumeration($ingredients_lc,$2,$1,$of_bool)/ieg;
}
elsif ($ingredients_lc eq "fr") {
# arôme naturel de pomme avec d'autres âromes
Expand All @@ -4980,6 +5058,7 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) {
# Carbonate de magnésium, fer élémentaire -> should not trigger carbonate de fer élémentaire. Bug #3838
# TODO 18/07/2020 remove when we have a better solution
$text =~ s/fer (é|e)l(é|e)mentaire/fer_élémentaire/ig;

$text
=~ s/($category_regexp)(?::|\(|\[| | de | d')+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))?/normalize_enumeration($ingredients_lc,$1,$2,$of_bool)/ieg;
$text =~ s/fer_élémentaire/fer élémentaire/ig;
Expand All @@ -4989,7 +5068,7 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) {
=~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool)/ieg;
# huile végétale : colza,
$text
=~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool)/ieg;
=~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool)/ieg;
}
}

Expand Down
5 changes: 5 additions & 0 deletions stop_words.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Crowdin
csv
CSV
d'acérola
d'olive
dans
dataset
datasets
Expand Down Expand Up @@ -108,12 +109,14 @@ heic
hinnies
http
https
huile
incrontab
Ingrédients
Intermarché
ip
IPs
iso
ječmeni
jpeg
jpf
jpg
Expand Down Expand Up @@ -192,6 +195,7 @@ scanbot
scrypt
Scrypt
sirop
slad
sprintf
ssconvert
stabilisant
Expand Down Expand Up @@ -227,6 +231,7 @@ utilisant
uuid
UUID
Valeur
vegetal
viande
vitamines
VPF
Expand Down
1 change: 1 addition & 0 deletions taxonomies/ingredients.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14072,6 +14072,7 @@ de:modifizierte Weizenstärke
es:almidón de trigo modificado
fi:muunnettu vehnätärkkelys, muunneltu vehnätärkkelys
fr:amidon modifié de blé, amidon transformé de blé
hr:modificirani pšenični škrob
hu:módosított búzakeményítő
it:amido modificato di fromento
nl:gemodificeerd tarwezetmeel
Expand Down
19 changes: 17 additions & 2 deletions tests/unit/ingredients_parsing.t
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ my @lists = (
],
["fr", "huile végétale : colza", "huile végétale de colza"],
["fr", "huile végétale : colza, fraises", "huile végétale de colza, fraises"],
["fr", "huile végétale : colza et tomates", "huile végétale : colza et tomates"],
["fr", "huile végétale : colza et tomates", "huile végétale de colza et tomates"],
["en", "vegetable oil: sunflower", "sunflower vegetable oil"],
["en", "vegetable oil (palm)", "palm vegetable oil"],
["en", "vegetable oils (palm, olive)", "palm vegetable oils, olive vegetable oils"],
Expand Down Expand Up @@ -626,8 +626,23 @@ my @lists = (
["it", "formaggio, E 472 e, E470a.", "formaggio, e472 e, e470a."],
["it", "formaggio, E 472 e E470a.", "formaggio, e472, e470a."],
["sk", "syr, E470 a E470a, mlieko.", "syr, e470, e470a, mlieko."],
# Piments (vert, rouge, jaune) -> Piments vert, Piments rouge, Piments jaune
# normalize category and types
["fr", "Piments (vert, rouge, jaune)", "Piments vert, Piments rouge, Piments jaune"],
[
"fr",
"Huiles végétales de palme, de colza et de tournesol",
"Huiles végétales de palme, Huiles végétales de colza, Huiles végétales de tournesol"
],
["fr", "arôme naturel de pomme avec d'autres âromes", "arôme naturel de pomme, âromes"],
["fr", "Carbonate de magnésium, fer élémentaire", "Carbonate de magnésium, fer élémentaire"],
["fr", "huile végétale (colza)", "huile végétale de colza"],
["fr", "huile végétale : colza", "huile végétale de colza"],
["hr", "ječmeni i pšenični slad", "ječmeni slad, pšenični slad"],
["hr", "ječmeni, ječmeni i pšenični slad", "ječmeni slad, ječmeni slad, pšenični slad"],
["en", "Vegetal oil (sunflower, olive and palm)", "sunflower vegetal oil, olive vegetal oil, palm vegetal oil"],
["en", "vegetable oil (palm)", "palm vegetable oil"],
["en", "vegetable oil: palm", "palm vegetable oil"],

);

foreach my $test_ref (@lists) {
Expand Down

0 comments on commit d2ffc70

Please sign in to comment.