Skip to content

Commit

Permalink
feat: parse origin of ingredients for Japanese (#9125)
Browse files Browse the repository at this point in the history
  • Loading branch information
benbenben2 authored Nov 30, 2023
1 parent 5410d24 commit 730f621
Show file tree
Hide file tree
Showing 12 changed files with 4,236 additions and 46 deletions.
80 changes: 60 additions & 20 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,9 @@ use Data::DeepAccess qw(deep_get deep_exists);
# U+204D "⁍" (Black Rightwards Bullet)
# U+2219 "∙" (Bullet Operator)
# U+22C5 "⋅" (Dot Operator)
# U+30FB "・" (Katakana Middle Dot)
my $middle_dot
= qr/(?: \N{U+00B7} |\N{U+2022}|\N{U+2023}|\N{U+25E6}|\N{U+2043}|\N{U+204C}|\N{U+204D}|\N{U+2219}|\N{U+22C5})/i;
= qr/(?: \N{U+00B7} |\N{U+2022}|\N{U+2023}|\N{U+25E6}|\N{U+2043}|\N{U+204C}|\N{U+204D}|\N{U+2219}|\N{U+22C5}|\N{U+30FB})/i;

# Dash-like characters: anything in the Unicode category 'Punctuation, Dash' (\p{Pd}),
# plus U+2053 SWUNG DASH and U+2212 MINUS SIGN, which are listed explicitly.
my $dashes = qr/(?:\p{Pd}|\N{U+2053}|\N{U+2212})/i;
Expand Down Expand Up @@ -466,6 +467,7 @@ my %and_or = (
fr => " et | ou | et/ou | et / ou ",
is => " og | eða | og/eða | og / eða ",
it => " e | o | e/o | e / o",
ja => "又は", # or
nl => " en/of | en / of ",
nb => " og | eller | og/eller | og / eller ",
pl => " i | oraz | lub | albo ",
Expand Down Expand Up @@ -1898,15 +1900,13 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) {
# e.g. (Contains milk.) -> Contains milk.
$between =~ s/(\s|\.)+$//;

$debug_ingredients and $log->debug("found sub-ingredients", {between => $between, after => $after})
$debug_ingredients and $log->debug("parse_ingredients_text - sub-ingredients found: $between")
if $log->is_debug();

# percent followed by a separator, assume the percent applies to the parent (e.g. tomatoes)
# tomatoes (64%, origin: Spain)
# tomatoes (145g per 100g of finished product)

if (($between =~ $separators) and ($` =~ /^$percent_or_quantity_regexp$/i)) {

$percent_or_quantity_value = $1;
$percent_or_quantity_unit = $2;
# remove what is before the first separator
Expand All @@ -1924,10 +1924,20 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) {

# sel marin (France, Italie)
# -> if we have origins, put "origins:" before
if ( ($between =~ $separators)
and (exists_taxonomy_tag("origins", canonicalize_taxonomy_tag($ingredients_lc, "origins", $`))))
if (
(
($between =~ /$separators|$and/)
and (
exists_taxonomy_tag(
"origins", canonicalize_taxonomy_tag($ingredients_lc, "origins", $`)
)
)
)
or ($between =~ /産|製造/)
)
{
$between =~ s/^(.*?$separators)/origins:$1/;
# prepend "origins:" in the beginning of the text, that will be reused below
$between = "origins:" . $between;
}

$debug_ingredients and $log->debug(
Expand All @@ -1940,59 +1950,82 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) {
}
) if $log->is_debug();

# : is in $separators but we want to keep "origine : France" or "min : 23%"
if ( ($between =~ $separators)
and ($` !~ /\s*(origin|origins|origine|alkuperä|ursprung)\s*/i)
and ($between !~ /^$percent_or_quantity_regexp$/i))
{
$between_level = $level + 1;
$debug_ingredients and $log->debug("between contains a separator", {between => $between})
if $log->is_debug();
$log->debug(
"parse_ingredients_text - sub-ingredients: between contains a separator and is not origin nor has percent",
{between => $between}
) if $log->is_debug();
}
else {
# no separator found : 34% ? or single ingredient
$debug_ingredients
and $log->debug("between does not contain a separator", {between => $between})
if $log->is_debug();
$log->debug(
"parse_ingredients_text - sub-ingredients: between does not contain a separator or is origin or is percent",
{between => $between}
) if $log->is_debug();

if ($between =~ /^$percent_or_quantity_regexp(?:$per_100g_regexp)?$/i) {

$percent_or_quantity_value = $1;
$percent_or_quantity_unit = $2;
$debug_ingredients
and $log->debug(
"between is a percent",
$log->debug(
"parse_ingredients_text - sub-ingredients: between is a percent",
{
between => $between,
percent_or_quantity_value => $percent_or_quantity_value,
percent_or_quantity_unit => $percent_or_quantity_unit
}
) if $log->is_debug();
) if $log->is_debug();
$between = '';
}
else {
# label? (organic)
# origin? (origine : France)
$log->debug("parse_ingredients_text - sub-ingredients: label? origin? ($between)")
if $log->is_debug();

# try to remove the origin and store it as property
if ($between
=~ /\s*(de origine|d'origine|origine|origin|origins|alkuperä|ursprung|oorsprong)\s?:?\s?\b(.*)$/i
=~ /\s*(?:de origine|d'origine|origine|origin|origins|alkuperä|ursprung|oorsprong)\s?:?\s?\b(.*)$/i
)
{
$log->debug("parse_ingredients_text - sub-ingredients: contains origin in $between")
if $log->is_debug();

$between = '';
my $origin_string = $2;
# remove the first occurrence (origin:)
my $origin_string = $1;

# remove additional parentheses and their content, which are sub-ingredients of the origin (not parsed for now)
# example: "トマト (輸入又は国産 (未満 5%))" (i.e., "Tomatoes (imported or domestically produced (less than 5%))")
$origin_string =~ s/\s*\([^)]*\)//g;

if ($ingredients_lc eq 'ja') {
# remove all occurrences at the end of words (ブラジル産、エチオピア産)
$origin_string =~ s/(産|製造)//g;
# remove "and more" その他
$origin_string =~ s/(?: and )?その他//g;
}

# d'origine végétale -> not a geographic origin, add en:vegan
if ($origin_string =~ /vegetal|végétal/i) {
$vegan = "en:yes";
$vegetarian = "en:yes";
}
else {

$origin = join(",",
map {canonicalize_taxonomy_tag($ingredients_lc, "origins", $_)}
split(/,/, $origin_string));
split(/$commas|$and/, $origin_string));
}
}
else {
$log->debug(
"parse_ingredients_text - sub-ingredients: origin not explicitly written in: $between"
) if $log->is_debug();

# origins: Fraise (France)
my $originid = canonicalize_taxonomy_tag($ingredients_lc, "origins", $between);
Expand All @@ -2003,6 +2036,9 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) {
$log->debug("between is an origin", {between => $between, origin => $origin})
if $log->is_debug();
$between = '';
$log->debug(
"parse_ingredients_text - sub-ingredients: between is an origin: $between")
if $log->is_debug();
}
# put origins first because the country can be associated with the label "Made in ..."
# Skip too short entries (1 or 2 letters) to avoid false positives
Expand Down Expand Up @@ -2545,6 +2581,10 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) {

'it' => ['^in proporzion[ei] variabil[ei]$',],

'ja' => [
'その他', # etc.
],

'nb' => ['^Pakket i beskyttende atmosfære$',],

'nl' => [
Expand Down
2 changes: 1 addition & 1 deletion taxonomies/additives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2045,7 +2045,7 @@ fr:E160a, carotènes mélangés, carotène, γ-Carotène, gamma-carotène, Alpha
hr:E160a, karoten, bojilo karoteni, karoteni
hu:E160a, Karotinok
it:E160a, Carotene, Carotina, Caroteni
ja:E160a, カロチン, カロテン
ja:E160a, カロチン, カロテン, カロテン色素
lt:E160a, Karotinas
lv:E160a, E160a food additive
mt:E160a, E160a food additive
Expand Down
2 changes: 1 addition & 1 deletion taxonomies/countries.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21480,7 +21480,7 @@ io:Japonia
is:Japan
it:Giappone
iu:ᓃᑉᐊᓐ
ja:日本, 日本国
ja:日本, 日本国, 国, 国内, 国産
jbo:pongu'e
jv:Jepang
ka:იაპონია
Expand Down
65 changes: 57 additions & 8 deletions taxonomies/ingredients.txt
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ wikidata:en:Q421576
wikipedia:en:https://en.wikipedia.org/wiki/Enilconazole
# ingredient/fr:imazalil has 23 products @2019-05-29

en:frying
fr:friture

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
Expand Down Expand Up @@ -331,7 +332,7 @@ hy:Կարոտին
id:Karotena
io:Karotino
it:Carotene
ja:カロテン
ja:カロテン, カロテン色素
kk:Каротин
ko:카로틴
lt:karotinas
Expand Down Expand Up @@ -10480,6 +10481,7 @@ fr:bœuf, boeuf, Boeufs
hr:goveđa, govedi, govedina
hu:marha
it:manzo
ja:ビーフ
nb:storfekjøtt
pl:Wołowina, wołowe, wołowa, wołowy
pt:Carne bovina
Expand Down Expand Up @@ -13021,6 +13023,7 @@ bg:свински бульон
de:Schweinefleischbrühe
es:caldo de cerdo
fr:bouillon de porc
ja:ポークブイヨン

# <en:bone
fr:os et viande de porc
Expand Down Expand Up @@ -13454,6 +13457,7 @@ bg:хидролизиран колаген
de:kollagen-hydrolysat
es:colágeno hidrolizado
hr:hidrolizat kolagena
ja:たん白加水分解物, 蛋白加水分解物
pl:hydrolizat kolagenu
sv:kollagenpeptider
vi:collagen peptide
Expand Down Expand Up @@ -13795,7 +13799,7 @@ id:Amilum
io:Amilo
is:Sterkja
it:Amido, amidi, fecola
ja:デンプン, でん粉
ja:デンプン, でん粉, 澱粉
ka:სახამებელი
kk:Крахмал
ko:녹말
Expand Down Expand Up @@ -17600,7 +17604,7 @@ id:vinegar
io:vinagro
is:edik
it:aceto
ja:酢
ja:酢, 醸造酢
kk:сірке суы
kn:ವಿನಿಗರ್
ko:식초
Expand Down Expand Up @@ -24333,7 +24337,7 @@ hu:zab
id:haver
is:hafrar
it:avena
ja:エンバク
ja:エンバク, オーツ麦
jv:haver
ka:შვრია
kk:екпе сұлы
Expand Down Expand Up @@ -28431,7 +28435,7 @@ id:gula
is:sykur, matarsykur
it:zucchero, zuccheri
iu:ᓱᑲᒃ
ja:砂糖
ja:砂糖, 糖類
jv:Gula
ka:შაქარი
kk:Қант
Expand Down Expand Up @@ -43058,7 +43062,7 @@ id:Pepaya
io:Papayo
is:sólaldin, papaya
it:papaia, papaya
ja:パパイア
ja:パパイア, パパイヤ
jv:katès
kg:dilolo
kk:papaýya, Папайя, پاپاييا
Expand Down Expand Up @@ -51859,6 +51863,30 @@ wikidata:en:Q8047551
wikipedia:en:https://en.wikipedia.org/wiki/Yam_(vegetable)
eurocode_2_group_3:en: 8.34.40

<en:yam
en:chinese yam
ar:ديسقوريا متعددة السنيبلات
eo:Dioscorea polystachya
fa:یم چینی
fr:Dioscorea polystachya
hu:Dioscorea polystachya
it:Dioscorea polystachya
ja:ナガイモ, 長芋
ko:마
lt:Batatinė dioskorėja
lv:Ķīnas jamss
ms:Ubi cina
pl:Pochrzyn chiński
ru:Диоскорея многокистевая
sv:Dioscorea polystachya
th:ฮ่วยซัว
uk:Китайський ямс
vi:Dioscorea polystachya
za:Maenzbya
zh:薯蕷
wikidata:en:Q5279593


# description:en:The POTATO is a starchy, tuberous crop from the perennial nightshade Solanum tuberosum.

<en:root vegetable
Expand Down Expand Up @@ -53851,7 +53879,7 @@ hy:Կարմիր պղպեղ
id:Cabai
is:Chilli pipar
it:peperoncino
ja:唐辛子
ja:唐辛子, チリ
jv:Lombok abang
ka:წიწაკა
kn:ಮೆಣಸಿನಕಾಯಿ
Expand Down Expand Up @@ -81368,6 +81396,7 @@ fi:kokonainen muna, kokonaiset munat
fr:œuf entier, œufs entiers, oeuf entier, oeufs entiers
hu:egész tojás
it:uova intere
ja:全卵
nl:hele eieren, geheel eieren
pl:Całe jajo
pt:Ovo inteiro
Expand Down Expand Up @@ -84832,6 +84861,21 @@ sc:Panada
wikidata:en:Q3892955
# de:comment:Panade (Weizenmehl, Sonnenblumenöl, Salz, Hefe, Gewürz Paprika)

<en:coating
<en:frying
en:Tenkasu
de:Tenkasu
es:Tenkasu
fr:Tenkasu
it:Tenkasu
ja:天かす
th:เท็งกาซุ
wikidata:en:Q783828

<en:Tenkasu
en:Tenkasu with shrimp
ja:えび入り天かす

# <en:compound
fr:pain de mie au blé malté
# ingredient/fr:pain-de-mie-au-blé-malté has 22 products in french @2019-02-09
Expand Down Expand Up @@ -85950,6 +85994,11 @@ sv:köttbuljong, köttfond
wikidata:en:Q67860017
# ingredient/fr:bouillon-de-viande has 22 products in 4 languages @2020-06-13

<en:broth
en:white dashi
ja:白だし
wikidata:en:Q11579290

# description:en:LEMONADE can be any one of a variety of sweetened beverages found throughout the world, but which are traditionally all characterized by a lemon flavor.

# <en:compound
Expand Down Expand Up @@ -86660,7 +86709,7 @@ id:Saus
io:Sauco
is:Sósa
it:salsa
ja:ソース
ja:ソース, ドレッシング
jv:Saos
ka:საწებელი
kk:Соус
Expand Down
Loading

0 comments on commit 730f621

Please sign in to comment.