Skip to content

Commit

Permalink
fix: use openfoodfacts-python package for taxonomy processing
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Aug 7, 2023
1 parent a0eae2d commit eaaeca3
Show file tree
Hide file tree
Showing 9 changed files with 467 additions and 696 deletions.
835 changes: 429 additions & 406 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ lark = "~1.1.4"
h5py = "~3.8.0"
opencv-contrib-python = "~4.7.0.72"
toml = "~0.10.2"
openfoodfacts = "~0.1.5"

[tool.poetry.dependencies.sentry-sdk]
version = "~1.14.0"
Expand Down
2 changes: 1 addition & 1 deletion robotoff/insights/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,7 +863,7 @@ def add_optional_fields(cls, insight: ProductInsight, product: Optional[Product]
campaigns = []
if (
insight.value_tag in taxonomy
and "agribalyse_food_code" in taxonomy[insight.value_tag].additional_data
and "agribalyse_food_code" in taxonomy[insight.value_tag].properties
):
# This category is linked to an agribalyse category, add it as a
# campaign tag
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numpy as np

from robotoff import settings
from robotoff.taxonomy import Taxonomy, fetch_taxonomy
from robotoff.taxonomy import Taxonomy
from robotoff.types import JSONType
from robotoff.utils.text import KeywordProcessor

Expand All @@ -35,11 +35,7 @@

@functools.cache
def get_ingredient_taxonomy():
return fetch_taxonomy(
"",
V3_MODEL_DATA_DIR / "ingredients.full.json.gz",
offline=True,
)
return Taxonomy.from_path(V3_MODEL_DATA_DIR / "ingredients.full.json.gz")


@functools.cache
Expand Down
297 changes: 23 additions & 274 deletions robotoff/taxonomy.py
Original file line number Diff line number Diff line change
@@ -1,255 +1,18 @@
import collections
import pathlib
from enum import Enum
from typing import Any, Iterable, Optional, Union
from typing import Optional

import cachetools
import requests
from openfoodfacts.taxonomy import Taxonomy
from openfoodfacts.taxonomy import get_taxonomy as _get_taxonomy
from openfoodfacts.types import TaxonomyType

from robotoff import settings
from robotoff.types import JSONType
from robotoff.utils import get_logger, http_session, load_json
from robotoff.utils import get_logger
from robotoff.utils.text import get_tag

try:
import networkx
except ImportError:
networkx = None

logger = get_logger(__name__)


class TaxonomyType(Enum):
    """Enumeration of the taxonomy kinds known to this module.

    Members are looked up by name or by their integer value; the values
    are consecutive integers starting at 1.
    """

    # Product classification taxonomies
    category = 1
    ingredient = 2
    label = 3
    brand = 4

    # Packaging-related taxonomies
    packaging_shape = 5
    packaging_material = 6
    packaging_recycling = 7


class TaxonomyNode:
    """A single taxonomy entry, linked to its direct parents and children.

    :param identifier: the node id, stored as ``id``
    :param names: mapping from language code to the node name
    :param synonyms: mapping from language code to a list of synonyms, an
        empty mapping is stored when the value is falsy
    :param additional_data: any extra data attached to the node, an empty
        dict is stored when the value is falsy
    """

    __slots__ = ("id", "names", "parents", "children", "synonyms", "additional_data")

    def __init__(
        self,
        identifier: str,
        names: dict[str, str],
        synonyms: Optional[dict[str, list[str]]],
        additional_data: Optional[dict[str, Any]] = None,
    ):
        self.id: str = identifier
        self.names: dict[str, str] = names
        # Links are filled afterwards through `add_parents`
        self.parents: list["TaxonomyNode"] = []
        self.children: list["TaxonomyNode"] = []
        self.additional_data = additional_data or {}
        self.synonyms = synonyms or {}

    def is_child_of(self, item: "TaxonomyNode") -> bool:
        """Return True if `self` is a direct or indirect child of `item`.

        The parent links are walked iteratively and every node is expanded
        at most once, so that hierarchies where many nodes share the same
        ancestors stay cheap to explore and cyclic data cannot recurse
        forever.
        """
        to_visit = list(self.parents)
        visited: set["TaxonomyNode"] = set()

        while to_visit:
            ancestor = to_visit.pop()

            if ancestor == item:
                return True

            if ancestor in visited:
                continue

            visited.add(ancestor)
            to_visit.extend(ancestor.parents)

        return False

    def is_parent_of(self, item: "TaxonomyNode") -> bool:
        """Return True if `self` is a direct or indirect parent of `item`."""
        return item.is_child_of(self)

    def is_parent_of_any(self, candidates: Iterable["TaxonomyNode"]) -> bool:
        """Return True if `self` is a direct or indirect parent of at least
        one of the `candidates`."""
        return any(candidate.is_child_of(self) for candidate in candidates)

    def get_parents_hierarchy(self) -> list["TaxonomyNode"]:
        """Return the list of all parent nodes (direct and indirect).

        Nodes are returned in depth-first order: each parent is immediately
        followed by its own not-yet-listed ancestors. A node id is listed
        only once, and `self` is never part of the result.
        """
        all_parents: list["TaxonomyNode"] = []
        # `self.id` is pre-registered so that cyclic data cannot list the
        # node as its own ancestor
        seen: set[str] = {self.id}
        # Parents are pushed in reverse order so that they are popped (and
        # therefore listed) in their original order
        to_visit = list(reversed(self.parents))

        while to_visit:
            ancestor = to_visit.pop()

            if ancestor.id in seen:
                continue

            seen.add(ancestor.id)
            all_parents.append(ancestor)
            to_visit.extend(reversed(ancestor.parents))

        return all_parents

    def get_localized_name(self, lang: str) -> str:
        """Return the node name in `lang`.

        Fall back on the international name (`xx`), then on the node id.
        """
        if lang in self.names:
            return self.names[lang]

        if "xx" in self.names:
            # Return international name if it exists
            return self.names["xx"]

        return self.id

    def get_synonyms(self, lang: str) -> list[str]:
        """Return the synonyms of the node in `lang` (empty list if none)."""
        return self.synonyms.get(lang, [])

    def add_parents(self, parents: Iterable["TaxonomyNode"]):
        """Register `parents` as direct parents of the node.

        The node is also appended to the `children` of every newly added
        parent. Parents that are already registered are ignored.
        """
        for parent in parents:
            if parent not in self.parents:
                self.parents.append(parent)
                parent.children.append(self)

    def to_dict(self) -> "JSONType":
        """Return a dict with the node names and the ids of its direct
        parents."""
        return {"name": self.names, "parents": [p.id for p in self.parents]}

    def __repr__(self):
        return f"<TaxonomyNode {self.id}>"


class Taxonomy:
    """A collection of taxonomy nodes, indexed by a string key."""

    def __init__(self) -> None:
        self.nodes: dict[str, "TaxonomyNode"] = {}

    def add(self, key: str, node: "TaxonomyNode") -> None:
        """Register `node` under `key`, replacing any previous node."""
        self.nodes[key] = node

    def __contains__(self, item: str):
        return item in self.nodes

    def __getitem__(self, item: str):
        # Unknown keys yield None instead of raising a KeyError
        return self.nodes.get(item)

    def __len__(self):
        return len(self.nodes)

    def iter_nodes(self) -> Iterable["TaxonomyNode"]:
        """Iterate over the nodes of the taxonomy."""
        return iter(self.nodes.values())

    def keys(self):
        """Return the keys of all registered nodes."""
        return self.nodes.keys()

    def find_deepest_nodes(self, nodes: list["TaxonomyNode"]) -> list["TaxonomyNode"]:
        """Given a list of nodes, returns the list of nodes where all the
        parents within the list have been removed.

        For example, for a taxonomy, 'fish' -> 'salmon' -> 'smoked-salmon':
        ['fish', 'salmon'] -> ['salmon']
        ['fish', 'smoked-salmon'] -> ['smoked-salmon']
        """
        excluded: set[str] = set()

        for node in nodes:
            for other in nodes:
                # Nodes that are already excluded don't need to be checked
                # again, and a node is never compared with itself
                if other.id in excluded or other.id == node.id:
                    continue

                if node.is_child_of(other):
                    excluded.add(other.id)

        return [node for node in nodes if node.id not in excluded]

    def is_parent_of_any(
        self, item: str, candidates: Iterable[str], raises: bool = True
    ) -> bool:
        """Return True if `item` is parent of any candidate, False otherwise.

        If the item is not in the taxonomy and raises is False, return False.
        Candidates that are not in the taxonomy are ignored.

        :param item: The item to compare
        :param candidates: A list of candidates
        :param raises: if True, raises a ValueError if item is not in the
            taxonomy, defaults to True.
        """
        node = self[item]

        if node is None:
            if raises:
                # Report the requested id: `node` is always None here
                raise ValueError(f"unknown id in taxonomy: {item}")
            return False

        to_check_nodes = {
            candidate_node
            for candidate_node in (self[candidate] for candidate in candidates)
            if candidate_node is not None
        }
        return node.is_parent_of_any(to_check_nodes)

    def get_localized_name(self, key: str, lang: str) -> str:
        """Return the name of node `key` in `lang`, or `key` itself if the
        node is not in the taxonomy."""
        if key not in self.nodes:
            return key

        return self.nodes[key].get_localized_name(lang)

    def to_dict(self) -> "JSONType":
        """Return a dict mapping each key to the dict export of its node."""
        return {key: node.to_dict() for key, node in self.nodes.items()}

    @classmethod
    def from_dict(cls, data: "JSONType") -> "Taxonomy":
        """Build a taxonomy from a dict mapping each key to the node data.

        The `name`, `synonyms` and `parents` fields are used to build the
        nodes and their links, `children` is ignored, and every other field
        is stored in the node `additional_data`.
        """
        taxonomy = cls()
        structural_fields = {"parents", "name", "synonyms", "children"}

        # First pass: create all nodes, so that parent links can be
        # resolved whatever the order of the keys
        for key, key_data in data.items():
            if key not in taxonomy:
                node = TaxonomyNode(
                    identifier=key,
                    names=key_data.get("name", {}),
                    synonyms=key_data.get("synonyms", None),
                    additional_data={
                        k: v for k, v in key_data.items() if k not in structural_fields
                    },
                )
                taxonomy.add(key, node)

        # Second pass: link every node to its parents.
        # NOTE: `taxonomy[ref]` is None for a parent id that is not a key of
        # `data`, so all parents are expected to be declared in `data`.
        for key, key_data in data.items():
            node = taxonomy[key]
            parents = [taxonomy[ref] for ref in key_data.get("parents", [])]
            node.add_parents(parents)

        return taxonomy

    @classmethod
    def from_json(cls, file_path: Union[str, pathlib.Path]):
        """Build a taxonomy from the JSON file at `file_path`, loaded with
        `load_json(..., compressed=True)`."""
        return cls.from_dict(load_json(file_path, compressed=True))  # type: ignore

    def to_graph(self):
        """Generate a networkx.DiGraph from the taxonomy.

        Every node id is a graph node, and there is an edge from each node
        to each of its children.

        :raises ImportError: if networkx is not installed
        """
        if networkx is None:
            # networkx is an optional dependency, see the module imports
            raise ImportError("networkx is required to convert a taxonomy to a graph")

        graph = networkx.DiGraph()
        graph.add_nodes_from(x.id for x in self.iter_nodes())

        for node in self.iter_nodes():
            for child in node.children:
                graph.add_edge(node.id, child.id)

        return graph


def generate_category_hierarchy(
taxonomy: Taxonomy, category_to_index: dict[str, int], root: int
):
Expand Down Expand Up @@ -280,42 +43,28 @@ def generate_category_hierarchy(
return categories_hierarchy_list


def fetch_taxonomy(
    url: str, fallback_path: Optional[str] = None, offline: bool = False
) -> "Taxonomy":
    """Download the taxonomy at `url`, or load it from `fallback_path`.

    :param url: the URL of the taxonomy JSON file
    :param fallback_path: path of a local JSON version of the taxonomy,
        used in offline mode or if the download fails
    :param offline: if True and `fallback_path` is provided, load the local
        version without any network call. Without `fallback_path`, the
        taxonomy is still downloaded.
    :raises Exception: the download or decoding error, if fetching failed
        and no `fallback_path` was provided
    :return: the Taxonomy
    """
    if offline and fallback_path:
        return Taxonomy.from_json(fallback_path)

    try:
        r = http_session.get(url, timeout=120)  # might take some time
        if r.status_code >= 300:
            # The message is built here: exception arguments are not
            # interpolated the way logging arguments are
            raise requests.HTTPError(
                f"Taxonomy download at {url} returned status code {r.status_code}"
            )
        data = r.json()
    except Exception as e:
        # Any failure (network, HTTP status, invalid JSON) is logged before
        # trying the local version
        logger.exception("%s exception while fetching taxonomy at %s", type(e), url)
        if fallback_path:
            return Taxonomy.from_json(fallback_path)
        raise

    return Taxonomy.from_dict(data)


@cachetools.cached(cache=cachetools.TTLCache(maxsize=100, ttl=12 * 60 * 60)) # 12h
def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy:
"""Returned the requested Taxonomy."""
logger.info("Loading taxonomy %s...", taxonomy_type)
"""Return the taxonomy of type `taxonomy_type`.
The taxonomy is cached in memory and locally on disk. Every 12h, we check
if a new version is available and download if True.
if taxonomy_type not in settings.TAXONOMY_URLS:
raise ValueError(f"unknown taxonomy type: {taxonomy_type}")
A local static version can also be fetched (for unit tests for example)
with `offline=True`.
:param taxonomy_type: the taxonomy type
:param offline: if True, return a local static version of the taxonomy,
defaults to False. It's not available for all taxonomy types.
:return: the Taxonomy
"""
if offline:
return Taxonomy.from_path(str(settings.TAXONOMY_PATHS[taxonomy_type]))

url = settings.TAXONOMY_URLS[taxonomy_type]
return fetch_taxonomy(
url,
fallback_path=str(settings.TAXONOMY_PATHS.get(taxonomy_type, "")) or None,
offline=offline,
return _get_taxonomy(
TaxonomyType[taxonomy_type],
force_download=False,
cache_dir=settings.DATA_DIR / "taxonomies",
)


Expand Down
3 changes: 2 additions & 1 deletion scripts/category_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
import os
from typing import Iterator, Optional

from openfoodfacts.taxonomy import Taxonomy, TaxonomyNode
from sklearn.model_selection import train_test_split

from robotoff import settings
from robotoff.products import ProductDataset, ProductStream
from robotoff.taxonomy import Taxonomy, TaxonomyNode, get_taxonomy
from robotoff.taxonomy import get_taxonomy
from robotoff.types import JSONType
from robotoff.utils import dump_jsonl, get_logger

Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@ def peewee_db(peewee_db_create):

@pytest.fixture(scope="session")
def category_taxonomy():
return Taxonomy.from_json(settings.TAXONOMY_PATHS["category"])
return Taxonomy.from_path(settings.TAXONOMY_PATHS["category"])
Loading

0 comments on commit eaaeca3

Please sign in to comment.