Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update category matching algorithm #924

Closed
wants to merge 8 commits into from
Prev Previous commit
Next Next commit
feat: use gzipped version of fallback taxonomy files
And fix UTF-8 encoding errors in the ingredient taxonomy JSON
raphael0202 committed Oct 11, 2022

Verified

This commit was signed with the committer’s verified signature.
AdmiralCurtiss Admiral H. Curtiss
commit 813cb41aaa44ceb2b82b864a92f36a5582843e1b
1 change: 0 additions & 1 deletion data/taxonomies/brands.full.json

This file was deleted.

Binary file added data/taxonomies/brands.full.json.gz
Binary file not shown.
1 change: 0 additions & 1 deletion data/taxonomies/categories.full.json

This file was deleted.

Binary file added data/taxonomies/categories.full.json.gz
Binary file not shown.
1 change: 0 additions & 1 deletion data/taxonomies/ingredients.full.json

This file was deleted.

Binary file added data/taxonomies/ingredients.full.json.gz
Binary file not shown.
1 change: 0 additions & 1 deletion data/taxonomies/labels.full.json

This file was deleted.

Binary file added data/taxonomies/labels.full.json.gz
Binary file not shown.
8 changes: 4 additions & 4 deletions robotoff/settings.py
Original file line number Diff line number Diff line change
@@ -129,10 +129,10 @@ def off_credentials() -> Dict[str, str]:
# Taxonomies are huge JSON files that describe many concepts in OFF, in many languages, with synonyms. These are the full versions of the taxonomies.

TAXONOMY_DIR = DATA_DIR / "taxonomies"
TAXONOMY_CATEGORY_PATH = TAXONOMY_DIR / "categories.full.json"
TAXONOMY_INGREDIENT_PATH = TAXONOMY_DIR / "ingredients.full.json"
TAXONOMY_LABEL_PATH = TAXONOMY_DIR / "labels.full.json"
TAXONOMY_BRAND_PATH = TAXONOMY_DIR / "brands.full.json"
TAXONOMY_CATEGORY_PATH = TAXONOMY_DIR / "categories.full.json.gz"
TAXONOMY_INGREDIENT_PATH = TAXONOMY_DIR / "ingredients.full.json.gz"
TAXONOMY_LABEL_PATH = TAXONOMY_DIR / "labels.full.json.gz"
TAXONOMY_BRAND_PATH = TAXONOMY_DIR / "brands.full.json.gz"
INGREDIENTS_FR_PATH = TAXONOMY_DIR / "ingredients_fr.txt"
INGREDIENT_TOKENS_PATH = TAXONOMY_DIR / "ingredients_tokens.txt"
FR_TOKENS_PATH = TAXONOMY_DIR / "fr_tokens_lower.gz"
7 changes: 2 additions & 5 deletions robotoff/taxonomy.py
Original file line number Diff line number Diff line change
@@ -4,11 +4,10 @@
from enum import Enum
from typing import Dict, Iterable, List, Optional, Set, Union

import orjson
import requests

from robotoff import settings
from robotoff.utils import get_logger, http_session
from robotoff.utils import get_logger, http_session, load_json
from robotoff.utils.cache import CachedStore
from robotoff.utils.types import JSONType

@@ -223,9 +222,7 @@ def from_dict(cls, data: JSONType) -> "Taxonomy":

@classmethod
def from_json(cls, file_path: Union[str, pathlib.Path]):
with open(str(file_path), "rb") as f:
data = orjson.loads(f.read())
return cls.from_dict(data)
return cls.from_dict(load_json(file_path, compressed=True))

def to_graph(self):
"""Generate a networkx.DiGraph from the taxonomy."""