Skip to content

Commit

Permalink
feat: use gzipped version of fallback taxonomy files
Browse files Browse the repository at this point in the history
And fix UTF-8 encoding errors in ingredient taxonomy JSON
  • Loading branch information
raphael0202 committed Oct 11, 2022
1 parent 67112f9 commit 813cb41
Show file tree
Hide file tree
Showing 10 changed files with 6 additions and 13 deletions.
1 change: 0 additions & 1 deletion data/taxonomies/brands.full.json

This file was deleted.

Binary file added data/taxonomies/brands.full.json.gz
Binary file not shown.
1 change: 0 additions & 1 deletion data/taxonomies/categories.full.json

This file was deleted.

Binary file added data/taxonomies/categories.full.json.gz
Binary file not shown.
1 change: 0 additions & 1 deletion data/taxonomies/ingredients.full.json

This file was deleted.

Binary file added data/taxonomies/ingredients.full.json.gz
Binary file not shown.
1 change: 0 additions & 1 deletion data/taxonomies/labels.full.json

This file was deleted.

Binary file added data/taxonomies/labels.full.json.gz
Binary file not shown.
8 changes: 4 additions & 4 deletions robotoff/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,10 @@ def off_credentials() -> Dict[str, str]:
# Taxonomies are huge JSON files that describe many concepts in OFF, in many languages, with synonyms. These are the full versions of the taxonomies.

TAXONOMY_DIR = DATA_DIR / "taxonomies"
TAXONOMY_CATEGORY_PATH = TAXONOMY_DIR / "categories.full.json"
TAXONOMY_INGREDIENT_PATH = TAXONOMY_DIR / "ingredients.full.json"
TAXONOMY_LABEL_PATH = TAXONOMY_DIR / "labels.full.json"
TAXONOMY_BRAND_PATH = TAXONOMY_DIR / "brands.full.json"
TAXONOMY_CATEGORY_PATH = TAXONOMY_DIR / "categories.full.json.gz"
TAXONOMY_INGREDIENT_PATH = TAXONOMY_DIR / "ingredients.full.json.gz"
TAXONOMY_LABEL_PATH = TAXONOMY_DIR / "labels.full.json.gz"
TAXONOMY_BRAND_PATH = TAXONOMY_DIR / "brands.full.json.gz"
INGREDIENTS_FR_PATH = TAXONOMY_DIR / "ingredients_fr.txt"
INGREDIENT_TOKENS_PATH = TAXONOMY_DIR / "ingredients_tokens.txt"
FR_TOKENS_PATH = TAXONOMY_DIR / "fr_tokens_lower.gz"
Expand Down
7 changes: 2 additions & 5 deletions robotoff/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@
from enum import Enum
from typing import Dict, Iterable, List, Optional, Set, Union

import orjson
import requests

from robotoff import settings
from robotoff.utils import get_logger, http_session
from robotoff.utils import get_logger, http_session, load_json
from robotoff.utils.cache import CachedStore
from robotoff.utils.types import JSONType

Expand Down Expand Up @@ -223,9 +222,7 @@ def from_dict(cls, data: JSONType) -> "Taxonomy":

@classmethod
def from_json(cls, file_path: Union[str, pathlib.Path]):
with open(str(file_path), "rb") as f:
data = orjson.loads(f.read())
return cls.from_dict(data)
return cls.from_dict(load_json(file_path, compressed=True))

def to_graph(self):
"""Generate a networkx.DiGraph from the taxonomy."""
Expand Down

0 comments on commit 813cb41

Please sign in to comment.