openfoodfacts · raphael0202 · Aug 20, 2023 · Aug 16, 2023 · Aug 17, 2023 · Aug 17, 2023
@@ -95,19 +95,31 @@ log:
 # Management #
 #------------#
 
-dl-models:
-	@echo "🥫 Downloading model files …"
-	mkdir -p models/triton; \
+dl-models: dl-langid-model dl-object-detection-models dl-category-classifier-model dl-ingredient-detection-model # aggregate target: the per-model download targets are prerequisites, so they run before the message below is printed
+	@echo "⏬ Downloading all models …"
+
+dl-langid-model: # downloads the fastText lid.176.bin file into models/ (created if missing)
+	@echo "⏬ Downloading language identification model file …"
+	mkdir -p models; \
 	cd models; \
-	wget -cO - https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin > lid.176.bin; \
-	cd triton; \
+	wget -cO - https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin > lid.176.bin;
+
+dl-object-detection-models: # for each name in ML_OBJECT_DETECTION_MODELS, saves the <name>-1.0 release model.onnx under models/triton/<name without "tf-">/1/
+	@echo "⏬ Downloading object detection model files …"
+	mkdir -p models/triton; \
+	cd models/triton; \
 	for asset_name in ${ML_OBJECT_DETECTION_MODELS}; \
 		do \
 			dir=`echo $${asset_name} | sed 's/tf-//g'`; \
 			mkdir -p $${dir}/1; \
 			wget -cO - https://github.com/openfoodfacts/robotoff-models/releases/download/$${asset_name}-1.0/model.onnx > $${dir}/1/model.onnx; \
 	done; \
-	mkdir -p clip clip/1; \
+
+dl-category-classifier-model:
+	@echo "⏬ Downloading category classifier model files …"
+	mkdir -p models/triton; \
+	cd models/triton; \
+	mkdir -p clip/1; \
 	wget -cO - https://github.com/openfoodfacts/robotoff-models/releases/download/clip-vit-base-patch32/model.onnx > clip/1/model.onnx; \
 	dir=category-classifier-keras-image-embeddings-3.0/1/model.savedmodel; \
 	mkdir -p $${dir}; \
@@ -116,6 +128,17 @@ dl-models:
 	tar -xzvf saved_model.tar.gz --strip-component=1; \
 	rm saved_model.tar.gz
 
+# Download the ingredient detection ONNX archive and unpack it into models/triton/ingredient-ner/1/model.onnx/.
+# Steps are chained with && so a failed download, cd or extraction fails the target instead of being masked by rm.
+dl-ingredient-detection-model:
+	@echo "⏬ Downloading ingredient detection model files …"
+	dir=models/triton/ingredient-ner/1/model.onnx && \
+	mkdir -p $${dir} && \
+	wget -cO - https://github.com/openfoodfacts/robotoff-models/releases/download/pytorch-ingredient-detection-1.0/onnx.tar.gz > $${dir}/onnx.tar.gz && \
+	cd $${dir} && \
+	tar -xzvf onnx.tar.gz --strip-components=1 && \
+	rm onnx.tar.gz
+
 init-elasticsearch:
 	@echo "Initializing elasticsearch indices"
 	${DOCKER_COMPOSE} up -d elasticsearch 2>&1

@@ -1586,7 +1586,7 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
         )
 
         offset: int = (page - 1) * count
-        insights = [i for i in get_insights_(limit=count, offset=offset)]
+        insights = list(get_insights_(limit=count, offset=offset))
 
         response["count"] = get_insights_(count=True)
 

@@ -65,7 +65,7 @@
 
                 results = [r for r in item["result"] if r["score"] > 0.1]
                 data = {"objects": results}
-                max_confidence = max([r["score"] for r in results], default=None)
+                max_confidence = max((r["score"] for r in results), default=None)
 
                 inserted += 1
                 image_prediction = ImagePrediction.create(

@@ -1,13 +1,8 @@
-import logging
-import os
-import pathlib
 from pathlib import Path
 from typing import Optional
 
 import typer
 
-from robotoff.elasticsearch import get_es_client
-from robotoff.off import get_barcode_from_url
 from robotoff.types import (
     NeuralCategoryClassifierModel,
     ObjectDetectionModel,
@@ -164,6 +159,8 @@
     the deepmost categories for a predicted taxonomy chain. For example, if we
     predict 'fresh vegetables' -> 'legumes' -> 'beans' for a product, setting
     deepest_only=True will return 'beans'."""
+    import logging
+
     from robotoff.off import get_product
     from robotoff.prediction.category.neural.category_classifier import (
         CategoryClassifier,
@@ -201,11 +198,11 @@
     batch_size: int = typer.Option(
         128, help="Number of insights that are imported in each atomic SQL transaction"
     ),
-    input_path: Optional[pathlib.Path] = typer.Option(
+    input_path: Optional[Path] = typer.Option(
         None,
         help="Input path of the JSONL archive, is incompatible with --generate-from",
     ),
-    generate_from: Optional[pathlib.Path] = typer.Option(
+    generate_from: Optional[Path] = typer.Option(
         None, help="Input path of the OCR archive, is incompatible with --input-path"
     ),
     server_type: ServerType = typer.Option(
@@ -403,7 +400,7 @@
     from peewee import JOIN
 
     from robotoff.models import ImageModel, ImagePrediction, db
-    from robotoff.off import generate_image_url
+    from robotoff.off import generate_image_url, get_barcode_from_url
     from robotoff.utils import text_file_iter
     from robotoff.workers.queues import enqueue_job, low_queue
     from robotoff.workers.tasks.import_image import (
@@ -505,6 +502,7 @@
     from more_itertools import chunked
     from playhouse.postgres_ext import ServerSide
 
+    from robotoff.elasticsearch import get_es_client
     from robotoff.logos import add_logos_to_ann, get_stored_logo_ids
     from robotoff.models import LogoEmbedding, db
     from robotoff.utils import get_logger
@@ -664,7 +662,7 @@
 
 @app.command()
 def import_logos(
-    data_path: pathlib.Path = typer.Argument(
+    data_path: Path = typer.Argument(
         ...,
         help="Path to the JSONL file containing data to import",
         exists=True,
@@ -716,7 +714,7 @@
 
 @app.command()
 def export_logos(
-    output: pathlib.Path = typer.Argument(
+    output: Path = typer.Argument(
         ...,
         help="Path to the output file, can either have .jsonl or .jsonl.gz as "
         "extension",
@@ -838,6 +836,8 @@
         help="Directory where the OCR JSON should be saved",
     ),
 ) -> None:
+    import os
+
     import orjson
 
     from robotoff.cli.ocr import run_ocr_on_image

@@ -85,7 +85,7 @@ def run_object_detection_model(
         image, output_image=False
     )
     data = results.to_json(threshold=threshold)
-    max_confidence = max([item["score"] for item in data], default=None)
+    max_confidence = max((item["score"] for item in data), default=None)
     return ImagePrediction.create(
         image=image_model,
         type="object_detection",

@@ -313,12 +313,3 @@ def generate_recent_changes_metrics(items: Iterable[dict]) -> Iterator[dict]:
             "time": item["t"],
             "fields": {"count": 1},
         }
-
-
-def save_recent_changes_metrics():
-    from robotoff.utils import jsonl_iter
-
-    if (client := get_influx_client()) is not None:
-        write_client = client.write_api(write_options=SYNCHRONOUS)
-        inserts = jsonl_iter()
-        write_client.write(bucket=settings.INFLUXDB_BUCKET, record=inserts)
@@ -199,10 +199,7 @@
         response.raw_output_contents[0],
         dtype=np.float32,
     ).reshape((len(images_by_id), -1))
-    return {
-        image_id: embedding
-        for image_id, embedding in zip(images_by_id.keys(), computed_embeddings)
-    }
+    return dict(zip(images_by_id.keys(), computed_embeddings))
 
 
 def fetch_ocr_texts(product: JSONType, product_id: ProductIdentifier) -> list[str]:

@@ -483,8 +483,8 @@
     keypoints_x = [k[1] for k in keypoints]
     keypoints_y = [k[0] for k in keypoints]
     if use_normalized_coordinates:
-        keypoints_x = tuple([im_width * x for x in keypoints_x])
-        keypoints_y = tuple([im_height * y for y in keypoints_y])
+        keypoints_x = tuple((im_width * x for x in keypoints_x))
+        keypoints_y = tuple((im_height * y for y in keypoints_y))
     for keypoint_x, keypoint_y in zip(keypoints_x, keypoints_y):
         draw.ellipse(
             [

@@ -61,7 +61,7 @@ def is_valid_weight(weight_value: str) -> bool:
     try:
         weight_value_float = float(weight_value)
     except ValueError:
-        logger.warning("Weight value is not a float: {}" "".format(weight_value))
+        logger.warning("Weight value is not a float: %s", weight_value)
         return False
 
     if weight_value_float <= 0:
@@ -70,8 +70,7 @@ def is_valid_weight(weight_value: str) -> bool:
 
     if float(int(weight_value_float)) != weight_value_float:
         logger.info(
-            "Weight value is not an integer ({}), "
-            "returning non valid".format(weight_value)
+            "Weight value is not an integer (%s), returning non valid", weight_value
         )
         return False
 

@@ -172,7 +172,7 @@
     current_etag = r.headers.get("ETag", "").strip("'\"")
 
     logger.info("Dataset has changed, downloading file")
-    logger.debug("Saving temporary file in {}".format(output_path))
+    logger.debug("Saving temporary file in %s", output_path)
 
     with open(output_path, "wb") as f:
         shutil.copyfileobj(r.raw, f)

@@ -280,7 +280,7 @@ def mark_insights() -> int:
         insight.save()
         marked += 1
 
-    logger.info("{} insights marked".format(marked))
+    logger.info("%s insights marked", marked)
     return marked  # useful for tests
 
 

@@ -24,13 +24,9 @@
         if not node.parents:
             categories_hierarchy[root].add(category_index)
 
-        children_indexes = set(
-            [
-                category_to_index[c.id]
-                for c in node.children
-                if c.id in category_to_index
-            ]
-        )
+        children_indexes = {
+            category_to_index[c.id] for c in node.children if c.id in category_to_index
+        }
 
         categories_hierarchy[category_index] = categories_hierarchy[
             category_index

@@ -61,7 +61,7 @@ def __init__(self, case_sensitive: bool = False):
         self._keyword = "_keyword_"
         self._white_space_chars = set([".", "\t", "\n", "\a", " ", ","])
         self.non_word_boundaries = set(string.digits + string.ascii_letters + "_")
-        self.keyword_trie_dict = dict()  # type: ignore
+        self.keyword_trie_dict = {}  # type: ignore
         self.case_sensitive = case_sensitive
         self._terms_in_trie = 0
 

@@ -106,7 +106,7 @@ def generate_train_test_val_datasets(
 
 
 def run(lang: Optional[str] = None):
-    logger.info("Generating category dataset for lang {}".format(lang or "xx"))
+    logger.info("Generating category dataset for lang %s", lang or "xx")
     dataset = ProductDataset.load()
     training_stream = dataset.stream().filter_nonempty_tag_field("categories_tags")
 
@@ -136,7 +136,7 @@ def run(lang: Optional[str] = None):
             WRITE_PATH / "category_{}.{}.jsonl".format(lang or "xx", key),
             data,
         )
-        logger.info("{} items for dataset {}, lang {}".format(count, key, lang or "xx"))
+        logger.info("%s items for dataset %s, lang %s", count, key, lang or "xx")
 
 
 if __name__ == "__main__":

@@ -43,7 +43,7 @@ def run() -> None:
             data_source_image = insight.data["source"]
             if data_source_image == insight.source_image:
                 insight.data.pop("source")
-                logger.info("Deleting source field for insight {}".format(insight.id))
+                logger.info("Deleting source field for insight %s", insight.id)
                 count += 1
                 save = True
             else:
@@ -81,8 +81,8 @@ def run() -> None:
         if save:
             insight.save()
 
-    logger.info("Updated insights: {}".format(count))
-    logger.info("Errors: {}".format(errors))
+    logger.info("Updated insights: %s", count)
+    logger.info("Errors: %s", errors)
 
 
 if __name__ == "__main__":

@@ -59,12 +59,12 @@ def insert_batch(
         )
 
         if image_instance is None:
-            logger.warning("Unknown image in DB: {}".format(source_image))
+            logger.warning("Unknown image in DB: %s", source_image)
             continue
 
         results = [r for r in item["result"] if r["score"] > 0.1]
         data = {"objects": results}
-        max_confidence = max([r["score"] for r in results], default=None)
+        max_confidence = max((r["score"] for r in results), default=None)
 
         inserted += 1
         image_prediction = ImagePrediction.create(
@@ -97,7 +97,7 @@ def main():
     with db:
         inserted = insert_batch(DATA_PATH, MODEL_NAME, MODEL_VERSION, SERVER_TYPE)
 
-    logger.info("{} image predictions inserted".format(inserted))
+    logger.info("%s image predictions inserted", inserted)
 
 
 if __name__ == "__main__":

@@ -47,4 +47,4 @@
 
             saved += 1
 
-logger.info("{} image saved".format(saved))
+logger.info("%s image saved", saved)