From 6c83b8c9141314305da52d20acb2b43122ec4db2 Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Sat, 24 Aug 2024 18:03:52 +0200
Subject: [PATCH] feat(batch - spellcheck): :zap: From predictions to insights

---
Reviewer notes (free text placed after the "---" marker and before the first
"diff --git" header; it is not part of the commit message or of any hunk):

  * The line structure of this patch was restored from a copy whose newlines
    had been collapsed. Indentation of context lines is inferred from Python
    syntax -- verify against the target tree before applying.
  * Fixes made to added lines (each is a same-line replacement, so hunk line
    counts and the diffstat below are unchanged; the post-image blob ids on
    the "index" lines were NOT recomputed):
      - robotoff/batch/importer.py: `value_tag` is now read from the row
        (`row[features_dict["value_tag"]]`) like `value` and `barcode`,
        instead of being the one-element list `["lang"]`.
      - robotoff/batch/importer.py: the ValueError raised when the parquet
        read fails now chains the original exception (`from e`).
      - robotoff/batch/importer.py: the duplicated `:rtype:` docstring line
        is replaced by a `:raises ValueError:` line.
      - robotoff/insights/importer.py: `is_conflicting_insight` now returns
        the comparison; before, the expression was evaluated and discarded,
        so the method always returned None.
  * Open questions, deliberately left unchanged:
      - `BatchJobImportResource` is added but no `api.add_route(...)` call
        registers it in this patch; presumably a route is still needed --
        TODO confirm.
      - `bucket_handler.download_file` is passed to
        `generate_predictions_from_batch` without being called. If it is a
        method rather than a readable object, `pd.read_parquet` will fail --
        confirm against robotoff/batch/buckets.py.

 robotoff/app/api.py           | 28 ++++++++++++++++--
 robotoff/batch/__init__.py    |  1 +
 robotoff/batch/importer.py    | 55 +++++++++++++++++++++++++++++++++++
 robotoff/insights/importer.py | 29 ++++++++++++++++++
 4 files changed, 110 insertions(+), 3 deletions(-)
 create mode 100644 robotoff/batch/importer.py

diff --git a/robotoff/app/api.py b/robotoff/app/api.py
index 4cf3c5eaa7..5daf219fb3 100644
--- a/robotoff/app/api.py
+++ b/robotoff/app/api.py
@@ -92,6 +92,8 @@
     GoogleBatchJobConfig,
     BatchExtraction,
     GoogleStorageBucketForBatchJob,
+    generate_predictions_from_batch,
+
 )
 
 logger = get_logger()
@@ -1762,7 +1764,7 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
         resp.status = falcon.HTTP_200
 
 
-class BatchJobResource:
+class BatchJobLaunchResource:
     def on_post(self, req: falcon.Request, resp: falcon.Response):
         job_type_str: str = req.get_param("job_type", required=True)
 
@@ -1779,7 +1781,6 @@ def on_post(self, req: falcon.Request, resp: falcon.Response):
         )
         if not BatchExtraction.extracted_file_path:
             raise ValueError("The extracted file was not found.")
-
         bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type)
         bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path)
 
@@ -1789,6 +1790,27 @@ def on_post(self, req: falcon.Request, resp: falcon.Response):
         resp.media = {"batch_job_details": batch_job}
 
 
+class BatchJobImportResource:
+    def on_post(self, req: falcon.Request, resp: falcon.Response):
+        job_type_str: str = req.get_param("job_type", required=True)
+
+        from robotoff.insights.importer import import_insights
+        try:
+            job_type = BatchJobType[job_type_str]
+        except KeyError:
+            raise falcon.HTTPBadRequest(
+                description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}"
+            )
+
+        bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type)
+        predictions = generate_predictions_from_batch(
+            bucket_handler.download_file,
+            job_type
+        )
+        with db:
+            import_insights(predictions=predictions, server_type="off")
+
+
 def custom_handle_uncaught_exception(
     req: falcon.Request, resp: falcon.Response, ex: Exception, params
 ):
@@ -1856,4 +1878,4 @@ def custom_handle_uncaught_exception(
 api.add_route("/api/v1/predictions", PredictionCollection())
 api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection())
 api.add_route("/robots.txt", RobotsTxtResource())
-api.add_route("/api/v1/batch/launch", BatchJobResource())
\ No newline at end of file
+api.add_route("/api/v1/batch/launch", BatchJobLaunchResource())
\ No newline at end of file
diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py
index 7bb0a17d87..d9470f8e2b 100644
--- a/robotoff/batch/__init__.py
+++ b/robotoff/batch/__init__.py
@@ -9,3 +9,4 @@
 from .buckets import (
     GoogleStorageBucketForBatchJob,
 )
+from .importer import generate_predictions_from_batch
diff --git a/robotoff/batch/importer.py b/robotoff/batch/importer.py
new file mode 100644
index 0000000000..d8df8d48ec
--- /dev/null
+++ b/robotoff/batch/importer.py
@@ -0,0 +1,55 @@
+import io
+from typing import Iterator
+
+import pandas as pd
+
+from robotoff.batch import BatchJobType
+from robotoff.types import Prediction, PredictionType
+
+
+BATCH_JOB_TYPE_TO_FEATURES = {
+    BatchJobType.ingredients_spellcheck: {
+        "barcode": "code",
+        "value": "correction",
+        "value_tag": "lang",
+    },
+}
+
+BATCH_JOB_TYPE_TO_PREDICTION_TYPE = {
+    BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck,
+}
+
+PREDICTOR_VERSION = "1"
+
+
+def generate_predictions_from_batch(
+    f: io.BufferedReader,
+    job_type: BatchJobType
+) -> Iterator[Prediction]:
+    """From a file imported from google storage, generate predictions depending on the job type.
+
+    :param f: Readable object. Should be a parquet file.
+    :type f: io.BufferedReader
+    :param job_type: Batch job type.
+    :type job_type: BatchJobType
+    :raises ValueError: If the file cannot be read as parquet.
+    :yield: Predictions.
+    :rtype: Iterator[Prediction]
+    """
+    features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type]
+    prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type]
+
+    try:
+        df = pd.read_parquet(f)
+    except Exception as e:
+        raise ValueError(f"Failed to read parquet file: {e}") from e
+
+    for _, row in df.iterrows():
+        yield Prediction(
+            type=prediction_type,
+            value=row[features_dict["value"]],
+            value_tag=row[features_dict["value_tag"]],
+            barcode=row[features_dict["barcode"]],
+            predictor_version=PREDICTOR_VERSION,
+            predictor="llm",
+        )
diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py
index bc5cdcebde..275393b9eb 100644
--- a/robotoff/insights/importer.py
+++ b/robotoff/insights/importer.py
@@ -1475,6 +1475,35 @@ def compute_crop_bounding_box(
     return results
 
 
+class IngredientsSpellcheckImporter(InsightImporter):
+
+    @staticmethod
+    def get_type() -> InsightType:
+        return InsightType.ingredient_spellcheck
+
+    @classmethod
+    def get_required_prediction_types(cls) -> set[PredictionType]:
+        return {PredictionType.ingredient_spellcheck}
+
+    @classmethod
+    def generate_candidates(
+        cls,
+        product: Optional[Product],
+        predictions: list[Prediction],
+        product_id: ProductIdentifier,
+    ) -> Iterator[ProductInsight]:
+        # No reason to have different candidates for now
+        candidate = predictions[0]
+        yield ProductInsight(**candidate.to_dict())
+
+    @classmethod
+    def is_conflicting_insight(
+        cls,
+        candidate: ProductInsight,
+        reference: ProductInsight
+    ) -> bool:
+        return candidate.value == reference.value
+
 class PackagingElementTaxonomyException(Exception):
     pass
 