diff --git a/robotoff/app/api.py b/robotoff/app/api.py index eaeb3b96f2..3f9b182c98 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -105,7 +105,10 @@ def get_server_type_from_req( ) -> ServerType: """Get `ServerType` value from POST x-www-form-urlencoded or GET requests.""" - server_type_str = req.get_param("server_type") + if req.media and "server_type" in req.media: + server_type_str = req.media["server_type"] + else: + server_type_str = req.get_param("server_type") if server_type_str is None: return default @@ -445,7 +448,7 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): Prediction.type == PredictionType.nutrient.name, Prediction.source_image.in_( [ - generate_image_path(barcode, image_id) + generate_image_path(product_id, image_id) for image_id in target_image_ids ] ), @@ -682,13 +685,13 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): class ImagePredictionImporterResource: @jsonschema.validate(schema.IMAGE_PREDICTION_IMPORTER_SCHEMA) def on_post(self, req: falcon.Request, resp: falcon.Response): + server_type = get_server_type_from_req(req) timestamp = datetime.datetime.utcnow() inserts = [] for prediction in req.media["predictions"]: - source_image = generate_image_path( - prediction["barcode"], prediction.pop("image_id") - ) + product_id = ProductIdentifier(prediction["barcode"], server_type) + source_image = generate_image_path(product_id, prediction.pop("image_id")) inserts.append( { "timestamp": timestamp, @@ -698,7 +701,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): ) inserted = batch_insert(ImagePrediction, inserts) - logger.info("{} image predictions inserted".format(inserted)) + logger.info("%s image predictions inserted", inserted) class ImagePredictionResource: diff --git a/robotoff/app/schema.py b/robotoff/app/schema.py index 0200e72f2e..be600f4a6c 100644 --- a/robotoff/app/schema.py +++ b/robotoff/app/schema.py @@ -15,6 +15,7 @@ "model_name": {"type": "string"}, "model_version": 
{"type": "string"}, "data": {"type": "object"}, + "server_type": {"type": "string"}, }, "required": [ "barcode", diff --git a/robotoff/cli/logos.py b/robotoff/cli/logos.py index a5aabd55a1..a3f7e5b07c 100644 --- a/robotoff/cli/logos.py +++ b/robotoff/cli/logos.py @@ -7,7 +7,7 @@ from robotoff.logos import filter_logos from robotoff.models import ImageModel, ImagePrediction, LogoAnnotation, db from robotoff.off import generate_image_path -from robotoff.types import ServerType +from robotoff.types import ProductIdentifier, ServerType from robotoff.utils import get_logger, jsonl_iter logger = get_logger(__name__) @@ -46,8 +46,9 @@ def import_logos( timestamp = datetime.datetime.utcnow() for item in batch: barcode = item["barcode"] + product_id = ProductIdentifier(barcode, server_type) source_image = generate_image_path( - barcode=barcode, image_id=item["image_id"] + product_id=product_id, image_id=item["image_id"] ) key = (model_name, source_image) diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index 3c379413de..0bc426fb17 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -346,10 +346,11 @@ def import_images_in_db( barcode = product.barcode for image_id in (id_ for id_ in product.images.keys() if id_.isdigit()): if (barcode, image_id) not in existing_images: + product_id = ProductIdentifier(barcode, server_type) to_add.append( ( - ProductIdentifier(barcode, server_type), - generate_image_path(barcode, image_id), + product_id, + generate_image_path(product_id, image_id), ) ) diff --git a/robotoff/images.py b/robotoff/images.py index 228ec44456..3ad04e8d90 100644 --- a/robotoff/images.py +++ b/robotoff/images.py @@ -124,7 +124,7 @@ def refresh_images_in_db(product_id: ProductIdentifier, images: JSONType): missing_image_ids = set(image_ids) - existing_image_ids for missing_image_id in missing_image_ids: - source_image = generate_image_path(product_id.barcode, missing_image_id) + source_image = generate_image_path(product_id, missing_image_id) 
image_url = generate_image_url(product_id, missing_image_id) logger.debug("Creating missing image %s in DB", source_image) save_image(product_id, source_image, image_url, images) diff --git a/robotoff/models.py b/robotoff/models.py index d6d86175e5..88af9bccb3 100644 --- a/robotoff/models.py +++ b/robotoff/models.py @@ -79,7 +79,7 @@ class ProductInsight(BaseModel): id = peewee.UUIDField(primary_key=True, default=uuid.uuid4) # Barcode represents the barcode of the product for which the insight was - # generated. + # generated. It is prefixed by `{ORG_ID}/` for the pro platform. barcode = peewee.CharField(max_length=100, null=False, index=True) # Type represents the insight type - must match one of the types in diff --git a/robotoff/off.py b/robotoff/off.py index 61dc727a1f..5bb0bb4ac0 100644 --- a/robotoff/off.py +++ b/robotoff/off.py @@ -66,9 +66,6 @@ def get_username(self) -> Optional[str]: return None -BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$") - - def get_source_from_url(ocr_url: str) -> str: url_path = urlparse(ocr_url).path @@ -98,38 +95,96 @@ def get_barcode_from_path(path: str) -> Optional[str]: return barcode or None +BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$") + + def split_barcode(barcode: str) -> list[str]: + """Split barcode in the same way as done by Product Opener to generate a + product image folder. + + :param barcode: The barcode of the product. 
For the pro platform only, + it must be prefixed with the org ID using the format + `{ORG_ID}/{BARCODE}` + :raises ValueError: raise a ValueError if `barcode` is invalid + :return: a list containing the split barcode + """ + org_id = None + if "/" in barcode: + # For the pro platform, `barcode` is expected to be in the format + # `{ORG_ID}/{BARCODE}` (ex: `org-lea-nature/3307130803004`) + org_id, barcode = barcode.split("/", maxsplit=1) + if not barcode.isdigit(): - raise ValueError("unknown barcode format: {}".format(barcode)) + raise ValueError(f"unknown barcode format: {barcode}") match = BARCODE_PATH_REGEX.fullmatch(barcode) - if match: - return [x for x in match.groups() if x] + splits = [x for x in match.groups() if x] if match else [barcode] + + if org_id is not None: + # For the pro platform only, images and OCRs belonging to an org + # are stored in a folder named after the org for all its products, ex: + # https://images.pro.openfoodfacts.org/images/products/org-lea-nature/330/713/080/3004/1.jpg + splits.append(org_id) - return [barcode] + return splits -def generate_image_path(barcode: str, image_id: str) -> str: - splitted_barcode = split_barcode(barcode) - return "/{}/{}.jpg".format("/".join(splitted_barcode), image_id) +def _generate_file_path(product_id: ProductIdentifier, image_id: str, suffix: str): + splitted_barcode = split_barcode(product_id.barcode) + return f"/{'/'.join(splitted_barcode)}/{image_id}{suffix}" -def generate_json_path(barcode: str, image_id: str) -> str: - splitted_barcode = split_barcode(barcode) - return "/{}/{}.json".format("/".join(splitted_barcode), image_id) +def generate_image_path(product_id: ProductIdentifier, image_id: str) -> str: + """Generate an image path. + + It's used to generate a unique identifier of an image for a product (and + to generate a URL to fetch this image from the server). + + :param product_id: the product identifier + :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...) 
+ :return: the full image path + """ + return _generate_file_path(product_id, image_id, ".jpg") + + +def generate_json_ocr_path(product_id: ProductIdentifier, image_id: str) -> str: + """Generate a JSON OCR path. + + It's used to generate a unique identifier of an OCR result for a product + (and to generate a URL to fetch this OCR JSON from the server). + + :param product_id: the product identifier + :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...) + :return: the full JSON OCR path + """ + return _generate_file_path(product_id, image_id, ".json") def generate_json_ocr_url(product_id: ProductIdentifier, image_id: str) -> str: + """Generate the OCR JSON URL for a specific product and + image ID. + + :param product_id: the product identifier + :param image_id: the image ID (ex: `1`, `2`,...) + :return: the generated OCR JSON URL + """ return ( settings.BaseURLProvider.static(product_id.server_type) - + f"/images/products{generate_json_path(product_id.barcode, image_id)}" + + f"/images/products{generate_json_ocr_path(product_id, image_id)}" ) def generate_image_url(product_id: ProductIdentifier, image_id: str) -> str: + """Generate the image URL for a specific product and + image ID. + + :param product_id: the product identifier + :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...) 
+ :return: the generated image URL + """ return settings.BaseURLProvider.image_url( - product_id.server_type, generate_image_path(product_id.barcode, image_id) + product_id.server_type, generate_image_path(product_id, image_id) ) diff --git a/robotoff/products.py b/robotoff/products.py index c2b74465b5..308a7ec8b2 100644 --- a/robotoff/products.py +++ b/robotoff/products.py @@ -521,7 +521,10 @@ def __len__(self): def get_product( self, product_id: ProductIdentifier, projection: Optional[list[str]] = None ) -> Optional[JSONType]: - return self.collection.find_one({"code": product_id.barcode}, projection) + # We use `_id` instead of `code` field, as `_id` contains org ID + + # barcode for pro platform, which is also the case for + # `product_id.barcode` + return self.collection.find_one({"_id": product_id.barcode}, projection) def __getitem__(self, product_id: ProductIdentifier) -> Optional[Product]: product = self.get_product(product_id) diff --git a/robotoff/types.py b/robotoff/types.py index ac034f45b6..507ab5ec40 100644 --- a/robotoff/types.py +++ b/robotoff/types.py @@ -256,7 +256,9 @@ class ProductIdentifier: """Dataclass to uniquely identify a product across all Open*Facts projects, with: - - the product barcode + - the product barcode: for the pro platform, it must be in the format + `{ORG_ID}/{BARCODE}` (ex: `org-lea-nature/3307130803004`), otherwise it's + the barcode only - the project specified by the ServerType """ diff --git a/robotoff/workers/queues.py b/robotoff/workers/queues.py index 582925064e..8c1b9f82e8 100644 --- a/robotoff/workers/queues.py +++ b/robotoff/workers/queues.py @@ -41,7 +41,8 @@ def get_high_queue(product_id: Optional[ProductIdentifier] = None) -> Queue: return random.choice(high_queues) # We compute a md5 hash of the barcode and convert the 4 last bytes to an - # int (long) This way, we make sure the distribution of `barcode_hash` is + # int (long) + # This way, we make sure the distribution of `barcode_hash` is # uniform and that 
all queues are sampled evenly with `queue_idx = # barcode_hash % len(high_queues)` barcode_hash: int = struct.unpack( diff --git a/scripts/insert_images.py b/scripts/insert_images.py index 0421272eaa..6dde7ca465 100644 --- a/scripts/insert_images.py +++ b/scripts/insert_images.py @@ -37,7 +37,7 @@ if (str(product.barcode), str(image_id)) in seen_set: continue - source_image = generate_image_path(product.barcode, str(image_id)) + source_image = generate_image_path(product_id, str(image_id)) image_url = generate_image_url(product_id, str(image_id)) try: