Skip to content

Commit

Permalink
fix: support pro platform for MongoDB queries and image/OCR URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed May 23, 2023
1 parent a8b8292 commit f8d1c64
Show file tree
Hide file tree
Showing 11 changed files with 98 additions and 31 deletions.
15 changes: 9 additions & 6 deletions robotoff/app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,10 @@ def get_server_type_from_req(
) -> ServerType:
"""Get `ServerType` value from POST x-www-form-urlencoded or GET
requests."""
server_type_str = req.get_param("server_type")
if req.media and "server_type" in req.media:
server_type_str = req.media["server_type"]
else:
server_type_str = req.get_param("server_type")

if server_type_str is None:
return default
Expand Down Expand Up @@ -445,7 +448,7 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
Prediction.type == PredictionType.nutrient.name,
Prediction.source_image.in_(
[
generate_image_path(barcode, image_id)
generate_image_path(product_id, image_id)
for image_id in target_image_ids
]
),
Expand Down Expand Up @@ -682,13 +685,13 @@ def on_get(self, req: falcon.Request, resp: falcon.Response):
class ImagePredictionImporterResource:
@jsonschema.validate(schema.IMAGE_PREDICTION_IMPORTER_SCHEMA)
def on_post(self, req: falcon.Request, resp: falcon.Response):
server_type = get_server_type_from_req(req)
timestamp = datetime.datetime.utcnow()
inserts = []

for prediction in req.media["predictions"]:
source_image = generate_image_path(
prediction["barcode"], prediction.pop("image_id")
)
product_id = ProductIdentifier(prediction["barcode"], server_type)
source_image = generate_image_path(product_id, prediction.pop("image_id"))
inserts.append(
{
"timestamp": timestamp,
Expand All @@ -698,7 +701,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response):
)

inserted = batch_insert(ImagePrediction, inserts)
logger.info("{} image predictions inserted".format(inserted))
logger.info("%s image predictions inserted", inserted)


class ImagePredictionResource:
Expand Down
1 change: 1 addition & 0 deletions robotoff/app/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"model_name": {"type": "string"},
"model_version": {"type": "string"},
"data": {"type": "object"},
"server_type": {"type": "string"},
},
"required": [
"barcode",
Expand Down
5 changes: 3 additions & 2 deletions robotoff/cli/logos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from robotoff.logos import filter_logos
from robotoff.models import ImageModel, ImagePrediction, LogoAnnotation, db
from robotoff.off import generate_image_path
from robotoff.types import ServerType
from robotoff.types import ProductIdentifier, ServerType
from robotoff.utils import get_logger, jsonl_iter

logger = get_logger(__name__)
Expand Down Expand Up @@ -46,8 +46,9 @@ def import_logos(
timestamp = datetime.datetime.utcnow()
for item in batch:
barcode = item["barcode"]
product_id = ProductIdentifier(barcode, server_type)
source_image = generate_image_path(
barcode=barcode, image_id=item["image_id"]
product_id=product_id, image_id=item["image_id"]
)
key = (model_name, source_image)

Expand Down
5 changes: 3 additions & 2 deletions robotoff/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,10 +346,11 @@ def import_images_in_db(
barcode = product.barcode
for image_id in (id_ for id_ in product.images.keys() if id_.isdigit()):
if (barcode, image_id) not in existing_images:
product_id = ProductIdentifier(barcode, server_type)
to_add.append(
(
ProductIdentifier(barcode, server_type),
generate_image_path(barcode, image_id),
product_id,
generate_image_path(product_id, image_id),
)
)

Expand Down
2 changes: 1 addition & 1 deletion robotoff/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def refresh_images_in_db(product_id: ProductIdentifier, images: JSONType):
missing_image_ids = set(image_ids) - existing_image_ids

for missing_image_id in missing_image_ids:
source_image = generate_image_path(product_id.barcode, missing_image_id)
source_image = generate_image_path(product_id, missing_image_id)
image_url = generate_image_url(product_id, missing_image_id)
logger.debug("Creating missing image %s in DB", source_image)
save_image(product_id, source_image, image_url, images)
2 changes: 1 addition & 1 deletion robotoff/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class ProductInsight(BaseModel):
id = peewee.UUIDField(primary_key=True, default=uuid.uuid4)

# Barcode represents the barcode of the product for which the insight was
# generated.
# generated. It is prefixed by `{ORG_ID}/` for the pro platform.
barcode = peewee.CharField(max_length=100, null=False, index=True)

# Type represents the insight type - must match one of the types in
Expand Down
85 changes: 70 additions & 15 deletions robotoff/off.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,6 @@ def get_username(self) -> Optional[str]:
return None


BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$")


def get_source_from_url(ocr_url: str) -> str:
url_path = urlparse(ocr_url).path

Expand Down Expand Up @@ -98,38 +95,96 @@ def get_barcode_from_path(path: str) -> Optional[str]:
return barcode or None


# Product Opener stores product files in nested folders made of the first
# three groups of three characters of the barcode, followed by the remainder.
BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$")


def split_barcode(barcode: str) -> list[str]:
    """Split a barcode the same way Product Opener does to build a product
    image folder.

    :param barcode: The barcode of the product. For the pro platform only,
        it must be prefixed with the org ID using the format
        `{ORG_ID}/{BARCODE}`
    :raises ValueError: if the barcode part (without the org ID prefix) is
        not composed of digits only
    :return: the list of path components: the org ID first (pro platform
        only), then the barcode parts
    """
    org_id = None
    if "/" in barcode:
        # For the pro platform, `barcode` is expected to be in the format
        # `{ORG_ID}/{BARCODE}` (ex: `org-lea-nature/3307130803004`)
        org_id, barcode = barcode.split("/", maxsplit=1)

    if not barcode.isdigit():
        raise ValueError(f"unknown barcode format: {barcode}")

    match = BARCODE_PATH_REGEX.fullmatch(barcode)

    # Barcodes shorter than 9 characters don't match the regex and are kept
    # as a single component. The trailing group is empty for barcodes of
    # exactly 9 characters, hence the filtering of empty groups.
    splits = [x for x in match.groups() if x] if match else [barcode]

    if org_id is not None:
        # For the pro platform only, images and OCRs belonging to an org
        # are stored in a folder named after the org for all its products, ex:
        # https://images.pro.openfoodfacts.org/images/products/org-lea-nature/330/713/080/3004/1.jpg
        # The org folder comes *before* the barcode folders, so it must be
        # the first path component.
        splits.insert(0, org_id)

    return splits


def generate_image_path(barcode: str, image_id: str) -> str:
splitted_barcode = split_barcode(barcode)
return "/{}/{}.jpg".format("/".join(splitted_barcode), image_id)
def _generate_file_path(product_id: ProductIdentifier, image_id: str, suffix: str):
    """Build the path of a file attached to a product image.

    :param product_id: the product identifier, whose barcode is split into
        folder names with `split_barcode`
    :param image_id: the image ID, used as the file name
    :param suffix: text appended as-is after the image ID (callers in this
        module pass a file extension with its leading dot, ex: `.jpg`)
    :return: the path, starting with `/`
    """
    folder = "/".join(split_barcode(product_id.barcode))
    return f"/{folder}/{image_id}{suffix}"


def generate_json_path(barcode: str, image_id: str) -> str:
splitted_barcode = split_barcode(barcode)
return "/{}/{}.json".format("/".join(splitted_barcode), image_id)
def generate_image_path(product_id: ProductIdentifier, image_id: str) -> str:
    """Return the path of a product image.

    The path is used as a unique identifier of an image for a product, and
    to generate a URL to fetch this image from the server.

    :param product_id: the product identifier
    :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
    :return: the full image path, ending with `.jpg`
    """
    image_path = _generate_file_path(product_id, image_id, ".jpg")
    return image_path


def generate_json_ocr_path(product_id: ProductIdentifier, image_id: str) -> str:
    """Return the path of the OCR JSON file of a product image.

    The path is used as a unique identifier of an OCR result for a product,
    and to generate a URL to fetch this OCR JSON from the server.

    :param product_id: the product identifier
    :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
    :return: the full OCR JSON path, ending with `.json`
    """
    ocr_path = _generate_file_path(product_id, image_id, ".json")
    return ocr_path


def generate_json_ocr_url(product_id: ProductIdentifier, image_id: str) -> str:
    """Generate the OCR JSON URL for a specific product and image ID.

    :param product_id: the product identifier
    :param image_id: the image ID (ex: `1`, `2`,...)
    :return: the generated OCR JSON URL
    """
    # The path is built from the whole product identifier (not only the
    # barcode) through `generate_json_ocr_path`, and appended exactly once
    # to the static base URL of the product's server type.
    return (
        settings.BaseURLProvider.static(product_id.server_type)
        + f"/images/products{generate_json_ocr_path(product_id, image_id)}"
    )


def generate_image_url(product_id: ProductIdentifier, image_id: str) -> str:
    """Generate the image URL for a specific product and image ID.

    :param product_id: the product identifier
    :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
    :return: the generated image URL
    """
    # `generate_image_path` expects the product identifier itself, not the
    # bare barcode string.
    return settings.BaseURLProvider.image_url(
        product_id.server_type, generate_image_path(product_id, image_id)
    )


Expand Down
5 changes: 4 additions & 1 deletion robotoff/products.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,10 @@ def __len__(self):
def get_product(
    self, product_id: ProductIdentifier, projection: Optional[list[str]] = None
) -> Optional[JSONType]:
    """Fetch a single product document from the collection.

    :param product_id: the product identifier; its `barcode` is used as the
        lookup key
    :param projection: optional list of fields passed through to `find_one`
    :return: whatever `self.collection.find_one` returns for the query
    """
    # We use `_id` instead of `code` field, as `_id` contains org ID +
    # barcode for pro platform, which is also the case for
    # `product_id.barcode`
    return self.collection.find_one({"_id": product_id.barcode}, projection)

def __getitem__(self, product_id: ProductIdentifier) -> Optional[Product]:
product = self.get_product(product_id)
Expand Down
4 changes: 3 additions & 1 deletion robotoff/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,9 @@ class ProductIdentifier:
"""Dataclass to uniquely identify a product across all Open*Facts
projects, with:
- the product barcode
- the product barcode: for the pro platform, it must be in the format
`{ORG_ID}/{BARCODE}` (ex: `org-lea-nature/3307130803004`), otherwise it's
the barcode only
- the project specified by the ServerType
"""

Expand Down
3 changes: 2 additions & 1 deletion robotoff/workers/queues.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def get_high_queue(product_id: Optional[ProductIdentifier] = None) -> Queue:
return random.choice(high_queues)

# We compute a md5 hash of the barcode and convert the 4 last bytes to an
# int (long) This way, we make sure the distribution of `barcode_hash` is
# int (long)
# This way, we make sure the distribution of `barcode_hash` is
# uniform and that all queues are sampled evenly with `queue_idx =
# barcode_hash % len(high_queues)`
barcode_hash: int = struct.unpack(
Expand Down
2 changes: 1 addition & 1 deletion scripts/insert_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
if (str(product.barcode), str(image_id)) in seen_set:
continue

source_image = generate_image_path(product.barcode, str(image_id))
source_image = generate_image_path(product_id, str(image_id))
image_url = generate_image_url(product_id, str(image_id))

try:
Expand Down

0 comments on commit f8d1c64

Please sign in to comment.