chore: move aws image sync script to ovh3
root committed Oct 29, 2024
1 parent c674291 commit 42dd4fc
Showing 5 changed files with 268 additions and 0 deletions.
14 changes: 14 additions & 0 deletions confs/ovh3/systemd/system/sync_images_s3@.service
@@ -0,0 +1,14 @@
# service instance name "%i" can only be "off" (for now)
[Unit]
Description=Synchronize images to AWS S3 %i
# __ will be replaced by @ in email-failures@.service
OnFailure=email-failures@sync_images_s3__%i.service
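# e.g. for instance "off", the failure handler becomes
# email-failures@sync_images_s3__off.service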

[Service]
Type=oneshot
User=off
Group=off
# Warning: this script currently only works with the "off" product type
ExecStart=/home/off/.cargo/bin/uv run /opt/openfoodfacts-infrastructure/scripts/ovh3/sync-s3-images/sync_s3_images.py /rpool/off/images/products
KillMode=process

13 changes: 13 additions & 0 deletions confs/ovh3/systemd/system/sync_images_s3@.timer
@@ -0,0 +1,13 @@
# service instance name "%i" can only be "off" (for now)
[Unit]
Description=Synchronize images to AWS S3 weekly

[Timer]
# every tuesday
OnCalendar=Tue *-*-* 02:00:00
# service instance name "%i" is off / obf / opff / opf
Unit=sync_images_s3@%i.service

[Install]
WantedBy=multi-user.target

9 changes: 9 additions & 0 deletions scripts/ovh3/sync-s3-images/README.md
@@ -0,0 +1,9 @@
# AWS Open Dataset: Open Food Facts images

This directory contains the [script](./sync_s3_images.py) that synchronizes
images and OCR results from a directory containing all product images to the
`openfoodfacts-images` bucket, as part of the AWS Open Data program.

The dataset YAML description sent to the [AWS Open Data
registry](https://github.com/awslabs/open-data-registry/tree/main) can be found
at [openfoodfacts-images.yaml](./openfoodfacts-images.yaml).
17 changes: 17 additions & 0 deletions scripts/ovh3/sync-s3-images/openfoodfacts-images.yaml
@@ -0,0 +1,17 @@
Name: Open Food Facts Images
Description: A dataset of all images of Open Food Facts, the biggest open
dataset of food products in the world.
Documentation: https://openfoodfacts.github.io/openfoodfacts-server/api/aws-images-dataset
Contact: [email protected]
ManagedBy: "[Open Food Facts](https://world.openfoodfacts.org)"
UpdateFrequency: Monthly
License: All data contained in this dataset is licensed under the [Creative Commons Attribution ShareAlike licence](https://creativecommons.org/licenses/by-sa/3.0/deed.en)
Tags:
- machine learning
- image processing
Resources:
- Description: Open Food Facts image dataset
ARN: arn:aws:s3:::openfoodfacts-images
Region: eu-west-3
Type: S3 Bucket

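As a rough illustration of how a dataset consumer might use the registry entry above: the bucket name and region come from the YAML, and assuming the bucket accepts unsigned requests (as AWS Open Data buckets typically do), a few objects can be listed anonymously with boto3:

import boto3
from botocore import UNSIGNED
from botocore.client import Config

# Anonymous client for the public openfoodfacts-images bucket (eu-west-3)
s3 = boto3.client(
    "s3", region_name="eu-west-3", config=Config(signature_version=UNSIGNED)
)
# List a handful of objects under the data/ prefix
response = s3.list_objects_v2(
    Bucket="openfoodfacts-images", Prefix="data/", MaxKeys=5
)
for obj in response.get("Contents", []):
    print(obj["Key"])
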
215 changes: 215 additions & 0 deletions scripts/ovh3/sync-s3-images/sync_s3_images.py
@@ -0,0 +1,215 @@
# /// script
# dependencies = [
# "openfoodfacts==1.1.5",
# "orjson==3.10.7",
# "boto3==1.35.32",
# "tqdm==4.66.5",
# ]
# requires-python = ">=3.7"
# ///

"""This script is used to synchronize Open Food Facts images and OCR JSONs on
AWS S3. As part of AWS Open Dataset program, we can host free of charge data on
AWS S3.
This dataset can be used by researchers to access easily OFF data, without
overloading OFF servers.
This script should be run regularly, to synchronize new images. We currently
upload:
- all raw images (ex: 1.jpg, 2.jpg,...)
- 400px resized version of the raw images
- OCR results of the raw images (ex: 1.json.gz)
"""

import argparse
import gzip
import logging
import tempfile
from logging import getLogger
from pathlib import Path
from typing import Iterator, Optional, Tuple

import boto3
import tqdm
from openfoodfacts import ProductDataset
from openfoodfacts.images import split_barcode

logger = getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s :: %(processName)s :: "
"%(threadName)s :: %(levelname)s :: "
"%(message)s"
)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

s3 = boto3.resource("s3", region_name="eu-west-3")
bucket = s3.Bucket("openfoodfacts-images")



def generate_product_path(barcode: str) -> str:
if not barcode.isdigit():
raise ValueError("unknown barcode format: {}".format(barcode))

splitted_barcode = split_barcode(barcode)
return "/".join(splitted_barcode)


def get_sync_filepaths(
base_dir: Path, ds: ProductDataset
) -> Iterator[Tuple[str, Path]]:
"""Return an iterator containing files to synchronize with AWS S3 bucket.
The iterator returns (barcode, file_path) tuples, where `barcode` is the
product barcode, and `file_path` is the path of the file to synchronize.
    We use the product dataset to know which images are associated with each
    product; this way we don't push deleted images to S3.
We currently synchronize:
- all raw images (ex: 1.jpg, 2.jpg,...)
- 400px resized version of the raw images
- OCR results of the raw images (ex: 1.json.gz)
:param base_dir: directory where images are stored
:param ds: product dataset
"""
for item in tqdm.tqdm(ds, desc="products"):
barcode = item["code"]
if not barcode:
continue
product_path = generate_product_path(barcode)
product_dir = Path(product_path)
full_product_dir = base_dir / product_dir

for image_id in item.get("images", {}).keys():
if not image_id.isdigit():
                # Ignore selected-image keys (e.g. "front_fr"); only numeric
                # IDs correspond to raw images
continue

# Only synchronize raw and 400px version of images
for image_name in (
"{}.jpg".format(image_id),
"{}.400.jpg".format(image_id),
):
full_image_path = full_product_dir / image_name
if not full_image_path.is_file():
logger.warning("image {} not found".format(full_image_path))
continue
yield barcode, product_dir / image_name

# Synchronize OCR JSON if it exists
ocr_file_name = "{}.json.gz".format(image_id)
if (full_product_dir / ocr_file_name).is_file():
yield barcode, product_dir / ocr_file_name


def run(image_dir: Path, dataset_path: Optional[Path]) -> None:
"""Launch the synchronization.
:param image_dir: directory where images are stored
:param dataset_path: path to the JSONL dataset
"""
ds = ProductDataset(dataset_path=dataset_path)
logger.info("Fetching existing keys...")
existing_keys = set(obj.key for obj in bucket.objects.filter(Prefix="data/"))
logger.info("%d keys in openfoodfacts-images bucket", len(existing_keys))
dataset_keys = set()

uploaded = 0
kept = 0
deleted = 0
for barcode, file_path in get_sync_filepaths(image_dir, ds):
full_file_path = image_dir / file_path
key = "data/{}".format(file_path)
dataset_keys.add(key)

if key in existing_keys:
logger.debug("File %s already exists on S3", key)
kept += 1
continue

extra_args = {"Metadata": {"barcode": barcode}}
if key.endswith(".jpg"):
extra_args["ContentType"] = "image/jpeg"

logger.debug("Uploading file %s -> %s", full_file_path, key)
bucket.upload_file(str(full_file_path), key, ExtraArgs=extra_args)
uploaded += 1
existing_keys.add(key)

if (kept + uploaded) % 1000 == 0:
logger.info("uploaded: %d, kept: %d", uploaded, kept)

logger.info("Removing deleted files...")
    for missing_key in existing_keys - dataset_keys:
        # Remove files associated with deleted images
        logger.debug("Deleting S3 file %s", missing_key)
        deleted += 1
        bucket.delete_objects(
            Delete={
                "Objects": [
                    {"Key": missing_key},
                ],
            },
        )
        # Drop the key locally as well, so that the key listing uploaded
        # below reflects what is actually left on the bucket
        existing_keys.discard(missing_key)

# We upload all S3 keys in a single `data_keys.txt` text file
# to make it easier to know existing files on the bucket

# Create a temporary directory to avoid uploading a corrupted file
tmp_dir = Path(tempfile.mkdtemp())
data_keys_path = tmp_dir / "data_keys.txt"
logger.info("Saving data keys in %s", data_keys_path)

with gzip.open(str(data_keys_path), "wt") as f:
f.write("\n".join(sorted(existing_keys)))

logger.info("Uploading data keys...")
bucket.upload_file(str(data_keys_path), "data/data_keys.gz")
data_keys_path.unlink()
tmp_dir.rmdir()

logger.info(
"Synchronization finished, uploaded: %d, kept: %d, deleted: %d",
uploaded,
kept,
deleted,
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Synchronize Open Food Facts images and OCR files with AWS S3.
This script should be run regularly, to synchronize new images. We currently
upload:
- all raw images (ex: 1.jpg, 2.jpg,...)
- 400px resized version of the raw images
- OCR results of the raw images (ex: 1.json.gz)
Before upload, the latest version of the dataset is downloaded from Open Food
Facts servers to get the list of images to synchronize.
"""
)
parser.add_argument(
"image_dir",
type=Path,
help="Directory where images are stored.",
)
parser.add_argument(
"--dataset-path",
type=Path,
help="Directory where dataset is stored.",
)
args = parser.parse_args()
run(args.image_dir, args.dataset_path)

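As the comment in run() notes, the script also publishes a gzipped listing of every key under data/ as data/data_keys.gz. A minimal sketch of how a dataset consumer might read that listing, again assuming anonymous access to the bucket:

import gzip
import io

import boto3
from botocore import UNSIGNED
from botocore.client import Config

s3 = boto3.client(
    "s3", region_name="eu-west-3", config=Config(signature_version=UNSIGNED)
)
# Fetch the gzipped key listing into memory and decompress it
buffer = io.BytesIO()
s3.download_fileobj("openfoodfacts-images", "data/data_keys.gz", buffer)
keys = gzip.decompress(buffer.getvalue()).decode("utf-8").splitlines()
# One key per line, e.g. data/301/762/042/2003/1.jpg
print(len(keys), "objects listed, first entries:", keys[:3])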