From 42dd4fcdb1f52d55767f33136bc94a6590fa4fd8 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 29 Oct 2024 16:21:43 +0000
Subject: [PATCH] chore: move aws image sync script to ovh3

---
 .../systemd/system/sync_images_s3@.service    |  14 ++
 .../ovh3/systemd/system/sync_images_s3@.timer |  13 ++
 scripts/ovh3/sync-s3-images/README.md         |  18 ++
 .../sync-s3-images/openfoodfacts-images.yaml  |  17 ++
 scripts/ovh3/sync-s3-images/sync_s3_images.py | 230 ++++++++++++++++++
 5 files changed, 292 insertions(+)
 create mode 100644 confs/ovh3/systemd/system/sync_images_s3@.service
 create mode 100644 confs/ovh3/systemd/system/sync_images_s3@.timer
 create mode 100644 scripts/ovh3/sync-s3-images/README.md
 create mode 100644 scripts/ovh3/sync-s3-images/openfoodfacts-images.yaml
 create mode 100644 scripts/ovh3/sync-s3-images/sync_s3_images.py

diff --git a/confs/ovh3/systemd/system/sync_images_s3@.service b/confs/ovh3/systemd/system/sync_images_s3@.service
new file mode 100644
index 00000000..9044f872
--- /dev/null
+++ b/confs/ovh3/systemd/system/sync_images_s3@.service
@@ -0,0 +1,14 @@
+# service instance name "%i" is off only (for now)
+[Unit]
+Description=Synchronize images to AWS S3 %i
+# __ will be replaced by @ in email-failures@.service
+OnFailure=email-failures@sync_images_s3__%i.service
+
+[Service]
+Type=oneshot
+User=off
+Group=off
+# Warning: this script currently doesn't work with non-off product types
+ExecStart=/home/off/.cargo/bin/uv run /opt/openfoodfacts-infrastructure/scripts/ovh3/sync-s3-images/sync_s3_images.py /rpool/off/images/products
+KillMode=process
+
diff --git a/confs/ovh3/systemd/system/sync_images_s3@.timer b/confs/ovh3/systemd/system/sync_images_s3@.timer
new file mode 100644
index 00000000..aadb27a3
--- /dev/null
+++ b/confs/ovh3/systemd/system/sync_images_s3@.timer
@@ -0,0 +1,13 @@
+# service instance name "%i" is off only (for now)
+[Unit]
+Description=Synchronize images to AWS S3 weekly
+
+[Timer]
+# every Tuesday at 02:00
+OnCalendar=Tue *-*-* 02:00:00
+# service instance name "%i" will be off / obf / opff / opf
+Unit=sync_images_s3@%i.service
+
+[Install]
+WantedBy=multi-user.target
+
diff --git a/scripts/ovh3/sync-s3-images/README.md b/scripts/ovh3/sync-s3-images/README.md
new file mode 100644
index 00000000..de748e33
--- /dev/null
+++ b/scripts/ovh3/sync-s3-images/README.md
@@ -0,0 +1,18 @@
+# AWS Open Dataset: Open Food Facts images
+
+This directory contains the [script](./sync_s3_images.py) that synchronizes
+images and OCR results from a directory containing all images to the
+`openfoodfacts-images` bucket, as part of the AWS Open Data program.
+
+The dataset YAML description sent to the [AWS Open Data
+registry](https://github.com/awslabs/open-data-registry/tree/main) can be found
+at [openfoodfacts-images.yaml](./openfoodfacts-images.yaml).
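+
+## Running the sync
+
+The script is normally launched by the `sync_images_s3@off` systemd service; a
+manual run with the same arguments (paths as deployed on ovh3) looks like:
+
+```sh
+/home/off/.cargo/bin/uv run /opt/openfoodfacts-infrastructure/scripts/ovh3/sync-s3-images/sync_s3_images.py /rpool/off/images/products
+```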
diff --git a/scripts/ovh3/sync-s3-images/openfoodfacts-images.yaml b/scripts/ovh3/sync-s3-images/openfoodfacts-images.yaml
new file mode 100644
index 00000000..a5d4dc9d
--- /dev/null
+++ b/scripts/ovh3/sync-s3-images/openfoodfacts-images.yaml
@@ -0,0 +1,17 @@
+Name: Open Food Facts Images
+Description: A dataset of all images of Open Food Facts, the largest open
+  dataset of food products in the world.
+Documentation: https://openfoodfacts.github.io/openfoodfacts-server/api/aws-images-dataset
+Contact: contact@openfoodfacts.org
+ManagedBy: "[Open Food Facts](https://world.openfoodfacts.org)"
+UpdateFrequency: Monthly
+License: All data contained in this dataset is licensed under the [Creative Commons Attribution ShareAlike license](https://creativecommons.org/licenses/by-sa/3.0/deed.en)
+Tags:
+  - machine learning
+  - image processing
+Resources:
+  - Description: Open Food Facts image dataset
+    ARN: arn:aws:s3:::openfoodfacts-images
+    Region: eu-west-3
+    Type: S3 Bucket
+
diff --git a/scripts/ovh3/sync-s3-images/sync_s3_images.py b/scripts/ovh3/sync-s3-images/sync_s3_images.py
new file mode 100644
index 00000000..72021225
--- /dev/null
+++ b/scripts/ovh3/sync-s3-images/sync_s3_images.py
@@ -0,0 +1,230 @@
+# /// script
+# dependencies = [
+#     "openfoodfacts==1.1.5",
+#     "orjson==3.10.7",
+#     "boto3==1.35.32",
+#     "tqdm==4.66.5",
+# ]
+# requires-python = ">=3.7"
+# ///
+
+"""This script is used to synchronize Open Food Facts images and OCR JSONs on
+AWS S3. As part of the AWS Open Data program, we can host data on AWS S3 free
+of charge.
+
+This dataset can be used by researchers to easily access OFF data without
+overloading OFF servers.
+
+This script should be run regularly to synchronize new images. We currently
+upload:
+
+- all raw images (ex: 1.jpg, 2.jpg,...)
+- 400px resized versions of the raw images
+- OCR results of the raw images (ex: 1.json.gz)
+"""
+
+import argparse
+import gzip
+import logging
+import tempfile
+from logging import getLogger
+from pathlib import Path
+from typing import Iterator, Optional, Tuple
+
+import boto3
+import tqdm
+from openfoodfacts import ProductDataset
+from openfoodfacts.images import split_barcode
+
+logger = getLogger()
+handler = logging.StreamHandler()
+formatter = logging.Formatter(
+    "%(asctime)s :: %(processName)s :: "
+    "%(threadName)s :: %(levelname)s :: "
+    "%(message)s"
+)
+handler.setFormatter(formatter)
+handler.setLevel(logging.INFO)
+logger.addHandler(handler)
+logger.setLevel(logging.INFO)
+
+s3 = boto3.resource("s3", region_name="eu-west-3")
+bucket = s3.Bucket("openfoodfacts-images")
+
+
+def generate_product_path(barcode: str) -> str:
+    if not barcode.isdigit():
+        raise ValueError("unknown barcode format: {}".format(barcode))
+
+    barcode_parts = split_barcode(barcode)
+    return "/".join(barcode_parts)
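+
+# For example (a sketch; the exact splitting is delegated to
+# openfoodfacts.images.split_barcode):
+#   generate_product_path("3017620422003") -> "301/762/042/2003"
+# so the raw image "1.jpg" of this product ends up at
+# data/301/762/042/2003/1.jpg on the bucket.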
+
+
+def get_sync_filepaths(
+    base_dir: Path, ds: ProductDataset
+) -> Iterator[Tuple[str, Path]]:
+    """Return an iterator over the files to synchronize with the AWS S3 bucket.
+
+    The iterator yields (barcode, file_path) tuples, where `barcode` is the
+    product barcode, and `file_path` is the path of the file to synchronize.
+
+    We use the product dataset to know which images are associated with each
+    product; this way we don't push deleted images to S3.
+
+    We currently synchronize:
+
+    - all raw images (ex: 1.jpg, 2.jpg,...)
+    - 400px resized versions of the raw images
+    - OCR results of the raw images (ex: 1.json.gz)
+
+    :param base_dir: directory where images are stored
+    :param ds: product dataset
+    """
+    for item in tqdm.tqdm(ds, desc="products"):
+        barcode = item["code"]
+        if not barcode:
+            continue
+        product_path = generate_product_path(barcode)
+        product_dir = Path(product_path)
+        full_product_dir = base_dir / product_dir
+
+        for image_id in item.get("images", {}).keys():
+            if not image_id.isdigit():
+                # Ignore keys of selected images (e.g. front_fr)
+                continue
+
+            # Only synchronize the raw and 400px versions of images
+            for image_name in (
+                "{}.jpg".format(image_id),
+                "{}.400.jpg".format(image_id),
+            ):
+                full_image_path = full_product_dir / image_name
+                if not full_image_path.is_file():
+                    logger.warning("image {} not found".format(full_image_path))
+                    continue
+                yield barcode, product_dir / image_name
+
+            # Synchronize the OCR JSON if it exists
+            ocr_file_name = "{}.json.gz".format(image_id)
+            if (full_product_dir / ocr_file_name).is_file():
+                yield barcode, product_dir / ocr_file_name
+
+
+def run(image_dir: Path, dataset_path: Optional[Path]) -> None:
+    """Launch the synchronization.
+
+    :param image_dir: directory where images are stored
+    :param dataset_path: path to the JSONL dataset
+    """
+    ds = ProductDataset(dataset_path=dataset_path)
+    logger.info("Fetching existing keys...")
+    existing_keys = set(obj.key for obj in bucket.objects.filter(Prefix="data/"))
+    logger.info("%d keys in openfoodfacts-images bucket", len(existing_keys))
+    dataset_keys = set()
+
+    uploaded = 0
+    kept = 0
+    deleted = 0
+    for barcode, file_path in get_sync_filepaths(image_dir, ds):
+        full_file_path = image_dir / file_path
+        key = "data/{}".format(file_path)
+        dataset_keys.add(key)
+
+        if key in existing_keys:
+            logger.debug("File %s already exists on S3", key)
+            kept += 1
+            continue
+
+        extra_args = {"Metadata": {"barcode": barcode}}
+        if key.endswith(".jpg"):
+            extra_args["ContentType"] = "image/jpeg"
+
+        logger.debug("Uploading file %s -> %s", full_file_path, key)
+        bucket.upload_file(str(full_file_path), key, ExtraArgs=extra_args)
+        uploaded += 1
+        existing_keys.add(key)
+
+        if (kept + uploaded) % 1000 == 0:
+            logger.info("uploaded: %d, kept: %d", uploaded, kept)
+
+    logger.info("Removing deleted files...")
+    for missing_key in existing_keys - dataset_keys - {"data/data_keys.gz"}:
+        # Remove files associated with deleted images (but keep the key listing)
+        logger.debug("Deleting S3 file %s", missing_key)
+        deleted += 1
+        bucket.delete_objects(
+            Delete={
+                "Objects": [
+                    {"Key": missing_key},
+                ],
+            },
+        )
+
+    # We upload the list of all current S3 keys in a single gzipped text file
+    # to make it easier to know which files exist on the bucket
+
+    # Create a temporary directory to avoid uploading a corrupted file
+    tmp_dir = Path(tempfile.mkdtemp())
+    data_keys_path = tmp_dir / "data_keys.txt"
+    logger.info("Saving data keys in %s", data_keys_path)
+
+    with gzip.open(str(data_keys_path), "wt") as f:
+        # dataset_keys (not existing_keys) is the set of keys that remain on
+        # the bucket once the deleted files have been removed
+        f.write("\n".join(sorted(dataset_keys)))
+
+    logger.info("Uploading data keys...")
+    bucket.upload_file(str(data_keys_path), "data/data_keys.gz")
+    data_keys_path.unlink()
+    tmp_dir.rmdir()
+
+    logger.info(
+        "Synchronization finished, uploaded: %d, kept: %d, deleted: %d",
+        uploaded,
+        kept,
+        deleted,
+    )
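+
+
+# The uploaded key listing can be fetched anonymously, e.g. with the AWS CLI
+# (a sketch; the bucket is publicly readable as part of AWS Open Data):
+#   aws s3 cp --no-sign-request s3://openfoodfacts-images/data/data_keys.gz .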
+
+
+if __name__ == "__main__":
    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(