chore: move aws image sync script to ovh3
root committed Oct 29, 2024
1 parent c674291 commit 42dd4fc
Showing 5 changed files with 268 additions and 0 deletions.
14 changes: 14 additions & 0 deletions confs/ovh3/systemd/system/sync_images_s3@.service
@@ -0,0 +1,14 @@
# service instance name "%i" can only be "off" (for now)
[Unit]
Description=Synchronize images to AWS S3 %i
# __ will be replaced by @ in email-failures@.service
OnFailure=email-failures@sync_images_s3__%i.service
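# e.g. for instance "off", the failure handler becomes
# email-failures@sync_images_s3__off.service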

[Service]
Type=oneshot
User=off
Group=off
# Warning: this script currently only works with the "off" product type
ExecStart=/home/off/.cargo/bin/uv run /opt/openfoodfacts-infrastructure/scripts/ovh3/sync-s3-images/sync_s3_images.py /rpool/off/images/products
KillMode=process

13 changes: 13 additions & 0 deletions confs/ovh3/systemd/system/sync_images_s3@.timer
@@ -0,0 +1,13 @@
# service instance name "%i" can only be "off" (for now)
[Unit]
Description=Synchronize images to AWS S3 weekly

[Timer]
# every tuesday
OnCalendar=Tue *-*-* 02:00:00
# service instance name "%i" is off / obf / opff / opf
Unit=sync_images_s3@%i.service

[Install]
WantedBy=multi-user.target

9 changes: 9 additions & 0 deletions scripts/ovh3/sync-s3-images/README.md
@@ -0,0 +1,9 @@
# AWS Open Dataset: Open Food Facts images

This directory contains the [script](./sync_s3_images.py) that synchronizes
images and OCR results from a directory containing all product images to the
`openfoodfacts-images` bucket, as part of the AWS Open Data program.

The dataset YAML description sent to the [AWS Open Data
registry](https://github.com/awslabs/open-data-registry/tree/main) can be found
at [openfoodfacts-images.yaml](./openfoodfacts-images.yaml).
17 changes: 17 additions & 0 deletions scripts/ovh3/sync-s3-images/openfoodfacts-images.yaml
@@ -0,0 +1,17 @@
Name: Open Food Facts Images
Description: A dataset of all images of Open Food Facts, the biggest open
dataset of food products in the world.
Documentation: https://openfoodfacts.github.io/openfoodfacts-server/api/aws-images-dataset
Contact: [email protected]
ManagedBy: "[Open Food Facts](https://world.openfoodfacts.org)"
UpdateFrequency: Monthly
License: All data contained in this dataset is licensed under the [Creative Commons Attribution ShareAlike licence](https://creativecommons.org/licenses/by-sa/3.0/deed.en)
Tags:
- machine learning
- image processing
Resources:
- Description: Open Food Facts image dataset
ARN: arn:aws:s3:::openfoodfacts-images
Region: eu-west-3
Type: S3 Bucket

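As a rough illustration of how a dataset consumer might use the registry entry above: the bucket name and region come from the YAML, and assuming the bucket accepts unsigned requests (as AWS Open Data buckets typically do), a few objects can be listed anonymously with boto3:

import boto3
from botocore import UNSIGNED
from botocore.client import Config

# Anonymous client for the public openfoodfacts-images bucket (eu-west-3)
s3 = boto3.client(
    "s3", region_name="eu-west-3", config=Config(signature_version=UNSIGNED)
)
# List a handful of objects under the data/ prefix
response = s3.list_objects_v2(
    Bucket="openfoodfacts-images", Prefix="data/", MaxKeys=5
)
for obj in response.get("Contents", []):
    print(obj["Key"])
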
215 changes: 215 additions & 0 deletions scripts/ovh3/sync-s3-images/sync_s3_images.py
@@ -0,0 +1,215 @@
# /// script
# dependencies = [
# "openfoodfacts==1.1.5",
# "orjson==3.10.7",
# "boto3==1.35.32",
# "tqdm==4.66.5",
# ]
# requires-python = ">=3.7"
# ///

"""This script is used to synchronize Open Food Facts images and OCR JSONs on
AWS S3. As part of AWS Open Dataset program, we can host free of charge data on
AWS S3.
This dataset can be used by researchers to access easily OFF data, without
overloading OFF servers.
This script should be run regularly, to synchronize new images. We currently
upload:
- all raw images (ex: 1.jpg, 2.jpg,...)
- 400px resized version of the raw images
- OCR results of the raw images (ex: 1.json.gz)
"""

import argparse
import gzip
import logging
import tempfile
from logging import getLogger
from pathlib import Path
from typing import Iterator, Optional, Tuple

import boto3
import tqdm
from openfoodfacts import ProductDataset
from openfoodfacts.images import split_barcode

logger = getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s :: %(processName)s :: "
"%(threadName)s :: %(levelname)s :: "
"%(message)s"
)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

s3 = boto3.resource("s3", region_name="eu-west-3")
bucket = s3.Bucket("openfoodfacts-images")



def generate_product_path(barcode: str) -> str:
if not barcode.isdigit():
raise ValueError("unknown barcode format: {}".format(barcode))

splitted_barcode = split_barcode(barcode)
return "/".join(splitted_barcode)


def get_sync_filepaths(
base_dir: Path, ds: ProductDataset
) -> Iterator[Tuple[str, Path]]:
"""Return an iterator containing files to synchronize with AWS S3 bucket.
The iterator returns (barcode, file_path) tuples, where `barcode` is the
product barcode, and `file_path` is the path of the file to synchronize.
    We use the product dataset to know which images are associated with each
    product; this way we don't push deleted images to S3.
We currently synchronize:
- all raw images (ex: 1.jpg, 2.jpg,...)
- 400px resized version of the raw images
- OCR results of the raw images (ex: 1.json.gz)
:param base_dir: directory where images are stored
:param ds: product dataset
"""
for item in tqdm.tqdm(ds, desc="products"):
barcode = item["code"]
if not barcode:
continue
product_path = generate_product_path(barcode)
product_dir = Path(product_path)
full_product_dir = base_dir / product_dir

for image_id in item.get("images", {}).keys():
if not image_id.isdigit():
                # Ignore selected-image keys (e.g. "front_fr"); only numeric
                # IDs correspond to raw images
continue

# Only synchronize raw and 400px version of images
for image_name in (
"{}.jpg".format(image_id),
"{}.400.jpg".format(image_id),
):
full_image_path = full_product_dir / image_name
if not full_image_path.is_file():
logger.warning("image {} not found".format(full_image_path))
continue
yield barcode, product_dir / image_name

# Synchronize OCR JSON if it exists
ocr_file_name = "{}.json.gz".format(image_id)
if (full_product_dir / ocr_file_name).is_file():
yield barcode, product_dir / ocr_file_name


def run(image_dir: Path, dataset_path: Optional[Path]) -> None:
"""Launch the synchronization.
:param image_dir: directory where images are stored
:param dataset_path: path to the JSONL dataset
"""
ds = ProductDataset(dataset_path=dataset_path)
logger.info("Fetching existing keys...")
existing_keys = set(obj.key for obj in bucket.objects.filter(Prefix="data/"))
logger.info("%d keys in openfoodfacts-images bucket", len(existing_keys))
dataset_keys = set()

uploaded = 0
kept = 0
deleted = 0
for barcode, file_path in get_sync_filepaths(image_dir, ds):
full_file_path = image_dir / file_path
key = "data/{}".format(file_path)
dataset_keys.add(key)

if key in existing_keys:
logger.debug("File %s already exists on S3", key)
kept += 1
continue

extra_args = {"Metadata": {"barcode": barcode}}
if key.endswith(".jpg"):
extra_args["ContentType"] = "image/jpeg"

logger.debug("Uploading file %s -> %s", full_file_path, key)
bucket.upload_file(str(full_file_path), key, ExtraArgs=extra_args)
uploaded += 1
existing_keys.add(key)

if (kept + uploaded) % 1000 == 0:
logger.info("uploaded: %d, kept: %d", uploaded, kept)

logger.info("Removing deleted files...")
    for missing_key in existing_keys - dataset_keys:
        # Remove files associated with deleted images
        logger.debug("Deleting S3 file %s", missing_key)
        deleted += 1
        bucket.delete_objects(
            Delete={
                "Objects": [
                    {"Key": missing_key},
                ],
            },
        )
        # Drop the key locally as well, so that the key listing uploaded
        # below reflects what is actually left on the bucket
        existing_keys.discard(missing_key)

# We upload all S3 keys in a single `data_keys.txt` text file
# to make it easier to know existing files on the bucket

# Create a temporary directory to avoid uploading a corrupted file
tmp_dir = Path(tempfile.mkdtemp())
data_keys_path = tmp_dir / "data_keys.txt"
logger.info("Saving data keys in %s", data_keys_path)

with gzip.open(str(data_keys_path), "wt") as f:
f.write("\n".join(sorted(existing_keys)))

logger.info("Uploading data keys...")
bucket.upload_file(str(data_keys_path), "data/data_keys.gz")
data_keys_path.unlink()
tmp_dir.rmdir()

logger.info(
"Synchronization finished, uploaded: %d, kept: %d, deleted: %d",
uploaded,
kept,
deleted,
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Synchronize Open Food Facts images and OCR files with AWS S3.
This script should be run regularly, to synchronize new images. We currently
upload:
- all raw images (ex: 1.jpg, 2.jpg,...)
- 400px resized version of the raw images
- OCR results of the raw images (ex: 1.json.gz)
Before upload, the latest version of the dataset is downloaded from Open Food
Facts servers to get the list of images to synchronize.
"""
)
parser.add_argument(
"image_dir",
type=Path,
help="Directory where images are stored.",
)
parser.add_argument(
"--dataset-path",
type=Path,
help="Directory where dataset is stored.",
)
args = parser.parse_args()
run(args.image_dir, args.dataset_path)

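As the comment in run() notes, the script also publishes a gzipped listing of every key under data/ as data/data_keys.gz. A minimal sketch of how a dataset consumer might read that listing, again assuming anonymous access to the bucket:

import gzip
import io

import boto3
from botocore import UNSIGNED
from botocore.client import Config

s3 = boto3.client(
    "s3", region_name="eu-west-3", config=Config(signature_version=UNSIGNED)
)
# Fetch the gzipped key listing into memory and decompress it
buffer = io.BytesIO()
s3.download_fileobj("openfoodfacts-images", "data/data_keys.gz", buffer)
keys = gzip.decompress(buffer.getvalue()).decode("utf-8").splitlines()
# One key per line, e.g. data/301/762/042/2003/1.jpg
print(len(keys), "objects listed, first entries:", keys[:3])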