chore: move aws image sync script to ovh3
root committed on Oct 29, 2024
1 parent c674291, commit 42dd4fc
Showing 5 changed files with 268 additions and 0 deletions.

sync_images_s3@.service
@@ -0,0 +1,14 @@
# service instance name "%i" is off only (for now)
[Unit]
Description=Synchronize images to AWS S3 %i
# __ will be replaced by @ in email-failures@.service
OnFailure=email-failures@sync_images_s3__%i.service

[Service]
Type=oneshot
User=off
Group=off
# Warning: this script currently doesn't work with non-off product types
ExecStart=/home/off/.cargo/bin/uv run /opt/openfoodfacts-infrastructure/scripts/ovh3/sync-s3-images/sync_s3_images.py /rpool/off/images/products
KillMode=process

sync_images_s3@.timer
@@ -0,0 +1,13 @@
# service instance name "%i" is off only (for now)
[Unit]
Description=Synchronize images to AWS S3 weekly

[Timer]
# every Tuesday
OnCalendar=Tue *-*-* 02:00:00
# service instance name "%i" is off / obf / opff / opf
Unit=sync_images_s3@%i.service

[Install]
WantedBy=multi-user.target
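
Note: both unit files are systemd templates (hence the trailing "@" in the inferred file names above), so one instance is created per product type. Assuming those names, an off instance would be enabled with something like `systemctl enable --now sync_images_s3@off.timer`.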

README.md
@@ -0,0 +1,9 @@
# AWS Open Dataset: Open Food Facts images

This directory contains the [script](./sync_s3_images.py) that synchronizes
images and OCR results from a directory containing all images to the
`openfoodfacts-images` bucket, as part of the AWS Open Data program.

The dataset YAML description sent to the [AWS Open Data
registry](https://github.com/awslabs/open-data-registry/tree/main) can be found
at [openfoodfacts-images.yml](./openfoodfacts-images.yml).
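
For researchers, the bucket can be browsed without AWS credentials, assuming anonymous reads are enabled as is typical for Open Data registry buckets. A minimal sketch using boto3 (bucket name, region, and `data/` prefix taken from the files in this commit):

import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Anonymous (unsigned) client for the public Open Data bucket
s3 = boto3.client(
    "s3", region_name="eu-west-3", config=Config(signature_version=UNSIGNED)
)
resp = s3.list_objects_v2(Bucket="openfoodfacts-images", Prefix="data/", MaxKeys=5)
for obj in resp.get("Contents", []):
    print(obj["Key"])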

openfoodfacts-images.yml
@@ -0,0 +1,17 @@
Name: Open Food Facts Images
Description: A dataset of all images of Open Food Facts, the biggest open
  dataset of food products in the world.
Documentation: https://openfoodfacts.github.io/openfoodfacts-server/api/aws-images-dataset
Contact: contact@openfoodfacts.org
ManagedBy: "[Open Food Facts](https://world.openfoodfacts.org)"
UpdateFrequency: Monthly
License: All data contained in this dataset is licensed under the [Creative Commons Attribution ShareAlike licence](https://creativecommons.org/licenses/by-sa/3.0/deed.en)
Tags:
  - machine learning
  - image processing
Resources:
  - Description: Open Food Facts image dataset
    ARN: arn:aws:s3:::openfoodfacts-images
    Region: eu-west-3
    Type: S3 Bucket

sync_s3_images.py
@@ -0,0 +1,215 @@
# /// script
# dependencies = [
#     "openfoodfacts==1.1.5",
#     "orjson==3.10.7",
#     "boto3==1.35.32",
#     "tqdm==4.66.5",
# ]
# requires-python = ">=3.7"
# ///
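# (The block above is PEP 723 inline script metadata: "uv run" reads it and
# provisions an environment with these pinned dependencies before execution.)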

"""This script is used to synchronize Open Food Facts images and OCR JSONs
with AWS S3. As part of the AWS Open Data program, we can host data free of
charge on AWS S3.

This dataset can be used by researchers to easily access OFF data, without
overloading OFF servers.

This script should be run regularly to synchronize new images. We currently
upload:
- all raw images (ex: 1.jpg, 2.jpg,...)
- 400px resized versions of the raw images
- OCR results of the raw images (ex: 1.json.gz)
"""

import argparse
import gzip
import logging
import tempfile
from logging import getLogger
from pathlib import Path
from typing import Iterator, Optional, Tuple

import boto3
import tqdm
from openfoodfacts import ProductDataset
from openfoodfacts.images import split_barcode

logger = getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter(
    "%(asctime)s :: %(processName)s :: "
    "%(threadName)s :: %(levelname)s :: "
    "%(message)s"
)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

s3 = boto3.resource("s3", region_name="eu-west-3")
bucket = s3.Bucket("openfoodfacts-images")


def generate_product_path(barcode: str) -> str:
    if not barcode.isdigit():
        raise ValueError("unknown barcode format: {}".format(barcode))

    splitted_barcode = split_barcode(barcode)
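    # e.g. (assumed layout) "3017620422003" -> ["301", "762", "042", "2003"],
    # which is joined below into the "301/762/042/2003" directory path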
    return "/".join(splitted_barcode)


def get_sync_filepaths(
    base_dir: Path, ds: ProductDataset
) -> Iterator[Tuple[str, Path]]:
    """Return an iterator over the files to synchronize with the AWS S3 bucket.

    The iterator yields (barcode, file_path) tuples, where `barcode` is the
    product barcode, and `file_path` is the path of the file to synchronize.

    We use the product dataset to know the images associated with each
    product; this way we don't push deleted images to S3.

    We currently synchronize:
    - all raw images (ex: 1.jpg, 2.jpg,...)
    - 400px resized versions of the raw images
    - OCR results of the raw images (ex: 1.json.gz)

    :param base_dir: directory where images are stored
    :param ds: product dataset
    """
    for item in tqdm.tqdm(ds, desc="products"):
        barcode = item["code"]
        if not barcode:
            continue
        product_path = generate_product_path(barcode)
        product_dir = Path(product_path)
        full_product_dir = base_dir / product_dir

        for image_id in item.get("images", {}).keys():
            if not image_id.isdigit():
                # Ignore selected image keys
                continue

            # Only synchronize raw and 400px versions of images
            for image_name in (
                "{}.jpg".format(image_id),
                "{}.400.jpg".format(image_id),
            ):
                full_image_path = full_product_dir / image_name
                if not full_image_path.is_file():
                    logger.warning("image {} not found".format(full_image_path))
                    continue
                yield barcode, product_dir / image_name

            # Synchronize the OCR JSON if it exists
            ocr_file_name = "{}.json.gz".format(image_id)
            if (full_product_dir / ocr_file_name).is_file():
                yield barcode, product_dir / ocr_file_name


def run(image_dir: Path, dataset_path: Optional[Path]) -> None:
    """Launch the synchronization.

    :param image_dir: directory where images are stored
    :param dataset_path: path to the JSONL dataset
    """
    ds = ProductDataset(dataset_path=dataset_path)
    logger.info("Fetching existing keys...")
    existing_keys = set(obj.key for obj in bucket.objects.filter(Prefix="data/"))
    logger.info("%d keys in openfoodfacts-images bucket", len(existing_keys))
    dataset_keys = set()

    uploaded = 0
    kept = 0
    deleted = 0
    for barcode, file_path in get_sync_filepaths(image_dir, ds):
        full_file_path = image_dir / file_path
        key = "data/{}".format(file_path)
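        # e.g. data/301/762/042/2003/1.jpg (hypothetical example, following
        # the barcode-splitting scheme above)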
        dataset_keys.add(key)

        if key in existing_keys:
            logger.debug("File %s already exists on S3", key)
            kept += 1
            continue

        extra_args = {"Metadata": {"barcode": barcode}}
        if key.endswith(".jpg"):
            extra_args["ContentType"] = "image/jpeg"

        logger.debug("Uploading file %s -> %s", full_file_path, key)
        bucket.upload_file(str(full_file_path), key, ExtraArgs=extra_args)
        uploaded += 1
        existing_keys.add(key)

        if (kept + uploaded) % 1000 == 0:
            logger.info("uploaded: %d, kept: %d", uploaded, kept)

    logger.info("Removing deleted files...")
    for missing_key in existing_keys - dataset_keys:
        # Remove files associated with deleted images
        logger.debug("Deleting S3 file %s", missing_key)
        deleted += 1
        bucket.delete_objects(
            Delete={
                "Objects": [
                    {"Key": missing_key},
                ],
            },
        )
        # Drop the key so that the data_keys listing below only contains
        # files that still exist on the bucket
        existing_keys.discard(missing_key)

    # We upload all S3 keys in a single `data_keys.txt` text file
    # to make it easier to know the existing files on the bucket

    # Create a temporary directory to avoid uploading a corrupted file
    tmp_dir = Path(tempfile.mkdtemp())
    data_keys_path = tmp_dir / "data_keys.txt"
    logger.info("Saving data keys in %s", data_keys_path)

    with gzip.open(str(data_keys_path), "wt") as f:
        f.write("\n".join(sorted(existing_keys)))

    logger.info("Uploading data keys...")
    bucket.upload_file(str(data_keys_path), "data/data_keys.gz")
    data_keys_path.unlink()
    tmp_dir.rmdir()

    logger.info(
        "Synchronization finished, uploaded: %d, kept: %d, deleted: %d",
        uploaded,
        kept,
        deleted,
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Synchronize Open Food Facts images and OCR files with AWS S3.

This script should be run regularly to synchronize new images. We currently
upload:
- all raw images (ex: 1.jpg, 2.jpg,...)
- 400px resized versions of the raw images
- OCR results of the raw images (ex: 1.json.gz)

Before upload, the latest version of the dataset is downloaded from Open Food
Facts servers to get the list of images to synchronize.
"""
    )
    parser.add_argument(
        "image_dir",
        type=Path,
        help="Directory where images are stored.",
    )
    parser.add_argument(
        "--dataset-path",
        type=Path,
        help="Path to the JSONL dataset file.",
    )
    args = parser.parse_args()
    run(args.image_dir, args.dataset_path)
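
For reference, a manual run mirroring the ExecStart line in the service unit above would look like this (--dataset-path is optional; per the argparse description, the latest dataset is downloaded when it is omitted):

uv run sync_s3_images.py /rpool/off/images/products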