From 75ebf64de1dba87ac06b5de42afc227b144de2f2 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 21 Aug 2024 17:23:49 +0200 Subject: [PATCH 01/22] feat(Batch job - Spellcheck): :zap: --- batch/spellcheck/Dockerfile | 15 ++ batch/spellcheck/README.md | 40 ++++ batch/spellcheck/job.py | 150 +++++++++++++++ batch/spellcheck/requirements.txt | 2 + poetry.lock | 252 ++++++++++++++++++++++--- pyproject.toml | 3 +- robotoff/app/api.py | 25 ++- robotoff/batch/__init__.py | 11 ++ robotoff/batch/batch.py | 206 ++++++++++++++++++++ robotoff/batch/configs/spellcheck.yaml | 13 ++ robotoff/settings.py | 4 + robotoff/types.py | 40 +++- tests/unit/test_batch.py | 18 ++ 13 files changed, 747 insertions(+), 32 deletions(-) create mode 100644 batch/spellcheck/Dockerfile create mode 100644 batch/spellcheck/README.md create mode 100644 batch/spellcheck/job.py create mode 100644 batch/spellcheck/requirements.txt create mode 100644 robotoff/batch/__init__.py create mode 100644 robotoff/batch/batch.py create mode 100644 robotoff/batch/configs/spellcheck.yaml create mode 100644 tests/unit/test_batch.py diff --git a/batch/spellcheck/Dockerfile b/batch/spellcheck/Dockerfile new file mode 100644 index 0000000000..61b73b5b1b --- /dev/null +++ b/batch/spellcheck/Dockerfile @@ -0,0 +1,15 @@ +FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=on + +WORKDIR /app + +COPY job.py /app +COPY requirements.txt /app + +RUN pip install --no-cache-dir -r requirements.txt + +# Set the entrypoint to the batch job script +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/batch/spellcheck/README.md b/batch/spellcheck/README.md new file mode 100644 index 0000000000..975f63733e --- /dev/null +++ b/batch/spellcheck/README.md @@ -0,0 +1,40 @@ +# Google Batch job + +## Notes + +* Netherland (europe-west4) has GPUs (A100, L4) +* Check [CLOUD-LOGGING](https://console.cloud.google.com/logs/query;query=SEARCH%2528%22spellcheck%22%2529;cursorTimestamp=2024-08-14T11:21:32.485988660Z;duration=PT1H?referrer=search&project=robotoff) for logs +* Require deep learning image to run: [deep learning containers list](https://cloud.google.com/deep-learning-containers/docs/choosing-container#pytorch) +* Custom storage capacity to host the heavy docker image (~24GB) by adding BootDisk +* 1000 products processed: 1:30min (g2-instance-with 8) (overall batch job: 3:25min) + * L4: g2-instance-8 hourly cost: $0.896306 -> ~ 0.05$ to process batch of 1000 + * A100: a2-highgpu-1g: $3.748064 +* A100/Cuda doesn't support FP8 +* A100 has less availability than L4: need to wait for batch job (can be long) + +## Links + +* [GPU availability per region](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) +* [Batch job with GPU](https://cloud.google.com/batch/docs/create-run-job-gpus#create-job-gpu-examples) +* [VM Instance pricing](https://cloud.google.com/compute/vm-instance-pricing#vm-instance-pricing) +* [Trigger cloud function with bucket updates](https://cloud.google.com/functions/docs/calling/storage) +* [Python Google Batch](https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch) + +## Commands + +### List GPUs per region +```bash +gcloud compute accelerator-types list +``` + +### List deep learning images +```bash +gcloud compute images list \ +--project deeplearning-platform-release \ +--format="value(NAME)" \ +--no-standard-images +``` + +## Workflow / Orchestration + +* 
[Workflow](https://cloud.google.com/workflows/docs/overview)
diff --git a/batch/spellcheck/job.py b/batch/spellcheck/job.py
new file mode 100644
index 0000000000..6c629e4c56
--- /dev/null
+++ b/batch/spellcheck/job.py
@@ -0,0 +1,150 @@
+import argparse
+import tempfile
+import logging
+from typing import List
+
+import pandas as pd
+from vllm import LLM, SamplingParams
+from google.cloud import storage
+
+
+LOGGER = logging.getLogger(__name__)
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+
+FEATURES_VALIDATION = ["code", "text"]
+
+
+def parse() -> argparse.Namespace:
+    """Parse command line arguments.
+    """
+    parser = argparse.ArgumentParser(description="Spellcheck module.")
+    parser.add_argument("--data_bucket", type=str, default="robotoff-spellcheck", help="Bucket name.")
+    parser.add_argument("--pre_data_suffix", type=str, default="data/test_data.parquet", help="Dataset suffix containing the data to be processed.")
+    parser.add_argument("--post_data_suffix", type=str, default="data/test_processed_data.parquet", help="Dataset suffix containing the processed data.")
+    parser.add_argument("--model_path", default="openfoodfacts/spellcheck-mistral-7b", type=str, help="HF model path.")
+    parser.add_argument("--max_model_len", default=1024, type=int, help="Maximum model context length. A lower max context length reduces the memory footprint and accelerates inference.")
+    parser.add_argument("--temperature", default=0, type=float, help="Sampling temperature.")
+    parser.add_argument("--max_tokens", default=1024, type=int, help="Maximum number of tokens to generate.")
+    parser.add_argument("--quantization", default="fp8", type=str, help="Quantization type.")
+    parser.add_argument("--dtype", default="auto", type=str, help="Model weights precision. Default corresponds to the model config (float16 here).")
+    return parser.parse_args()
+
+
+def main():
+    """Batch processing job.
+
+    Original lists of ingredients are stored in a GCS bucket, then loaded and processed by the model.
+    The corrected lists of ingredients are stored back in GCS.
+
+    We use vLLM to process the batch optimally. The model is loaded from the Open Food Facts Hugging Face model repository.
+    """
+    LOGGER.info("Starting batch processing job.")
+    args = parse()
+
+    LOGGER.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}")
+    data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix)
+    LOGGER.info(f"Features in uploaded data: {data.columns}")
+    if not all(feature in data.columns for feature in FEATURES_VALIDATION):
+        raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. 
Current features: {data.columns}") + + instructions = [prepare_instruction(text) for text in data["text"]] + llm = LLM( + model=args.model_path, + max_model_len=args.max_model_len, + dtype=args.dtype, + quantization=args.quantization, + ) + sampling_params = SamplingParams( + temperature=args.temperature, + max_tokens=args.max_tokens + ) + + LOGGER.info(f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}") + data["correction"] = batch_inference(instructions, llm=llm, sampling_params=sampling_params) + + LOGGER.info(f"Uploading data to GCS: {args.data_bucket}/{args.post_data_suffix}") + # Save DataFrame as Parquet to a temporary file + with tempfile.NamedTemporaryFile(delete=True, suffix='.parquet') as temp_file: + data.to_parquet(temp_file.name) + temp_file_name = temp_file.name + upload_gcs( + temp_file_name, + bucket_name=args.data_bucket, + suffix=args.post_data_suffix + ) + LOGGER.info("Batch processing job completed.") + + +def prepare_instruction(text: str) -> str: + """Prepare instruction prompt for fine-tuning and inference. + + Args: + text (str): List of ingredients + + Returns: + str: Instruction. + """ + instruction = ( + "###Correct the list of ingredients:\n" + + text + + "\n\n###Correction:\n" + ) + return instruction + + +def batch_inference( + texts: List[str], + llm: LLM, + sampling_params: SamplingParams + ) -> List[str]: + """Process batch of texts with vLLM. + + Args: + texts (List[str]): Batch + llm (LLM): Model engine optimized with vLLM + sampling_params (SamplingParams): Generation parameters + + Returns: + List[str]: Processed batch of texts + """ + outputs = llm.generate(texts, sampling_params,) + corrections = [output.outputs[0].text for output in outputs] + return corrections + + +def load_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: + """Load data from Google Cloud Storage bucket. + + Args: + bucket_name (str): + suffix (str): Path inside the bucket + + Returns: + pd.DataFrame: Df from parquet file. + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + with blob.open("rb") as f: + df = pd.read_parquet(f) + return df + + +def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: + """Upload data to GCS. + + Args: + filepath (str): File path to export. + bucket_name (str): Bucket name. + suffix (str): Path inside the bucket. 
+ """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + blob.upload_from_filename(filename=file_path) + +if __name__ == "__main__": + main() diff --git a/batch/spellcheck/requirements.txt b/batch/spellcheck/requirements.txt new file mode 100644 index 0000000000..0ab3046f20 --- /dev/null +++ b/batch/spellcheck/requirements.txt @@ -0,0 +1,2 @@ +vllm==0.5.4 +google-cloud-storage==2.18.0 \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 2c370798fb..c6ddc1774b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -987,6 +987,87 @@ python-dateutil = ">=2.8.1" [package.extras] dev = ["flake8", "markdown", "twine", "wheel"] +[[package]] +name = "google-api-core" +version = "1.34.1" +description = "Google API client core library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-api-core-1.34.1.tar.gz", hash = "sha256:3399c92887a97d33038baa4bfd3bf07acc05d474b0171f333e1f641c1364e552"}, + {file = "google_api_core-1.34.1-py3-none-any.whl", hash = "sha256:52bcc9d9937735f8a3986fa0bbf9135ae9cf5393a722387e5eced520e39c774a"}, +] + +[package.dependencies] +google-auth = ">=1.25.0,<3.0dev" +googleapis-common-protos = ">=1.56.2,<2.0dev" +grpcio = {version = ">=1.33.2,<2.0dev", optional = true, markers = "extra == \"grpc\""} +grpcio-status = {version = ">=1.33.2,<2.0dev", optional = true, markers = "extra == \"grpc\""} +protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.0.0dev" +requests = ">=2.18.0,<3.0.0dev" + +[package.extras] +grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio-status (>=1.33.2,<2.0dev)"] +grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] +grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] + +[[package]] +name = "google-auth" +version = "2.34.0" +description = "Google Authentication Library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google_auth-2.34.0-py2.py3-none-any.whl", hash = "sha256:72fd4733b80b6d777dcde515628a9eb4a577339437012874ea286bca7261ee65"}, + {file = "google_auth-2.34.0.tar.gz", hash = "sha256:8eb87396435c19b20d32abd2f984e31c191a15284af72eb922f10e5bde9c04cc"}, +] + +[package.dependencies] +cachetools = ">=2.0.0,<6.0" +pyasn1-modules = ">=0.2.1" +rsa = ">=3.1.4,<5" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] +enterprise-cert = ["cryptography", "pyopenssl"] +pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] +reauth = ["pyu2f (>=0.1.5)"] +requests = ["requests (>=2.20.0,<3.0.0.dev0)"] + +[[package]] +name = "google-cloud-batch" +version = "0.17.26" +description = "Google Cloud Batch API client library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google_cloud_batch-0.17.26-py2.py3-none-any.whl", hash = "sha256:2cbed78f6fe612b540c08f92e01cca22fa66c38505b4a084d0c6e4da88dea335"}, + {file = "google_cloud_batch-0.17.26.tar.gz", hash = "sha256:9d86f703ed990d223c386883047c83a70ecab2378e1a686c8f67b113b00644cf"}, +] + +[package.dependencies] +google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]} +google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev" +proto-plus = ">=1.22.3,<2.0.0dev" +protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev" + +[[package]] +name = "googleapis-common-protos" +version = "1.63.2" +description = "Common protobufs used in Google APIs" +optional = false +python-versions = ">=3.7" +files = [ + {file = 
"googleapis-common-protos-1.63.2.tar.gz", hash = "sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87"}, + {file = "googleapis_common_protos-1.63.2-py2.py3-none-any.whl", hash = "sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945"}, +] + +[package.dependencies] +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" + +[package.extras] +grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] + [[package]] name = "grpcio" version = "1.65.1" @@ -1045,6 +1126,22 @@ files = [ [package.extras] protobuf = ["grpcio-tools (>=1.65.1)"] +[[package]] +name = "grpcio-status" +version = "1.48.2" +description = "Status proto mapping for gRPC" +optional = false +python-versions = ">=3.6" +files = [ + {file = "grpcio-status-1.48.2.tar.gz", hash = "sha256:53695f45da07437b7c344ee4ef60d370fd2850179f5a28bb26d8e2aa1102ec11"}, + {file = "grpcio_status-1.48.2-py3-none-any.whl", hash = "sha256:2c33bbdbe20188b2953f46f31af669263b6ee2a9b2d38fa0d36ee091532e21bf"}, +] + +[package.dependencies] +googleapis-common-protos = ">=1.5.5" +grpcio = ">=1.48.2" +protobuf = ">=3.12.0" + [[package]] name = "gunicorn" version = "22.0.0" @@ -2117,38 +2214,52 @@ pyyaml = ">=5.1" toml = "*" virtualenv = ">=20.0.8" +[[package]] +name = "proto-plus" +version = "1.24.0" +description = "Beautiful, Pythonic protocol buffers." +optional = false +python-versions = ">=3.7" +files = [ + {file = "proto-plus-1.24.0.tar.gz", hash = "sha256:30b72a5ecafe4406b0d339db35b56c4059064e69227b8c3bda7462397f966445"}, + {file = "proto_plus-1.24.0-py3-none-any.whl", hash = "sha256:402576830425e5f6ce4c2a6702400ac79897dab0b4343821aa5188b0fab81a12"}, +] + +[package.dependencies] +protobuf = ">=3.19.0,<6.0.0dev" + +[package.extras] +testing = ["google-api-core (>=1.31.5)"] + [[package]] name = "protobuf" -version = "3.19.6" +version = "3.20.3" description = "Protocol Buffers" optional = false -python-versions = ">=3.5" +python-versions = ">=3.7" files = [ - {file = "protobuf-3.19.6-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:010be24d5a44be7b0613750ab40bc8b8cedc796db468eae6c779b395f50d1fa1"}, - {file = "protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11478547958c2dfea921920617eb457bc26867b0d1aa065ab05f35080c5d9eb6"}, - {file = "protobuf-3.19.6-cp310-cp310-win32.whl", hash = "sha256:559670e006e3173308c9254d63facb2c03865818f22204037ab76f7a0ff70b5f"}, - {file = "protobuf-3.19.6-cp310-cp310-win_amd64.whl", hash = "sha256:347b393d4dd06fb93a77620781e11c058b3b0a5289262f094379ada2920a3730"}, - {file = "protobuf-3.19.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a8ce5ae0de28b51dff886fb922012dad885e66176663950cb2344c0439ecb473"}, - {file = "protobuf-3.19.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b0d02163c4e67279ddb6dc25e063db0130fc299aefabb5d481053509fae5c8"}, - {file = "protobuf-3.19.6-cp36-cp36m-win32.whl", hash = "sha256:30f5370d50295b246eaa0296533403961f7e64b03ea12265d6dfce3a391d8992"}, - {file = "protobuf-3.19.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0c0714b025ec057b5a7600cb66ce7c693815f897cfda6d6efb58201c472e3437"}, - {file = "protobuf-3.19.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5057c64052a1f1dd7d4450e9aac25af6bf36cfbfb3a1cd89d16393a036c49157"}, - {file = "protobuf-3.19.6-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:bb6776bd18f01ffe9920e78e03a8676530a5d6c5911934c6a1ac6eb78973ecb6"}, - {file = 
"protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a04134866861b11556a82dd91ea6daf1f4925746b992f277b84013a7cc1229"}, - {file = "protobuf-3.19.6-cp37-cp37m-win32.whl", hash = "sha256:4bc98de3cdccfb5cd769620d5785b92c662b6bfad03a202b83799b6ed3fa1fa7"}, - {file = "protobuf-3.19.6-cp37-cp37m-win_amd64.whl", hash = "sha256:aa3b82ca1f24ab5326dcf4ea00fcbda703e986b22f3d27541654f749564d778b"}, - {file = "protobuf-3.19.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2b2d2913bcda0e0ec9a784d194bc490f5dc3d9d71d322d070b11a0ade32ff6ba"}, - {file = "protobuf-3.19.6-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:d0b635cefebd7a8a0f92020562dead912f81f401af7e71f16bf9506ff3bdbb38"}, - {file = "protobuf-3.19.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a552af4dc34793803f4e735aabe97ffc45962dfd3a237bdde242bff5a3de684"}, - {file = "protobuf-3.19.6-cp38-cp38-win32.whl", hash = "sha256:0469bc66160180165e4e29de7f445e57a34ab68f49357392c5b2f54c656ab25e"}, - {file = "protobuf-3.19.6-cp38-cp38-win_amd64.whl", hash = "sha256:91d5f1e139ff92c37e0ff07f391101df77e55ebb97f46bbc1535298d72019462"}, - {file = "protobuf-3.19.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c0ccd3f940fe7f3b35a261b1dd1b4fc850c8fde9f74207015431f174be5976b3"}, - {file = "protobuf-3.19.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:30a15015d86b9c3b8d6bf78d5b8c7749f2512c29f168ca259c9d7727604d0e39"}, - {file = "protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:878b4cd080a21ddda6ac6d1e163403ec6eea2e206cf225982ae04567d39be7b0"}, - {file = "protobuf-3.19.6-cp39-cp39-win32.whl", hash = "sha256:5a0d7539a1b1fb7e76bf5faa0b44b30f812758e989e59c40f77a7dab320e79b9"}, - {file = "protobuf-3.19.6-cp39-cp39-win_amd64.whl", hash = "sha256:bbf5cea5048272e1c60d235c7bd12ce1b14b8a16e76917f371c718bd3005f045"}, - {file = "protobuf-3.19.6-py2.py3-none-any.whl", hash = "sha256:14082457dc02be946f60b15aad35e9f5c69e738f80ebbc0900a19bc83734a5a4"}, - {file = "protobuf-3.19.6.tar.gz", hash = "sha256:5f5540d57a43042389e87661c6eaa50f47c19c6176e8cf1c4f287aeefeccb5c4"}, + {file = "protobuf-3.20.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:f4bd856d702e5b0d96a00ec6b307b0f51c1982c2bf9c0052cf9019e9a544ba99"}, + {file = "protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9aae4406ea63d825636cc11ffb34ad3379335803216ee3a856787bcf5ccc751e"}, + {file = "protobuf-3.20.3-cp310-cp310-win32.whl", hash = "sha256:28545383d61f55b57cf4df63eebd9827754fd2dc25f80c5253f9184235db242c"}, + {file = "protobuf-3.20.3-cp310-cp310-win_amd64.whl", hash = "sha256:67a3598f0a2dcbc58d02dd1928544e7d88f764b47d4a286202913f0b2801c2e7"}, + {file = "protobuf-3.20.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:899dc660cd599d7352d6f10d83c95df430a38b410c1b66b407a6b29265d66469"}, + {file = "protobuf-3.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e64857f395505ebf3d2569935506ae0dfc4a15cb80dc25261176c784662cdcc4"}, + {file = "protobuf-3.20.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d9e4432ff660d67d775c66ac42a67cf2453c27cb4d738fc22cb53b5d84c135d4"}, + {file = "protobuf-3.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:74480f79a023f90dc6e18febbf7b8bac7508420f2006fabd512013c0c238f454"}, + {file = "protobuf-3.20.3-cp37-cp37m-win32.whl", hash = "sha256:b6cc7ba72a8850621bfec987cb72623e703b7fe2b9127a161ce61e61558ad905"}, + {file = "protobuf-3.20.3-cp37-cp37m-win_amd64.whl", hash 
= "sha256:8c0c984a1b8fef4086329ff8dd19ac77576b384079247c770f29cc8ce3afa06c"}, + {file = "protobuf-3.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:de78575669dddf6099a8a0f46a27e82a1783c557ccc38ee620ed8cc96d3be7d7"}, + {file = "protobuf-3.20.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f4c42102bc82a51108e449cbb32b19b180022941c727bac0cfd50170341f16ee"}, + {file = "protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:44246bab5dd4b7fbd3c0c80b6f16686808fab0e4aca819ade6e8d294a29c7050"}, + {file = "protobuf-3.20.3-cp38-cp38-win32.whl", hash = "sha256:c02ce36ec760252242a33967d51c289fd0e1c0e6e5cc9397e2279177716add86"}, + {file = "protobuf-3.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:447d43819997825d4e71bf5769d869b968ce96848b6479397e29fc24c4a5dfe9"}, + {file = "protobuf-3.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:398a9e0c3eaceb34ec1aee71894ca3299605fa8e761544934378bbc6c97de23b"}, + {file = "protobuf-3.20.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bf01b5720be110540be4286e791db73f84a2b721072a3711efff6c324cdf074b"}, + {file = "protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:daa564862dd0d39c00f8086f88700fdbe8bc717e993a21e90711acfed02f2402"}, + {file = "protobuf-3.20.3-cp39-cp39-win32.whl", hash = "sha256:819559cafa1a373b7096a482b504ae8a857c89593cf3a25af743ac9ecbd23480"}, + {file = "protobuf-3.20.3-cp39-cp39-win_amd64.whl", hash = "sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7"}, + {file = "protobuf-3.20.3-py2.py3-none-any.whl", hash = "sha256:a7ca6d488aa8ff7f329d4c545b2dbad8ac31464f1d8b1c87ad1346717731e4db"}, + {file = "protobuf-3.20.3.tar.gz", hash = "sha256:2e3427429c9cffebf259491be0af70189607f365c2f41c7c3764af6f337105f2"}, ] [[package]] @@ -2158,6 +2269,7 @@ description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false python-versions = ">=3.7" files = [ + {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"}, {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"}, {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"}, {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"}, @@ -2183,6 +2295,7 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = 
"psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2191,10 +2304,43 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"}, + {file = 
"psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"}, ] [[package]] @@ -2211,6 +2357,31 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "pyasn1" +version = "0.6.0" +description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyasn1-0.6.0-py2.py3-none-any.whl", hash = "sha256:cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473"}, + {file = "pyasn1-0.6.0.tar.gz", hash = "sha256:3a35ab2c4b5ef98e17dfdec8ab074046fbda76e281c5a706ccd82328cfc8f64c"}, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.0" +description = "A 
collection of ASN.1-based protocols modules" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyasn1_modules-0.4.0-py3-none-any.whl", hash = "sha256:be04f15b66c206eed667e0bb5ab27e2b1855ea54a842e5037738099e8ca4ae0b"}, + {file = "pyasn1_modules-0.4.0.tar.gz", hash = "sha256:831dbcea1b177b28c9baddf4c6d1013c24c3accd14a1873fffaa6a2e905f17b6"}, +] + +[package.dependencies] +pyasn1 = ">=0.4.6,<0.7.0" + [[package]] name = "pycodestyle" version = "2.8.0" @@ -2808,6 +2979,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -2815,8 +2987,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = 
"PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2833,6 +3013,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -2840,6 +3021,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -3047,6 +3229,20 @@ redis = "*" redis-sentinel-url = "*" rq = ">=1.0" +[[package]] +name = "rsa" +version = "4.9" +description = "Pure-Python RSA implementation" +optional = false +python-versions = ">=3.6,<4" +files = [ + {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, + {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, +] + +[package.dependencies] +pyasn1 = ">=0.1.3" + [[package]] name = "safetensors" version = "0.4.3" @@ -3908,4 +4104,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "f3a66804cfb8e94e6e7e8c149a9921e590942da2e03c526041906c5b473ca6fd" +content-hash = "080e1a8ef09819c49742c8270b5c2da81ff49469d77a8cd304567aeba79e0741" diff --git a/pyproject.toml b/pyproject.toml index 7f36bb121f..7b5d309f9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ 
jsonschema = "~4.4.0" orjson = ">=3.8.2,<3.10.0" Pillow = ">=9.3,<10.4" numpy = "~1.26.4" -protobuf = "~3.19.0" +protobuf = "^3.19.0" Pint = "0.22" APScheduler = "~3.10.1" more-itertools = "~8.9.0" @@ -78,6 +78,7 @@ openfoodfacts = "1.1.1" imagehash = "~4.3.1" peewee-migrate = "~1.12.2" diskcache = "~5.6.3" +google-cloud-batch = "^0.17.26" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index ea3531c0a9..64b23fa83e 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -86,6 +86,11 @@ from robotoff.utils.text import get_tag from robotoff.workers.queues import enqueue_job, get_high_queue, low_queue from robotoff.workers.tasks import download_product_dataset_job +from robotoff.batch import ( + BatchJobType, + GoogleBatchJob, + GoogleBatchJobConfig +) logger = get_logger() @@ -1748,6 +1753,24 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.status = falcon.HTTP_200 +class BatchJobResource: + def on_post(self, req: falcon.Request, resp: falcon.Response): + job_type_str: str = req.get_param("job_type", required=True) + + # Batch extraction + + # Launch Batch job + logger.info(f"Start batch with job_type: {job_type_str}") + try: + job_type = BatchJobType[job_type_str] + except KeyError: + raise falcon.HTTPBadRequest(description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}") + + batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) + batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) + resp.media = {"batch_job_details": batch_job} + + def custom_handle_uncaught_exception( req: falcon.Request, resp: falcon.Response, ex: Exception, params ): @@ -1785,7 +1808,7 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predict/nutrition", NutritionPredictorResource()) api.add_route("/api/v1/predict/ocr_prediction", OCRPredictionPredictorResource()) api.add_route("/api/v1/predict/category", CategoryPredictorResource()) -api.add_route("/api/v1/predict/ingredient_list", IngredientListPredictorResource()) +api.add_route("/api/v1/predict/ ", IngredientListPredictorResource()) api.add_route("/api/v1/predict/lang", LanguagePredictorResource()) api.add_route("/api/v1/predict/lang/product", ProductLanguagePredictorResource()) api.add_route("/api/v1/products/dataset", UpdateDatasetResource()) diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py new file mode 100644 index 0000000000..3febe071f0 --- /dev/null +++ b/robotoff/batch/__init__.py @@ -0,0 +1,11 @@ +from .batch import ( + GoogleBatchJob, + GoogleBatchJobConfig, + BatchJobType, +) + +__all__ = [ + "GoogleBatchJob", + "GoogleBatchJobConfig", + "BatchJobType", +] \ No newline at end of file diff --git a/robotoff/batch/batch.py b/robotoff/batch/batch.py new file mode 100644 index 0000000000..a21468aea6 --- /dev/null +++ b/robotoff/batch/batch.py @@ -0,0 +1,206 @@ +import abc +from typing import List, Optional +import enum +import yaml +import datetime + +from google.cloud import batch_v1 +from pydantic import BaseModel, Field + +from robotoff import settings + + +@enum.unique +class BatchJobType(enum.Enum): + """Each job type correspond to a task that will be executed in the batch job.""" + + ingredients_spellcheck = "ingredients_spellcheck" + + +# Paths batch job config files +BATCH_JOB_TYPE_TO_CONFIG_PATH = { + BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR + / "spellcheck.yaml", +} + + +class GoogleBatchJobConfig(BaseModel): + """Batch job 
configuration class.""" + + job_name: str = Field( + description="The name of the job. It needs to be unique amongst exisiting batch job names.", + ) + location: str = Field( + pattern=r"^europe-west\d{1,2}$", + description="The region in which the job will run. Regions that are available for Batch are listed on: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones. We restrict to Europe-West for now.", + ) + entrypoint: Optional[str] = Field( + default=None, + description="The entrypoint for the container. If None, use default entrypoint.", + examples=["python main.py"], + ) + commands: List[str] = Field( + default_factory=list, + description="Commands to run in the container. If None, use default commands. Can be used to add arguments to the job script.", + examples=[["--max_tokens", "1024"]], + ) + cpu_milli: int = Field( + default=1000, + description="The number of CPU milliseconds to allocate to the job. 1000 corresponds to 1 CPU core.", + ge=1000, + ) + memory_mib: int = Field( + default=8000, # 8GB + description="The amount of RAM in MiB to allocate to each CPU core.", + le=64000, + ) + boot_disk_mib: Optional[int] = Field( + default=None, + description="The size of the boot disk in MiB. It is deleted once the job finished. If None, no bootDisk is added.", + le=200000, # 200 GB + ) + max_retry_count: int = Field( + default=1, + ge=1, + description="The maximum number of times a task should be retried in case of failure.", + ) + max_run_duration: str = Field( + pattern=r"^\d{1,5}s$", + default="3600s", + description="The maximum duration of the job in seconds.", + ) + task_count: str = Field( + pattern=r"^\d+$", + default="1", + description="The number of tasks to run in the job.", + ) + parallelism: str = Field( + pattern=r"^\d+$", + default="1", + description="The number of tasks to run in parallel.", + ) + machine_type: str = Field( + description="The machine type to use for the job. Read more about machine types here: https://cloud.google.com/compute/docs/general-purpose-machines", + ) + accelerators_type: str = Field( + description="The type of accelerator to use for the job. Depends on the machine type. Read more about accelerators here: https://cloud.google.com/compute/docs/gpus", + ) + accelerators_count: int = Field( + ge=1, + description="The number of accelerators to use for the job.", + ) + install_gpu_drivers: bool = Field( + default=True, + description="Required if GPUs.", + ) + + @classmethod + def init(cls, job_type: BatchJobType): + """Initialize the class with the configuration file corresponding to the job type. + + :param job_type: Batch job type. + :type job_type: BatchJobType + """ + # Generate unique id for the job + unique_job_name = ( + job_type.name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + ) + + # Load config from job_type + config_path = BATCH_JOB_TYPE_TO_CONFIG_PATH[job_type] + with open(config_path, "r") as f: + config = yaml.safe_load(f) + return cls(job_name=unique_job_name, **config) + + +class BatchJob(abc.ABC): + """Abstract class to launch and manage batch jobs: Google, AWS, Azure, Triton...""" + + @staticmethod + @abc.abstractmethod + def launch_job() -> str: + """Launch batch job.""" + pass + + +class GoogleBatchJob(BatchJob): + """GCP Batch class. It uses the Google Cloud Batch API to launch and manage jobs. 
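+    A typical launch (sketch, mirroring the usage in the BatchJobResource API endpoint added in this change):
+
+        config = GoogleBatchJobConfig.init(job_type=BatchJobType.ingredients_spellcheck)
+        job = GoogleBatchJob.launch_job(batch_job_config=config)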
+ + More information on: + https://cloud.google.com/batch/docs/get-started + """ + + @staticmethod + def launch_job( + batch_job_config: GoogleBatchJobConfig, + ) -> batch_v1.Job: + """This method creates a Batch Job on GCP. + + Method copied from https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create + + :param google_batch_launch_config: Config to run a job on Google Batch. + :type google_batch_launch_config: GoogleBatchLaunchConfig + :param batch_job_config: Config to run a specific job on Google Batch. + :type batch_job_config: BatchJobConfig + :return: Batch job information. + :rtype: batch_v1.Job + + Returns: + A job object representing the job created. + """ + + client = batch_v1.BatchServiceClient() + + # Define what will be done as part of the job. + runnable = batch_v1.Runnable() + runnable.container = batch_v1.Runnable.Container() + runnable.container.image_uri = batch_job_config.container_image_uri + runnable.container.entrypoint = batch_job_config.entrypoint + runnable.container.commands = batch_job_config.commands + + # Jobs can be divided into tasks. In this case, we have only one task. + task = batch_v1.TaskSpec() + task.runnables = [runnable] + + # We can specify what resources are requested by each task. + resources = batch_v1.ComputeResource() + resources.cpu_milli = batch_job_config.cpu_milli + resources.memory_mib = batch_job_config.memory_mib + resources.boot_disk_mib = batch_job_config.boot_disk_mib + task.compute_resource = resources + + task.max_retry_count = batch_job_config.max_retry_count + task.max_run_duration = batch_job_config.max_run_duration + + # Tasks are grouped inside a job using TaskGroups. + group = batch_v1.TaskGroup() + group.task_count = batch_job_config.task_count + group.task_spec = task + + # Policies are used to define on what kind of virtual machines the tasks will run on. 
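+        # The machine type must be compatible with the requested accelerator (the spellcheck
+        # config pairs a g2-standard-8 VM with a single nvidia-l4 GPU), and install_gpu_drivers
+        # asks Batch to install the NVIDIA drivers on the VM before the container starts.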
+ policy = batch_v1.AllocationPolicy.InstancePolicy() + policy.machine_type = batch_job_config.machine_type + instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate() + instances.install_gpu_drivers = batch_job_config.install_gpu_drivers + instances.policy = policy + allocation_policy = batch_v1.AllocationPolicy() + allocation_policy.instances = [instances] + + accelerator = batch_v1.AllocationPolicy.Accelerator() + accelerator.type_ = batch_job_config.accelerators_type + accelerator.count = batch_job_config.accelerators_count + + job = batch_v1.Job() + job.task_groups = [group] + job.allocation_policy = allocation_policy + # We use Cloud Logging as it's an out of the box available option + job.logs_policy = batch_v1.LogsPolicy() + job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING + + create_request = batch_v1.CreateJobRequest() + create_request.job = job + create_request.job_id = batch_job_config.job_name + # The job's parent is the region in which the job will run + create_request.parent = f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" + + return client.create_job(create_request) diff --git a/robotoff/batch/configs/spellcheck.yaml b/robotoff/batch/configs/spellcheck.yaml new file mode 100644 index 0000000000..18562f6f09 --- /dev/null +++ b/robotoff/batch/configs/spellcheck.yaml @@ -0,0 +1,13 @@ +container_image_uri: "europe-west9-docker.pkg.dev/robotoff/gcf-artifacts/spellcheck-batch-vllm" +cpu_milli: 1000 +memory_mib: 32000 +boot_disk_mib: 100000 +max_retry_count: 1 +max_run_duration: "3600s" +task_count: "1" +parallelism: "1" +machine_type: "g2-standard-8" +accelerators_type: "nvidia-l4" +accelerators_count: "1" +install_gpu_drivers: true +location: "europe-west4" diff --git a/robotoff/settings.py b/robotoff/settings.py index 20a6ad5bd8..4db6f20126 100644 --- a/robotoff/settings.py +++ b/robotoff/settings.py @@ -133,6 +133,7 @@ def event_api() -> str: JSONL_DATASET_ETAG_PATH = DATASET_DIR / "products-etag.txt" JSONL_MIN_DATASET_PATH = DATASET_DIR / "products-min.jsonl.gz" DATASET_CHECK_MIN_PRODUCT_COUNT = 2_800_000 +BATCH_JOB_CONFIG_DIR = PROJECT_DIR / "robotoff/batch/configs" # Products JSONL @@ -356,3 +357,6 @@ def get_package_version() -> str: # Domains allowed to be used as image sources while cropping CROP_ALLOWED_DOMAINS = os.environ.get("CROP_ALLOWED_DOMAINS", "").split(",") + +# Batch jobs +GOOGLE_PROJECT_NAME= "robotoff" \ No newline at end of file diff --git a/robotoff/types.py b/robotoff/types.py index 99db6dce6f..8105d2030a 100644 --- a/robotoff/types.py +++ b/robotoff/types.py @@ -74,8 +74,7 @@ class InsightType(str, enum.Enum): """InsightType defines the type of the insight.""" # The 'ingredient spellcheck' insight corrects the spelling in the given - # ingredients list. NOTE: this insight is deprecated until a new spellcheck - # method is developed + # ingredients list. 
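+    # Corrections can now be generated in bulk by the spellcheck batch job (see robotoff/batch).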
ingredient_spellcheck = "ingredient_spellcheck" # The 'packager code' insight extracts the packager code using regex from @@ -359,3 +358,40 @@ class PackagingElementProperty(enum.Enum): LogoLabelType = tuple[str, Optional[str]] InsightAnnotation = Literal[-1, 0, 1, 2] + + + + + +@enum.unique +class Lang(str, enum.Enum): + english = "en" + french = "fr" + german = "de" + spanish = "es" + italian = "it" + portuguese = "pt" + dutch = "nl" + polish = "pl" + russian = "ru" + japanese = "ja" + chinese = "zh" + arabic = "ar" + turkish = "tr" + vietnamese = "vi" + thai = "th" + korean = "ko" + ukrainian = "uk" + indonesian = "id" + hungarian = "hu" + greek = "el" + romanian = "ro" + danish = "da" + swedish = "sv" + norwegian = "no" + finnish = "fi" + bulgarian = "bg" + czech = "cs" + slovak = "sk" + croatian = "hr" + \ No newline at end of file diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py new file mode 100644 index 0000000000..4501ad1277 --- /dev/null +++ b/tests/unit/test_batch.py @@ -0,0 +1,18 @@ +import pytest + +from robotoff.batch import ( + GoogleBatchJobConfig, + BatchJobType, +) + +# Add future job types here for testing. +@pytest.mark.parametrize( + "job_type_str", + [ + "ingredients_spellcheck", + ], +) +def test_batch_job_config_file(job_type_str): + "Test indirectly the batch job config file by validating with the Pydantic class model." + job_type = BatchJobType[job_type_str] + GoogleBatchJobConfig.init(job_type) From eb15bab345d1489cf02a261e66d7f1cc0543d5ed Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 21 Aug 2024 18:14:39 +0200 Subject: [PATCH 02/22] fix(batch-spellcheck): :lipstick: Fix Spellcheck Batch job file name for Dockerfile ENTRYPOINT --- batch/spellcheck/{job.py => main.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename batch/spellcheck/{job.py => main.py} (100%) diff --git a/batch/spellcheck/job.py b/batch/spellcheck/main.py similarity index 100% rename from batch/spellcheck/job.py rename to batch/spellcheck/main.py From d36648b5a9d1227f16ba08525290aa6d45eb9188 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Thu, 22 Aug 2024 19:51:58 +0200 Subject: [PATCH 03/22] feat(batch-spellcheck): :zap: Batch extraction from database before Batch processing operational --- batch/spellcheck/Dockerfile | 2 +- batch/spellcheck/main.py | 8 +- poetry.lock | 195 +++++++++++++++++- pyproject.toml | 2 + robotoff/app/api.py | 28 ++- robotoff/batch/__init__.py | 14 +- robotoff/batch/buckets.py | 71 +++++++ .../configs/{ => job_configs}/spellcheck.yaml | 0 robotoff/batch/configs/sql/spellcheck.sql | 7 + robotoff/batch/extraction.py | 81 ++++++++ robotoff/batch/{batch.py => launch.py} | 3 +- robotoff/utils/buckets.py | 41 ++++ tests/unit/data/dataset_sample.jsonl.gz | 3 + tests/unit/test_batch.py | 31 ++- 14 files changed, 461 insertions(+), 25 deletions(-) create mode 100644 robotoff/batch/buckets.py rename robotoff/batch/configs/{ => job_configs}/spellcheck.yaml (100%) create mode 100644 robotoff/batch/configs/sql/spellcheck.sql create mode 100644 robotoff/batch/extraction.py rename robotoff/batch/{batch.py => launch.py} (99%) create mode 100644 robotoff/utils/buckets.py create mode 100644 tests/unit/data/dataset_sample.jsonl.gz diff --git a/batch/spellcheck/Dockerfile b/batch/spellcheck/Dockerfile index 61b73b5b1b..0c9f31dad7 100644 --- a/batch/spellcheck/Dockerfile +++ b/batch/spellcheck/Dockerfile @@ -6,7 +6,7 @@ ENV PYTHONUNBUFFERED=1 \ WORKDIR /app -COPY job.py /app +COPY main.py /app COPY requirements.txt /app RUN pip install --no-cache-dir -r 
requirements.txt diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 6c629e4c56..3b4d0339a9 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -1,6 +1,7 @@ import argparse import tempfile import logging +import sys from typing import List import pandas as pd @@ -12,6 +13,7 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler(sys.stdout)], ) FEATURES_VALIDATION = ["code", "text"] @@ -22,8 +24,8 @@ def parse() -> argparse.Namespace: """ parser = argparse.ArgumentParser(description="Spellcheck module.") parser.add_argument("--data_bucket", type=str, default="robotoff-spellcheck", help="Bucket name.") - parser.add_argument("--pre_data_suffix", type=str, default="data/test_data.parquet", help="Dataset suffix containing the data to be processed.") - parser.add_argument("--post_data_suffix", type=str, default="data/test_processed_data.parquet", help="Dataset suffix containing the processed data.") + parser.add_argument("--pre_data_suffix", type=str, default="data/preprocessed_data.parquet", help="Dataset suffix containing the data to be processed.") + parser.add_argument("--post_data_suffix", type=str, default="data/postprocessed_data.parquet", help="Dataset suffix containing the processed data.") parser.add_argument("--model_path", default="openfoodfacts/spellcheck-mistral-7b", type=str, help="HF model path.") parser.add_argument("--max_model_len", default=1024, type=int, help="Maximum model context length. A lower max context length reduces the memory footprint and accelerate the inference.") parser.add_argument("--temperature", default=0, type=float, help="Sampling temperature.") @@ -47,7 +49,7 @@ def main(): LOGGER.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}") data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix) LOGGER.info(f"Feature in uploaded data: {data.columns}") - if not all(feature in data.columns for feature in FEATURES_VALIDATION): + if not all(feature in FEATURES_VALIDATION for feature in data.columns): raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. 
Current features: {data.columns}") instructions = [prepare_instruction(text) for text in data["text"]] diff --git a/poetry.lock b/poetry.lock index c6ddc1774b..5dad07cdf6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -654,6 +654,61 @@ typing-extensions = ">=3.7.4.1" all = ["pytz (>=2019.1)"] dates = ["pytz (>=2019.1)"] +[[package]] +name = "duckdb" +version = "1.0.0" +description = "DuckDB in-process database" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4a8ce2d1f9e1c23b9bab3ae4ca7997e9822e21563ff8f646992663f66d050211"}, + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:19797670f20f430196e48d25d082a264b66150c264c1e8eae8e22c64c2c5f3f5"}, + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:b71c342090fe117b35d866a91ad6bffce61cd6ff3e0cff4003f93fc1506da0d8"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25dd69f44ad212c35ae2ea736b0e643ea2b70f204b8dff483af1491b0e2a4cec"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8da5f293ecb4f99daa9a9352c5fd1312a6ab02b464653a0c3a25ab7065c45d4d"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3207936da9967ddbb60644ec291eb934d5819b08169bc35d08b2dedbe7068c60"}, + {file = "duckdb-1.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1128d6c9c33e883b1f5df6b57c1eb46b7ab1baf2650912d77ee769aaa05111f9"}, + {file = "duckdb-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:02310d263474d0ac238646677feff47190ffb82544c018b2ff732a4cb462c6ef"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:75586791ab2702719c284157b65ecefe12d0cca9041da474391896ddd9aa71a4"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:83bb415fc7994e641344f3489e40430ce083b78963cb1057bf714ac3a58da3ba"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:bee2e0b415074e84c5a2cefd91f6b5ebeb4283e7196ba4ef65175a7cef298b57"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa5a4110d2a499312609544ad0be61e85a5cdad90e5b6d75ad16b300bf075b90"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fa389e6a382d4707b5f3d1bc2087895925ebb92b77e9fe3bfb23c9b98372fdc"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7ede6f5277dd851f1a4586b0c78dc93f6c26da45e12b23ee0e88c76519cbdbe0"}, + {file = "duckdb-1.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0b88cdbc0d5c3e3d7545a341784dc6cafd90fc035f17b2f04bf1e870c68456e5"}, + {file = "duckdb-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd1693cdd15375156f7fff4745debc14e5c54928589f67b87fb8eace9880c370"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c65a7fe8a8ce21b985356ee3ec0c3d3b3b2234e288e64b4cfb03356dbe6e5583"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:e5a8eda554379b3a43b07bad00968acc14dd3e518c9fbe8f128b484cf95e3d16"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:a1b6acdd54c4a7b43bd7cb584975a1b2ff88ea1a31607a2b734b17960e7d3088"}, + {file = "duckdb-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a677bb1b6a8e7cab4a19874249d8144296e6e39dae38fce66a80f26d15e670df"}, + {file = 
"duckdb-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:752e9d412b0a2871bf615a2ede54be494c6dc289d076974eefbf3af28129c759"}, + {file = "duckdb-1.0.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3aadb99d098c5e32d00dc09421bc63a47134a6a0de9d7cd6abf21780b678663c"}, + {file = "duckdb-1.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83b7091d4da3e9301c4f9378833f5ffe934fb1ad2b387b439ee067b2c10c8bb0"}, + {file = "duckdb-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:6a8058d0148b544694cb5ea331db44f6c2a00a7b03776cc4dd1470735c3d5ff7"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e40cb20e5ee19d44bc66ec99969af791702a049079dc5f248c33b1c56af055f4"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7bce1bc0de9af9f47328e24e6e7e39da30093179b1c031897c042dd94a59c8e"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8355507f7a04bc0a3666958f4414a58e06141d603e91c0fa5a7c50e49867fb6d"}, + {file = "duckdb-1.0.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:39f1a46f5a45ad2886dc9b02ce5b484f437f90de66c327f86606d9ba4479d475"}, + {file = "duckdb-1.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a6d29ba477b27ae41676b62c8fae8d04ee7cbe458127a44f6049888231ca58fa"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:1bea713c1925918714328da76e79a1f7651b2b503511498ccf5e007a7e67d49e"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_universal2.whl", hash = "sha256:bfe67f3bcf181edbf6f918b8c963eb060e6aa26697d86590da4edc5707205450"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:dbc6093a75242f002be1d96a6ace3fdf1d002c813e67baff52112e899de9292f"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba1881a2b11c507cee18f8fd9ef10100be066fddaa2c20fba1f9a664245cd6d8"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:445d0bb35087c522705c724a75f9f1c13f1eb017305b694d2686218d653c8142"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:224553432e84432ffb9684f33206572477049b371ce68cc313a01e214f2fbdda"}, + {file = "duckdb-1.0.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d3914032e47c4e76636ad986d466b63fdea65e37be8a6dfc484ed3f462c4fde4"}, + {file = "duckdb-1.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:af9128a2eb7e1bb50cd2c2020d825fb2946fdad0a2558920cd5411d998999334"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:dd2659a5dbc0df0de68f617a605bf12fe4da85ba24f67c08730984a0892087e8"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_universal2.whl", hash = "sha256:ac5a4afb0bc20725e734e0b2c17e99a274de4801aff0d4e765d276b99dad6d90"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:2c5a53bee3668d6e84c0536164589d5127b23d298e4c443d83f55e4150fafe61"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b980713244d7708b25ee0a73de0c65f0e5521c47a0e907f5e1b933d79d972ef6"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21cbd4f9fe7b7a56eff96c3f4d6778770dd370469ca2212eddbae5dd63749db5"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed228167c5d49888c5ef36f6f9cbf65011c2daf9dcb53ea8aa7a041ce567b3e4"}, + {file = 
"duckdb-1.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:46d8395fbcea7231fd5032a250b673cc99352fef349b718a23dea2c0dd2b8dec"}, + {file = "duckdb-1.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:6ad1fc1a4d57e7616944166a5f9417bdbca1ea65c490797e3786e3a42e162d8a"}, + {file = "duckdb-1.0.0.tar.gz", hash = "sha256:a2a059b77bc7d5b76ae9d88e267372deff19c291048d59450c431e166233d453"}, +] + [[package]] name = "elastic-transport" version = "8.13.1" @@ -1051,6 +1106,144 @@ google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev" proto-plus = ">=1.22.3,<2.0.0dev" protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev" +[[package]] +name = "google-cloud-core" +version = "2.4.1" +description = "Google Cloud API client core library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-cloud-core-2.4.1.tar.gz", hash = "sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073"}, + {file = "google_cloud_core-2.4.1-py2.py3-none-any.whl", hash = "sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61"}, +] + +[package.dependencies] +google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-auth = ">=1.25.0,<3.0dev" + +[package.extras] +grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] + +[[package]] +name = "google-cloud-storage" +version = "2.14.0" +description = "Google Cloud Storage API client library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-cloud-storage-2.14.0.tar.gz", hash = "sha256:2d23fcf59b55e7b45336729c148bb1c464468c69d5efbaee30f7201dd90eb97e"}, + {file = "google_cloud_storage-2.14.0-py2.py3-none-any.whl", hash = "sha256:8641243bbf2a2042c16a6399551fbb13f062cbc9a2de38d6c0bb5426962e9dbd"}, +] + +[package.dependencies] +google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-auth = ">=2.23.3,<3.0dev" +google-cloud-core = ">=2.3.0,<3.0dev" +google-crc32c = ">=1.0,<2.0dev" +google-resumable-media = ">=2.6.0" +requests = ">=2.18.0,<3.0.0dev" + +[package.extras] +protobuf = ["protobuf (<5.0.0dev)"] + +[[package]] +name = "google-crc32c" +version = "1.5.0" +description = "A python wrapper of the C library 'Google CRC32C'" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-crc32c-1.5.0.tar.gz", hash = "sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7"}, + {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13"}, + {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:be82c3c8cfb15b30f36768797a640e800513793d6ae1724aaaafe5bf86f8f346"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:461665ff58895f508e2866824a47bdee72497b091c730071f2b7575d5762ab65"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2096eddb4e7c7bdae4bd69ad364e55e07b8316653234a56552d9c988bd2d61b"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:116a7c3c616dd14a3de8c64a965828b197e5f2d121fedd2f8c5585c547e87b02"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5829b792bf5822fd0a6f6eb34c5f81dd074f01d570ed7f36aa101d6fc7a0a6e4"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:64e52e2b3970bd891309c113b54cf0e4384762c934d5ae56e283f9a0afcd953e"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:02ebb8bf46c13e36998aeaad1de9b48f4caf545e91d14041270d9dca767b780c"}, + {file = "google_crc32c-1.5.0-cp310-cp310-win32.whl", hash = "sha256:2e920d506ec85eb4ba50cd4228c2bec05642894d4c73c59b3a2fe20346bd00ee"}, + {file = "google_crc32c-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:07eb3c611ce363c51a933bf6bd7f8e3878a51d124acfc89452a75120bc436289"}, + {file = "google_crc32c-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cae0274952c079886567f3f4f685bcaf5708f0a23a5f5216fdab71f81a6c0273"}, + {file = "google_crc32c-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1034d91442ead5a95b5aaef90dbfaca8633b0247d1e41621d1e9f9db88c36298"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c42c70cd1d362284289c6273adda4c6af8039a8ae12dc451dcd61cdabb8ab57"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8485b340a6a9e76c62a7dce3c98e5f102c9219f4cfbf896a00cf48caf078d438"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77e2fd3057c9d78e225fa0a2160f96b64a824de17840351b26825b0848022906"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f583edb943cf2e09c60441b910d6a20b4d9d626c75a36c8fcac01a6c96c01183"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:a1fd716e7a01f8e717490fbe2e431d2905ab8aa598b9b12f8d10abebb36b04dd"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:72218785ce41b9cfd2fc1d6a017dc1ff7acfc4c17d01053265c41a2c0cc39b8c"}, + {file = "google_crc32c-1.5.0-cp311-cp311-win32.whl", hash = "sha256:66741ef4ee08ea0b2cc3c86916ab66b6aef03768525627fd6a1b34968b4e3709"}, + {file = "google_crc32c-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:ba1eb1843304b1e5537e1fca632fa894d6f6deca8d6389636ee5b4797affb968"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:98cb4d057f285bd80d8778ebc4fde6b4d509ac3f331758fb1528b733215443ae"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19e0a019d2c4dcc5e598cd4a4bc7b008546b0358bd322537c74ad47a5386884f"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02c65b9817512edc6a4ae7c7e987fea799d2e0ee40c53ec573a692bee24de876"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6ac08d24c1f16bd2bf5eca8eaf8304812f44af5cfe5062006ec676e7e1d50afc"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3359fc442a743e870f4588fcf5dcbc1bf929df1fad8fb9905cd94e5edb02e84c"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e986b206dae4476f41bcec1faa057851f3889503a70e1bdb2378d406223994a"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:de06adc872bcd8c2a4e0dc51250e9e65ef2ca91be023b9d13ebd67c2ba552e1e"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-win32.whl", hash = "sha256:d3515f198eaa2f0ed49f8819d5732d70698c3fa37384146079b3799b97667a94"}, + {file = 
"google_crc32c-1.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:67b741654b851abafb7bc625b6d1cdd520a379074e64b6a128e3b688c3c04740"}, + {file = "google_crc32c-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c02ec1c5856179f171e032a31d6f8bf84e5a75c45c33b2e20a3de353b266ebd8"}, + {file = "google_crc32c-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:edfedb64740750e1a3b16152620220f51d58ff1b4abceb339ca92e934775c27a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84e6e8cd997930fc66d5bb4fde61e2b62ba19d62b7abd7a69920406f9ecca946"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:024894d9d3cfbc5943f8f230e23950cd4906b2fe004c72e29b209420a1e6b05a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:998679bf62b7fb599d2878aa3ed06b9ce688b8974893e7223c60db155f26bd8d"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:83c681c526a3439b5cf94f7420471705bbf96262f49a6fe546a6db5f687a3d4a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4c6fdd4fccbec90cc8a01fc00773fcd5fa28db683c116ee3cb35cd5da9ef6c37"}, + {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5ae44e10a8e3407dbe138984f21e536583f2bba1be9491239f942c2464ac0894"}, + {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37933ec6e693e51a5b07505bd05de57eee12f3e8c32b07da7e73669398e6630a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-win32.whl", hash = "sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4"}, + {file = "google_crc32c-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:74dea7751d98034887dbd821b7aae3e1d36eda111d6ca36c206c44478035709c"}, + {file = "google_crc32c-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c6c777a480337ac14f38564ac88ae82d4cd238bf293f0a22295b66eb89ffced7"}, + {file = "google_crc32c-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:759ce4851a4bb15ecabae28f4d2e18983c244eddd767f560165563bf9aefbc8d"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f13cae8cc389a440def0c8c52057f37359014ccbc9dc1f0827936bcd367c6100"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e560628513ed34759456a416bf86b54b2476c59144a9138165c9a1575801d0d9"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1674e4307fa3024fc897ca774e9c7562c957af85df55efe2988ed9056dc4e57"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:278d2ed7c16cfc075c91378c4f47924c0625f5fc84b2d50d921b18b7975bd210"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d5280312b9af0976231f9e317c20e4a61cd2f9629b7bfea6a693d1878a264ebd"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8b87e1a59c38f275c0e3676fc2ab6d59eccecfd460be267ac360cc31f7bcde96"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7c074fece789b5034b9b1404a1f8208fc2d4c6ce9decdd16e8220c5a793e6f61"}, + {file = "google_crc32c-1.5.0-cp39-cp39-win32.whl", hash = "sha256:7f57f14606cd1dd0f0de396e1e53824c371e9544a822648cd76c034d209b559c"}, + {file = "google_crc32c-1.5.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:a2355cba1f4ad8b6988a4ca3feed5bff33f6af2d7f134852cf279c2aebfde541"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f314013e7dcd5cf45ab1945d92e713eec788166262ae8deb2cfacd53def27325"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b747a674c20a67343cb61d43fdd9207ce5da6a99f629c6e2541aa0e89215bcd"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8f24ed114432de109aa9fd317278518a5af2d31ac2ea6b952b2f7782b43da091"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8667b48e7a7ef66afba2c81e1094ef526388d35b873966d8a9a447974ed9178"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:1c7abdac90433b09bad6c43a43af253e688c9cfc1c86d332aed13f9a7c7f65e2"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6f998db4e71b645350b9ac28a2167e6632c239963ca9da411523bb439c5c514d"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c99616c853bb585301df6de07ca2cadad344fd1ada6d62bb30aec05219c45d2"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ad40e31093a4af319dadf503b2467ccdc8f67c72e4bcba97f8c10cb078207b5"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd67cf24a553339d5062eff51013780a00d6f97a39ca062781d06b3a73b15462"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:398af5e3ba9cf768787eef45c803ff9614cc3e22a5b2f7d7ae116df8b11e3314"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b1f8133c9a275df5613a451e73f36c2aea4fe13c5c8997e22cf355ebd7bd0728"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ba053c5f50430a3fcfd36f75aff9caeba0440b2d076afdb79a318d6ca245f88"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:272d3892a1e1a2dbc39cc5cde96834c236d5327e2122d3aaa19f6614531bb6eb"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:635f5d4dd18758a1fbd1049a8e8d2fee4ffed124462d837d1a02a0e009c3ab31"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c672d99a345849301784604bfeaeba4db0c7aae50b95be04dd651fd2a7310b93"}, +] + +[package.extras] +testing = ["pytest"] + +[[package]] +name = "google-resumable-media" +version = "2.7.2" +description = "Utilities for Google Media Downloads and Resumable Uploads" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa"}, + {file = "google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0"}, +] + +[package.dependencies] +google-crc32c = ">=1.0,<2.0dev" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)", "google-auth (>=1.22.0,<2.0dev)"] +requests = ["requests (>=2.18.0,<3.0.0dev)"] + [[package]] name = "googleapis-common-protos" version = "1.63.2" @@ -4104,4 +4297,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = 
"080e1a8ef09819c49742c8270b5c2da81ff49469d77a8cd304567aeba79e0741" +content-hash = "5ee1d05103d9616c3968c619f716555dc4d151a7f02d0580bb11c06c26dd3612" diff --git a/pyproject.toml b/pyproject.toml index d115e16f22..9f99676272 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,8 @@ imagehash = "~4.3.1" peewee-migrate = "~1.12.2" diskcache = "~5.6.3" google-cloud-batch = "^0.17.26" +duckdb = "1.0.0" +google-cloud-storage = "<2.18.2" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index e8f7c67d95..4cf3c5eaa7 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -89,7 +89,9 @@ from robotoff.batch import ( BatchJobType, GoogleBatchJob, - GoogleBatchJobConfig + GoogleBatchJobConfig, + BatchExtraction, + GoogleStorageBucketForBatchJob, ) logger = get_logger() @@ -1763,16 +1765,25 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): class BatchJobResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) - - # Batch extraction - - # Launch Batch job - logger.info(f"Start batch with job_type: {job_type_str}") + try: job_type = BatchJobType[job_type_str] except KeyError: raise falcon.HTTPBadRequest(description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}") - + + # Batch extraction + with tempfile.TemporaryDirectory() as tmp_dir: + BatchExtraction.extract_from_dataset( + job_type=job_type, + output_dir=tmp_dir, + ) + if not BatchExtraction.extracted_file_path: + raise ValueError("The extracted file was not found.") + + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) + + # Launch batch job batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) resp.media = {"batch_job_details": batch_job} @@ -1815,7 +1826,7 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predict/nutrition", NutritionPredictorResource()) api.add_route("/api/v1/predict/ocr_prediction", OCRPredictionPredictorResource()) api.add_route("/api/v1/predict/category", CategoryPredictorResource()) -api.add_route("/api/v1/predict/ ", IngredientListPredictorResource()) +api.add_route("/api/v1/predict/ingredient_list", IngredientListPredictorResource()) api.add_route("/api/v1/predict/lang", LanguagePredictorResource()) api.add_route("/api/v1/predict/lang/product", ProductLanguagePredictorResource()) api.add_route("/api/v1/products/dataset", UpdateDatasetResource()) @@ -1845,3 +1856,4 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) api.add_route("/robots.txt", RobotsTxtResource()) +api.add_route("/api/v1/batch/launch", BatchJobResource()) \ No newline at end of file diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 3febe071f0..7bb0a17d87 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,11 +1,11 @@ -from .batch import ( +from .launch import ( GoogleBatchJob, GoogleBatchJobConfig, BatchJobType, ) - -__all__ = [ - "GoogleBatchJob", - "GoogleBatchJobConfig", - "BatchJobType", -] \ No newline at end of file +from .extraction import ( + BatchExtraction, +) +from .buckets import ( + GoogleStorageBucketForBatchJob, +) diff --git a/robotoff/batch/buckets.py 
b/robotoff/batch/buckets.py
new file mode 100644
index 0000000000..19c5ae1d02
--- /dev/null
+++ b/robotoff/batch/buckets.py
@@ -0,0 +1,71 @@
+import io
+
+from robotoff.utils.buckets import GoogleStorageBucket
+from robotoff.batch import BatchJobType
+
+
+BATCH_JOB_TYPE_TO_BUCKET = {
+    BatchJobType.ingredients_spellcheck: {
+        "bucket": "robotoff-spellcheck",
+        "suffix_preprocess": "data/preprocessed_data.parquet",
+        "suffix_postprocess": "data/postprocessed_data.parquet",
+    },
+}
+
+
+class GoogleStorageBucketForBatchJob(GoogleStorageBucket):
+    """Class to handle the Google Storage bucket for depending on the batch job.
+
+    :param bucket: Bucket name
+    :type bucket: str
+    :param suffix_preprocess: Path inside the bucket before batch processing.
+    :type suffix_preprocess: str
+    :param suffix_postprocess: Path inside the bucket after batch processing.
+    :type suffix_postprocess: str
+    """
+
+    def __init__(
+        self,
+        bucket: str,
+        suffix_preprocess: str,
+        suffix_postprocess: str,
+    ) -> None:
+        self.bucket = bucket
+        self.suffix_preprocess = suffix_preprocess
+        self.suffix_postprocess = suffix_postprocess
+
+    @classmethod
+    def from_job_type(cls, job_type:BatchJobType) -> "GoogleStorageBucketForBatchJob":
+        """Initialize the class with the configuration file corresponding to the batch job type.
+        Useful to adapt bucket upload and download during the batch job process.
+
+        :param job_type: Batch job type.
+        :type job_type: BatchJobType
+        :return: Instantiated class.
+        :rtype: GoogleStorageBucketForBatchJob
+        """
+        try:
+            bucket_dict = BATCH_JOB_TYPE_TO_BUCKET[job_type]
+        except KeyError:
+            raise ValueError(f"Batch job type {job_type} not found in the configuration. Expected {BATCH_JOB_TYPE_TO_BUCKET}.")
+        return cls(**bucket_dict)
+
+    def upload_file(self, file_path: str):
+        """Upload file to the bucket.
+
+        :param file_path: File path to upload.
+        :type file_path: str
+        """
+        self.upload_gcs(
+            file_path=file_path,
+            bucket_name=self.bucket,
+            suffix=self.suffix_preprocess,
+        )
+
+    def download_file(self) -> io.BufferedReader:
+        """Download file from bucket
+        """
+        return self.download_gcs(
+            bucket_name=self.bucket,
+            suffix=self.suffix_postprocess,
+        )
diff --git a/robotoff/batch/configs/spellcheck.yaml b/robotoff/batch/configs/job_configs/spellcheck.yaml
similarity index 100%
rename from robotoff/batch/configs/spellcheck.yaml
rename to robotoff/batch/configs/job_configs/spellcheck.yaml
diff --git a/robotoff/batch/configs/sql/spellcheck.sql b/robotoff/batch/configs/sql/spellcheck.sql
new file mode 100644
index 0000000000..5e53938a49
--- /dev/null
+++ b/robotoff/batch/configs/sql/spellcheck.sql
@@ -0,0 +1,7 @@
+SELECT code, ingredients_text AS text, product_name, (CAST(unknown_ingredients_n AS FLOAT) / CAST(ingredients_n AS FLOAT)) AS fraction
+FROM read_ndjson('DATASET_PATH', ignore_errors=True)
+WHERE ingredients_text NOT LIKE ''
+AND fraction > 0 AND fraction <= 0.4
+ORDER BY random()
+LIMIT 100
+;
\ No newline at end of file
diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py
new file mode 100644
index 0000000000..d4f517836f
--- /dev/null
+++ b/robotoff/batch/extraction.py
@@ -0,0 +1,81 @@
+import os
+from pathlib import Path
+
+import duckdb
+
+from robotoff import settings
+from robotoff.batch import BatchJobType
+
+
+BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = {
+    BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql",
+}
+
+
+class BatchExtraction:
+    """Handle batch extraction from the dataset.
+    Extraction varies regarding the batch job.
+    """
+
+    file_name: str = "batch.parquet"
+    extracted_file_path: str = None
+
+    @classmethod
+    def extract_from_dataset(
+        cls,
+        job_type: BatchJobType,
+        output_dir: str,
+        dataset_path: str = str(settings.JSONL_DATASET_PATH),
+    ) -> None:
+        """Using SQL queries, extract data from the dataset and save it as a parquet file.
+
+        :param job_type: Batch job type.
+        :type job_type: BatchJobType
+        :param output_dir: Directory to save the extracted data.
+        :type output_dir: str
+        :param dataset_path: Path to the jsonl.gz dataset.
+        :type dataset_path: Path, optional. Default to settings.JSONL_DATASET_PATH. Mainly used for testing.
+        """
+        if not isinstance(dataset_path, str):
+            raise ValueError(f"The dataset path should be a string. Current type {type(dataset_path)}")
+
+        query_file_path = BATCH_JOB_TYPE_TO_QUERY_FILE_PATH[job_type]
+        query = cls._load_query(query_file_path=query_file_path, dataset_path=dataset_path)
+        cls._extract_and_save_batch_data(query=query, output_dir=output_dir)
+        # We save the file path for later usage in the pipeline
+        cls.extracted_file_path = os.path.join(output_dir, cls.file_name)
+
+    @staticmethod
+    def _load_query(query_file_path: Path, dataset_path: str) -> str:
+        """Load the SQL query from a corresponding file.
+
+        :param query_file_path: File path containing the SQL query.
+        :type query_file_path: Path
+        :param dataset_path: Path to the jsonl.gz dataset.
+        :type dataset_path: Path
+        :raises ValueError: In case the Dataset path is not found in the SQL query.
+        :return: the SQL/DuckDB query.
+        :rtype: str
+        """
+        query = query_file_path.read_text()
+        if "DATASET_PATH" not in query:
+            raise ValueError(
+                "The SQL query should contain the string 'DATASET_PATH' to replace it with the dataset path."
+            )
+        query = query.replace("DATASET_PATH", dataset_path)
+        return query
+
+    @classmethod
+    def _extract_and_save_batch_data(cls, query: str, output_dir: str) -> None:
+        """Query and save the data.
+
+        :param query: DuckDB/SQL query.
+        :type query: str
+        :param output_dir: Extracted data directory
+        :type output_dir: str
+        """
+        (
+            duckdb
+            .sql(query)
+            .write_parquet(os.path.join(output_dir, cls.file_name))
+        )
diff --git a/robotoff/batch/batch.py b/robotoff/batch/launch.py
similarity index 99%
rename from robotoff/batch/batch.py
rename to robotoff/batch/launch.py
index a21468aea6..8b00f097aa 100644
--- a/robotoff/batch/batch.py
+++ b/robotoff/batch/launch.py
@@ -19,8 +19,7 @@ class BatchJobType(enum.Enum):
 
 # Paths batch job config files
 BATCH_JOB_TYPE_TO_CONFIG_PATH = {
-    BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR
-    / "spellcheck.yaml",
+    BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml",
 }
diff --git a/robotoff/utils/buckets.py b/robotoff/utils/buckets.py
new file mode 100644
index 0000000000..73c9295afa
--- /dev/null
+++ b/robotoff/utils/buckets.py
@@ -0,0 +1,41 @@
+import io
+from typing import Any
+
+from google.cloud import storage
+
+
+class GoogleStorageBucket:
+
+    @staticmethod
+    def download_gcs(bucket_name: str, suffix: str) -> io.BufferedReader:
+        """Download file from Google Storage Bucket.
+ + :param bucket_name: Bucket name + :type bucket_name: str + :param suffix: Path inside the bucket + :type suffix: str + :return: + :rtype: Any + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + with blob.open("rb") as f: + return f + + + @staticmethod + def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: + """Upload file to Google Storage Bucket. + + :param file_path: File path. + :type file_path: str + :param bucket_name: Bucket name. + :type bucket_name: str + :param suffix: Path inside the bucket. + :type suffix: str + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + blob.upload_from_filename(filename=file_path) diff --git a/tests/unit/data/dataset_sample.jsonl.gz b/tests/unit/data/dataset_sample.jsonl.gz new file mode 100644 index 0000000000..abb852f4d6 --- /dev/null +++ b/tests/unit/data/dataset_sample.jsonl.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7148cdb2415a8156e39cf0f876b998676d1b2489d5c3ce269407c93769e7151f +size 262180 diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index 4501ad1277..25f8c94e66 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -1,18 +1,43 @@ import pytest +import tempfile +from pathlib import Path from robotoff.batch import ( GoogleBatchJobConfig, BatchJobType, + BatchExtraction, ) + +DIR = Path(__file__).parent +JOB_TYPES = [ + "ingredients_spellcheck", +] + + # Add future job types here for testing. @pytest.mark.parametrize( "job_type_str", - [ - "ingredients_spellcheck", - ], + JOB_TYPES, ) def test_batch_job_config_file(job_type_str): "Test indirectly the batch job config file by validating with the Pydantic class model." job_type = BatchJobType[job_type_str] GoogleBatchJobConfig.init(job_type) + + +# Add future job types here for testing. +@pytest.mark.parametrize( + "job_type_str", + JOB_TYPES, +) +def test_batch_extraction(job_type_str): + """Test extraction of a batch of data from the dataset depending on the job type. 
+ """ + job_type_str = BatchJobType[job_type_str] + with tempfile.TemporaryDirectory() as tmp_dir: + BatchExtraction.extract_from_dataset( + job_type=job_type_str, + output_dir=tmp_dir, + dataset_path=str(DIR / "data/dataset_sample.jsonl.gz"), + ) From c14338d0ae9eff3ac26255db8b57c196dd9581aa Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Fri, 23 Aug 2024 11:38:00 +0200 Subject: [PATCH 04/22] refactor(batch-spellcheck): :green_heart: Fix some bugs: batch-extraction & batch-launch --- batch/spellcheck/README.md | 1 + robotoff/batch/buckets.py | 6 +++--- robotoff/batch/extraction.py | 2 +- robotoff/batch/launch.py | 26 +++++++++++++++++++------- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/batch/spellcheck/README.md b/batch/spellcheck/README.md index 975f63733e..fb2898d4d4 100644 --- a/batch/spellcheck/README.md +++ b/batch/spellcheck/README.md @@ -11,6 +11,7 @@ * A100: a2-highgpu-1g: $3.748064 * A100/Cuda doesn't support FP8 * A100 has less availability than L4: need to wait for batch job (can be long) +* Don't forget to enable **Batch & Storage API** if used without gcloud ## Links diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index 19c5ae1d02..d27b1c2d73 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -35,9 +35,9 @@ def __init__( self.suffix_postprocess = suffix_postprocess @classmethod - def from_job_type(cls, job_type:BatchJobType) -> "GoogleStorageBucketForBatchJob": - """Initialize the class with the configuration file corresponding to the batch job type. - Useful to adapt bucket upload and download during the batch job process. + def from_job_type(cls, job_type: BatchJobType) -> "GoogleStorageBucketForBatchJob": + """Initialize the class with the bucket and suffix names corresponding to the batch job type. + Used to adapt bucket upload and download during the batch job process. :param job_type: Batch job type. :type job_type: BatchJobType diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index d4f517836f..bc75be9d95 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -31,7 +31,7 @@ def extract_from_dataset( :param job_type: Batch job type. :type job_type: BatchJobType - :param output_dir: Directory to save the extracted data. + :param output_dir: Directory to save the extracted data as a parquet file. :type output_dir: str :param dataset_path: Path to the jsonl.gz dataset. :type dataset_path: Path, optional. Default to settings.JSONL_DATASET_PATH. Mainly used for testing. diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index 8b00f097aa..d8467c6e0c 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -3,9 +3,10 @@ import enum import yaml import datetime +import re from google.cloud import batch_v1 -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, ConfigDict from robotoff import settings @@ -14,7 +15,7 @@ class BatchJobType(enum.Enum): """Each job type correspond to a task that will be executed in the batch job.""" - ingredients_spellcheck = "ingredients_spellcheck" + ingredients_spellcheck = "ingredients-spellcheck" # Paths batch job config files @@ -25,6 +26,8 @@ class BatchJobType(enum.Enum): class GoogleBatchJobConfig(BaseModel): """Batch job configuration class.""" + # By default, extra fields are just ignored. We raise an error in case of extra fields. + model_config: ConfigDict = {"extra": "forbid"} job_name: str = Field( description="The name of the job. 
It needs to be unique amongst exisiting batch job names.", @@ -33,6 +36,9 @@ class GoogleBatchJobConfig(BaseModel): pattern=r"^europe-west\d{1,2}$", description="The region in which the job will run. Regions that are available for Batch are listed on: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones. We restrict to Europe-West for now.", ) + container_image_uri: str = Field( + description="The URI of the container image to use for the job. SHould be a valid Image URI.", + ) entrypoint: Optional[str] = Field( default=None, description="The entrypoint for the container. If None, use default entrypoint.", @@ -100,12 +106,16 @@ def init(cls, job_type: BatchJobType): :param job_type: Batch job type. :type job_type: BatchJobType """ + # Batch job name should respect a specific pattern, or returns an error + pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$" + if not re.match(pattern, job_type.value): + raise ValueError(f"Job name should respect the pattern: {pattern}. Current job name: {job_type.value}") + # Generate unique id for the job unique_job_name = ( - job_type.name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + job_type.value + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") ) - - # Load config from job_type + # Load config file from job_type config_path = BATCH_JOB_TYPE_TO_CONFIG_PATH[job_type] with open(config_path, "r") as f: config = yaml.safe_load(f) @@ -135,8 +145,10 @@ def launch_job( ) -> batch_v1.Job: """This method creates a Batch Job on GCP. - Method copied from https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create - + Sources: + * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create + * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types + :param google_batch_launch_config: Config to run a job on Google Batch. :type google_batch_launch_config: GoogleBatchLaunchConfig :param batch_job_config: Config to run a specific job on Google Batch. 
From 6c83b8c9141314305da52d20acb2b43122ec4db2 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Sat, 24 Aug 2024 18:03:52 +0200 Subject: [PATCH 05/22] feat(batch - spellcheck): :zap: From predictions to insights --- robotoff/app/api.py | 28 ++++++++++++++++-- robotoff/batch/__init__.py | 1 + robotoff/batch/importer.py | 55 +++++++++++++++++++++++++++++++++++ robotoff/insights/importer.py | 29 ++++++++++++++++++ 4 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 robotoff/batch/importer.py diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 4cf3c5eaa7..5daf219fb3 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -92,6 +92,8 @@ GoogleBatchJobConfig, BatchExtraction, GoogleStorageBucketForBatchJob, + generate_predictions_from_batch, + ) logger = get_logger() @@ -1762,7 +1764,7 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.status = falcon.HTTP_200 -class BatchJobResource: +class BatchJobLaunchResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) @@ -1779,7 +1781,6 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): ) if not BatchExtraction.extracted_file_path: raise ValueError("The extracted file was not found.") - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) @@ -1789,6 +1790,27 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): resp.media = {"batch_job_details": batch_job} +class BatchJobImportResource: + def on_post(self, req: falcon.Request, resp: falcon.Response): + job_type_str: str = req.get_param("job_type", required=True) + + from robotoff.insights.importer import import_insights + try: + job_type = BatchJobType[job_type_str] + except KeyError: + raise falcon.HTTPBadRequest( + description=f"invalid job_type: {job_type_str}. 
Valid job_types are: {[elt.value for elt in BatchJobType]}" + ) + + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + predictions = generate_predictions_from_batch( + bucket_handler.download_file, + job_type + ) + with db: + import_insights(predictions=predictions, server_type="off") + + def custom_handle_uncaught_exception( req: falcon.Request, resp: falcon.Response, ex: Exception, params ): @@ -1856,4 +1878,4 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) api.add_route("/robots.txt", RobotsTxtResource()) -api.add_route("/api/v1/batch/launch", BatchJobResource()) \ No newline at end of file +api.add_route("/api/v1/batch/launch", BatchJobLaunchResource()) \ No newline at end of file diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 7bb0a17d87..d9470f8e2b 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -9,3 +9,4 @@ from .buckets import ( GoogleStorageBucketForBatchJob, ) +from .importer import generate_predictions_from_batch diff --git a/robotoff/batch/importer.py b/robotoff/batch/importer.py new file mode 100644 index 0000000000..d8df8d48ec --- /dev/null +++ b/robotoff/batch/importer.py @@ -0,0 +1,55 @@ +import io +from typing import Iterator + +import pandas as pd + +from robotoff.batch import BatchJobType +from robotoff.types import Prediction, PredictionType + + +BATCH_JOB_TYPE_TO_FEATURES = { + BatchJobType.ingredients_spellcheck: { + "barcode": "code", + "value": "correction", + "value_tag": "lang", + }, +} + +BATCH_JOB_TYPE_TO_PREDICTION_TYPE = { + BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, +} + +PREDICTOR_VERSION = "1" + + +def generate_predictions_from_batch( + f: io.BufferedReader, + job_type: BatchJobType +) -> Iterator[Prediction]: + """From a file imported from google storage, generate predictions depending on the job type. + + :param f: Readable object. Should be a parquet file. + :type f: io.BufferedReader + :param job_type: Batch job type. + :type job_type: BatchJobType + :rtype: Iterable[Prediction] + :yield: Predictions. 
+ :rtype: Iterator[Prediction] + """ + features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type] + prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type] + + try: + df = pd.read_parquet(f) + except Exception as e: + raise ValueError(f"Failed to read parquet file: {e}") + + for _, row in df.iterrows(): + yield Prediction( + type=prediction_type, + value=row[features_dict["value"]], + value_tag=[features_dict["value_tag"]], + barcode=row[features_dict["barcode"]], + predictor_version=PREDICTOR_VERSION, + predictor="llm", + ) diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index bc5cdcebde..275393b9eb 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1475,6 +1475,35 @@ def compute_crop_bounding_box( return results +class IngredientsSpellcheckImporter(InsightImporter): + + @staticmethod + def get_type() -> InsightType: + return InsightType.ingredient_spellcheck + + @classmethod + def get_required_prediction_types(cls) -> set[PredictionType]: + return {PredictionType.ingredient_spellcheck} + + @classmethod + def generate_candidates( + cls, + product: Optional[Product], + predictions: list[Prediction], + product_id: ProductIdentifier, + ) -> Iterator[ProductInsight]: + # No reason to have different candidates for now + candidate = predictions[0] + yield ProductInsight(**candidate.to_dict()) + + @classmethod + def is_conflicting_insight( + cls, + candidate: ProductInsight, + reference: ProductInsight + ) -> bool: + candidate.value == reference.value + class PackagingElementTaxonomyException(Exception): pass From a369a5937cb174ec4fdbe8a6e62ff30680a6505e Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Mon, 26 Aug 2024 19:02:40 +0200 Subject: [PATCH 06/22] feat(batch - spellcheck): :zap: API endpoint batch/launch ok: Batch extraction with DuckDB and launch on GCP . --- batch/spellcheck/main.py | 2 +- credentials/.gitkeep | 0 docker-compose.yml | 5 ++- poetry.lock | 78 ++++++++++++++++++++++++++++++++++-- pyproject.toml | 1 + robotoff/app/api.py | 34 ++++++---------- robotoff/batch/__init__.py | 33 +++++++++++++++ robotoff/batch/extraction.py | 4 ++ robotoff/batch/launch.py | 2 +- 9 files changed, 132 insertions(+), 27 deletions(-) create mode 100644 credentials/.gitkeep diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 3b4d0339a9..6853ea288b 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -49,7 +49,7 @@ def main(): LOGGER.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}") data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix) LOGGER.info(f"Feature in uploaded data: {data.columns}") - if not all(feature in FEATURES_VALIDATION for feature in data.columns): + if not all(feature in data.columns for feature in FEATURES_VALIDATION): raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. 
Current features: {data.columns}") instructions = [prepare_instruction(text) for text in data["text"]] diff --git a/credentials/.gitkeep b/credentials/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docker-compose.yml b/docker-compose.yml index 60a914f0fb..83a5022945 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,7 @@ x-robotoff-base-volumes: - ./cache:/opt/robotoff/cache - ./datasets:/opt/robotoff/datasets - ./models:/opt/robotoff/models + - ./credentials:/opt/credentials x-robotoff-base: &robotoff-base @@ -53,7 +54,9 @@ x-robotoff-base-env: IMAGE_MODERATION_SERVICE_URL: CROP_ALLOWED_DOMAINS: NUM_RQ_WORKERS: 4 # Update worker service command accordingly if you change this settings - + GOOGLE_APPLICATION_CREDENTIALS: /opt/credentials/google/application_default_credentials.json + GOOGLE_CLOUD_PROJECT: "robotoff" + x-robotoff-worker-base: &robotoff-worker restart: $RESTART_POLICY diff --git a/poetry.lock b/poetry.lock index 5dad07cdf6..11fcef4863 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2117,8 +2117,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -2211,6 +2211,78 @@ files = [ {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] +[[package]] +name = "pandas" +version = "2.2.2" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = 
"pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"}, + {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", 
"xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + [[package]] name = "pathspec" version = "0.12.1" @@ -2612,8 +2684,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4297,4 +4369,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "5ee1d05103d9616c3968c619f716555dc4d151a7f02d0580bb11c06c26dd3612" +content-hash = "30d2c1f1df11a9fdbecd73ec5c64732a361053cf7b350350d433def955691df8" diff --git a/pyproject.toml b/pyproject.toml index 9f99676272..7926145882 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ diskcache = "~5.6.3" google-cloud-batch = "^0.17.26" duckdb = "1.0.0" google-cloud-storage = "<2.18.2" +pandas = "^2.2.2" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 5daf219fb3..531cde9e4c 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -87,10 +87,8 @@ from robotoff.workers.queues import enqueue_job, get_high_queue, low_queue from robotoff.workers.tasks import download_product_dataset_job from robotoff.batch import ( - BatchJobType, - GoogleBatchJob, - GoogleBatchJobConfig, - BatchExtraction, + BatchJobType, + launch_batch_job, GoogleStorageBucketForBatchJob, generate_predictions_from_batch, @@ -1771,23 +1769,17 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): try: job_type = BatchJobType[job_type_str] except KeyError: - raise falcon.HTTPBadRequest(description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}") - - # Batch extraction - with tempfile.TemporaryDirectory() as tmp_dir: - BatchExtraction.extract_from_dataset( - job_type=job_type, - output_dir=tmp_dir, + raise falcon.HTTPBadRequest( + description=f"invalid job_type: {job_type_str}. 
Valid job_types are: {[elt.value for elt in BatchJobType]}" ) - if not BatchExtraction.extracted_file_path: - raise ValueError("The extracted file was not found.") - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) - - # Launch batch job - batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) - batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) - resp.media = {"batch_job_details": batch_job} + # Batch extraction can take some time, so we queue it + enqueue_job( + launch_batch_job, + queue=low_queue, + job_type=job_type, + job_kwargs={"timeout": "10m"}, + ) + logger.info("Batch job %s queued", job_type) class BatchJobImportResource: @@ -1804,7 +1796,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) predictions = generate_predictions_from_batch( - bucket_handler.download_file, + bucket_handler.download_file(), job_type ) with db: diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index d9470f8e2b..c7e681bf5d 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,3 +1,5 @@ +import tempfile + from .launch import ( GoogleBatchJob, GoogleBatchJobConfig, @@ -10,3 +12,34 @@ GoogleStorageBucketForBatchJob, ) from .importer import generate_predictions_from_batch + +from robotoff.utils import get_logger + + +LOGGER = get_logger(__name__) + + +def launch_batch_job(job_type: BatchJobType) -> None: + """_summary_ + + :param job_type: _description_ + :type job_type: BatchJobType + """ + with tempfile.TemporaryDirectory() as tmp_dir: + BatchExtraction.extract_from_dataset( + job_type=job_type, + output_dir=tmp_dir, + ) + if not BatchExtraction.extracted_file_path: + raise ValueError("The extracted file was not found.") + LOGGER.debug(f"Batch data succesfully extracted and saved at {BatchExtraction.extracted_file_path}") + + # Upload the extracted file to the bucket + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) + LOGGER.debug(f"File uploaded to the bucket {bucket_handler.bucket}") + + # Launch batch job + batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) + batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) + LOGGER.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}") diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index bc75be9d95..c0054c3b8e 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -5,8 +5,11 @@ from robotoff import settings from robotoff.batch import BatchJobType +from robotoff.utils import get_logger +LOGGER = get_logger(__name__) + BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = { BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql", } @@ -63,6 +66,7 @@ def _load_query(query_file_path: Path, dataset_path: str) -> str: "The SQL query should contain the string 'DATASET_PATH' to replace it with the dataset path." 
) query = query.replace("DATASET_PATH", dataset_path) + LOGGER.debug(f"Query used to extract batch from dataset: {query}") return query @classmethod diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index d8467c6e0c..cdd17b8b6f 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -157,7 +157,7 @@ def launch_job( :rtype: batch_v1.Job Returns: - A job object representing the job created. + Batch job information. """ client = batch_v1.BatchServiceClient() From 729d4e16ce91c76c3f500046f71f476f640f8ef9 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 27 Aug 2024 17:46:44 +0200 Subject: [PATCH 07/22] feat(batch - spellcheck): :zap: Integrate batch data from job into Robotoff sql tables --- .gitignore | 2 + batch/spellcheck/main.py | 20 ++++++++ poetry.lock | 53 ++++++++++++++++++++- pyproject.toml | 1 + robotoff/app/api.py | 23 ++++----- robotoff/batch/__init__.py | 7 ++- robotoff/batch/buckets.py | 5 +- robotoff/batch/configs/sql/spellcheck.sql | 7 ++- robotoff/batch/importer.py | 58 +++++++++++++++-------- robotoff/insights/importer.py | 11 +++-- robotoff/utils/buckets.py | 10 ++-- 11 files changed, 144 insertions(+), 53 deletions(-) diff --git a/.gitignore b/.gitignore index 3a4dd3e70a..0443dcd510 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,5 @@ site/ gh_pages/ doc/README.md doc/references/cli.md + +credentials \ No newline at end of file diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 6853ea288b..0960765fef 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -2,6 +2,7 @@ import tempfile import logging import sys +import requests from typing import List import pandas as pd @@ -77,6 +78,10 @@ def main(): bucket_name=args.data_bucket, suffix=args.post_data_suffix ) + + LOGGER.info("Request Robotoff API batch import endpoint.") + run_robotoff_endpoint_batch_import() + LOGGER.info("Batch processing job completed.") @@ -148,5 +153,20 @@ def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: blob = bucket.blob(suffix) blob.upload_from_filename(filename=file_path) + +def run_robotoff_endpoint_batch_import(): + """Run Robotoff api endpoint to import batch data into tables. 
+ """ + url = "https://robotoff.openfoodfacts.org/api/v1/batch/import" + data = {"job_type": "ingredients_spellcheck"} + + try: + response = requests.post(url, data=data) + except requests.exceptions.RequestException as e: + raise SystemExit(e) + + LOGGER.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") + + if __name__ == "__main__": main() diff --git a/poetry.lock b/poetry.lock index 11fcef4863..c0680af5a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2622,6 +2622,57 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "pyarrow" +version = "17.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, + {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, + {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", 
hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, + {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, + {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, + {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, + {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pyasn1" version = "0.6.0" @@ -4369,4 +4420,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "30d2c1f1df11a9fdbecd73ec5c64732a361053cf7b350350d433def955691df8" +content-hash = "07551d5c2b36e7b3321ac361d384ef02a74de7686051a450ecd692a0b832929b" diff --git a/pyproject.toml b/pyproject.toml index 7926145882..133f61c850 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,7 @@ google-cloud-batch = "^0.17.26" duckdb = "1.0.0" google-cloud-storage = "<2.18.2" pandas = "^2.2.2" +pyarrow = "^17.0.0" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 531cde9e4c..1e761b10f8 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -89,9 +89,7 @@ from robotoff.batch 
import ( BatchJobType, launch_batch_job, - GoogleStorageBucketForBatchJob, - generate_predictions_from_batch, - + import_batch_predictions, ) logger = get_logger() @@ -1779,28 +1777,26 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): job_type=job_type, job_kwargs={"timeout": "10m"}, ) - logger.info("Batch job %s queued", job_type) + logger.info("Batch job launch %s has been queued.", job_type) class BatchJobImportResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) - from robotoff.insights.importer import import_insights try: job_type = BatchJobType[job_type_str] except KeyError: raise falcon.HTTPBadRequest( description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}" ) - - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - predictions = generate_predictions_from_batch( - bucket_handler.download_file(), - job_type + enqueue_job( + import_batch_predictions, + job_type=job_type, + queue=low_queue, + job_kwargs={"timeout": "10m"}, ) - with db: - import_insights(predictions=predictions, server_type="off") + logger.info("Batch import %s has been queued.", job_type) def custom_handle_uncaught_exception( @@ -1870,4 +1866,5 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) api.add_route("/robots.txt", RobotsTxtResource()) -api.add_route("/api/v1/batch/launch", BatchJobLaunchResource()) \ No newline at end of file +api.add_route("/api/v1/batch/launch", BatchJobLaunchResource()) +api.add_route("/api/v1/batch/import", BatchJobImportResource()) diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index c7e681bf5d..1d2e20c521 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -11,8 +11,7 @@ from .buckets import ( GoogleStorageBucketForBatchJob, ) -from .importer import generate_predictions_from_batch - +from .importer import import_batch_predictions from robotoff.utils import get_logger @@ -37,9 +36,9 @@ def launch_batch_job(job_type: BatchJobType) -> None: # Upload the extracted file to the bucket bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) - LOGGER.debug(f"File uploaded to the bucket {bucket_handler.bucket}") + LOGGER.debug(f"File uploaded to the bucket {bucket_handler.bucket}/{bucket_handler.suffix_preprocess}") # Launch batch job batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) - LOGGER.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}") + LOGGER.info(f"Batch job succesfully launched. 
Batch job name: {batch_job.name}.") diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index d27b1c2d73..655c8f0af9 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -1,5 +1,4 @@ -import io - +import pandas as pd from robotoff.utils.buckets import GoogleStorageBucket from robotoff.batch import BatchJobType @@ -62,7 +61,7 @@ def upload_file(self, file_path: str): suffix=self.suffix_preprocess, ) - def download_file(self) -> io.BufferedReader: + def download_file(self) -> pd.DataFrame: """Download file from bucket """ return self.download_gcs( diff --git a/robotoff/batch/configs/sql/spellcheck.sql b/robotoff/batch/configs/sql/spellcheck.sql index 5e53938a49..0cfebcb09a 100644 --- a/robotoff/batch/configs/sql/spellcheck.sql +++ b/robotoff/batch/configs/sql/spellcheck.sql @@ -1,4 +1,9 @@ -SELECT code, ingredients_text AS text, product_name, (CAST(unknown_ingredients_n AS FLOAT) / CAST(ingredients_n AS FLOAT)) AS fraction +SELECT +code, +ingredients_text AS text, +product_name, +lang, +(CAST(unknown_ingredients_n AS FLOAT) / CAST(ingredients_n AS FLOAT)) AS fraction FROM read_ndjson('DATASET_PATH', ignore_errors=True) WHERE ingredients_text NOT LIKE '' AND fraction > 0 AND fraction <= 0.4 diff --git a/robotoff/batch/importer.py b/robotoff/batch/importer.py index d8df8d48ec..5ebbc9dff0 100644 --- a/robotoff/batch/importer.py +++ b/robotoff/batch/importer.py @@ -1,12 +1,17 @@ -import io -from typing import Iterator +from typing import List import pandas as pd -from robotoff.batch import BatchJobType +from robotoff.insights.importer import import_insights +from robotoff.batch import BatchJobType, GoogleStorageBucketForBatchJob from robotoff.types import Prediction, PredictionType +from robotoff.models import db +from robotoff.utils import get_logger +from robotoff.types import ServerType +LOGGER = get_logger(__name__) + BATCH_JOB_TYPE_TO_FEATURES = { BatchJobType.ingredients_spellcheck: { "barcode": "code", @@ -19,13 +24,28 @@ BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, } -PREDICTOR_VERSION = "1" +PREDICTOR_VERSION = "2" + + +def import_batch_predictions(job_type: BatchJobType) -> None: + """Import predictions from remote storage. + """ + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + LOGGER.debug(f"Batch data downloaded from bucket {bucket_handler.bucket}/{bucket_handler.suffix_postprocess}") + df = bucket_handler.download_file() + predictions = _generate_predictions_from_batch(df, job_type) + with db: + import_results = import_insights( + predictions=predictions, + server_type=ServerType.off + ) + LOGGER.info(f"Batch import results: {repr(import_results)}.") -def generate_predictions_from_batch( - f: io.BufferedReader, +def _generate_predictions_from_batch( + df: pd.DataFrame, job_type: BatchJobType -) -> Iterator[Prediction]: +) -> List[Prediction]: """From a file imported from google storage, generate predictions depending on the job type. :param f: Readable object. Should be a parquet file. @@ -36,20 +56,18 @@ def generate_predictions_from_batch( :yield: Predictions. 
     :rtype: Iterator[Prediction]
     """
+    predictions = []
     features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type]
     prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type]
-
-    try:
-        df = pd.read_parquet(f)
-    except Exception as e:
-        raise ValueError(f"Failed to read parquet file: {e}")
-
     for _, row in df.iterrows():
-        yield Prediction(
-            type=prediction_type,
-            value=row[features_dict["value"]],
-            value_tag=[features_dict["value_tag"]],
-            barcode=row[features_dict["barcode"]],
-            predictor_version=PREDICTOR_VERSION,
-            predictor="llm",
+        predictions.append(
+            Prediction(
+                type=prediction_type,
+                value=row[features_dict["value"]],
+                value_tag=row[features_dict["value_tag"]],
+                barcode=row[features_dict["barcode"]],
+                predictor_version=PREDICTOR_VERSION,
+                predictor="llm",
+            )
         )
+    return predictions
diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py
index 275393b9eb..029f9aefcd 100644
--- a/robotoff/insights/importer.py
+++ b/robotoff/insights/importer.py
@@ -1475,7 +1475,7 @@ def compute_crop_bounding_box(
     return results
 
 
-class IngredientsSpellcheckImporter(InsightImporter):
+class IngredientSpellcheckImporter(InsightImporter):
 
     @staticmethod
     def get_type() -> InsightType:
@@ -1492,9 +1492,9 @@ def generate_candidates(
         predictions: list[Prediction],
         product_id: ProductIdentifier,
     ) -> Iterator[ProductInsight]:
-        # No reason to have different candidates for now
-        candidate = predictions[0]
-        yield ProductInsight(**candidate.to_dict())
+        # Only one prediction
+        for candidate in predictions:
+            yield ProductInsight(**candidate.to_dict())
 
     @classmethod
     def is_conflicting_insight(
@@ -1502,7 +1502,7 @@ def is_conflicting_insight(
         candidate: ProductInsight,
         reference: ProductInsight
     ) -> bool:
-        candidate.value == reference.value
+        return candidate.value_tag == reference.value_tag
 
 
 class PackagingElementTaxonomyException(Exception):
     pass
@@ -1839,6 +1839,7 @@ def import_product_predictions(
     PackagingImporter,
     UPCImageImporter,
     NutritionImageImporter,
+    IngredientSpellcheckImporter,
 ]
 
 
diff --git a/robotoff/utils/buckets.py b/robotoff/utils/buckets.py
index 73c9295afa..cc92cadfda 100644
--- a/robotoff/utils/buckets.py
+++ b/robotoff/utils/buckets.py
@@ -1,14 +1,12 @@
-import io
-from typing import Any
-
+import pandas as pd
 from google.cloud import storage
 
 
 class GoogleStorageBucket:
 
     @staticmethod
-    def download_gcs(bucket_name: str, suffix: str) -> io.BufferedReader:
-        """Download file from Google Storage Bucket.
+    def download_gcs(bucket_name: str, suffix: str) -> pd.DataFrame:
+        """Download parquet file from Google Storage Bucket.
:param bucket_name: Bucket name :type bucket_name: str @@ -21,7 +19,7 @@ def download_gcs(bucket_name: str, suffix: str) -> io.BufferedReader: bucket = client.get_bucket(bucket_name) blob = bucket.blob(suffix) with blob.open("rb") as f: - return f + return pd.read_parquet(f) @staticmethod From 34ce80e42e76143e37a1eada1c031ab78b047e8c Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 27 Aug 2024 18:39:04 +0200 Subject: [PATCH 08/22] feat: :sparkles: Restructure code --- robotoff/batch/__init__.py | 75 ++++++++++++++++++++++++++++++++---- robotoff/batch/buckets.py | 11 +----- robotoff/batch/extraction.py | 5 +-- robotoff/batch/importer.py | 73 ----------------------------------- robotoff/batch/launch.py | 16 +------- robotoff/batch/types.py | 35 +++++++++++++++++ robotoff/types.py | 39 ++----------------- 7 files changed, 112 insertions(+), 142 deletions(-) delete mode 100644 robotoff/batch/importer.py create mode 100644 robotoff/batch/types.py diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 1d2e20c521..87e45fe69a 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,9 +1,20 @@ import tempfile +from typing import List + +import pandas as pd + +from robotoff.utils import get_logger +from robotoff.types import ( + BatchJobType, + Prediction, + ServerType +) +from robotoff.models import db +from robotoff.insights.importer import import_insights from .launch import ( GoogleBatchJob, GoogleBatchJobConfig, - BatchJobType, ) from .extraction import ( BatchExtraction, @@ -11,18 +22,21 @@ from .buckets import ( GoogleStorageBucketForBatchJob, ) -from .importer import import_batch_predictions -from robotoff.utils import get_logger +from .types import ( + BATCH_JOB_TYPE_TO_FEATURES, + BATCH_JOB_TYPE_TO_PREDICTION_TYPE, +) LOGGER = get_logger(__name__) +PREDICTOR_VERSION = "1" #TODO: shard HF model version? instead of manual change? -def launch_batch_job(job_type: BatchJobType) -> None: - """_summary_ +PREDICTOR = "llm" - :param job_type: _description_ - :type job_type: BatchJobType + +def launch_batch_job(job_type: BatchJobType) -> None: + """Launch a batch job. """ with tempfile.TemporaryDirectory() as tmp_dir: BatchExtraction.extract_from_dataset( @@ -42,3 +56,50 @@ def launch_batch_job(job_type: BatchJobType) -> None: batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) LOGGER.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") + + +def import_batch_predictions(job_type: BatchJobType) -> None: + """Import predictions from remote storage. + """ + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + LOGGER.debug(f"Batch data downloaded from bucket {bucket_handler.bucket}/{bucket_handler.suffix_postprocess}") + df = bucket_handler.download_file() + predictions = _generate_predictions_from_batch(df, job_type) + with db: + import_results = import_insights( + predictions=predictions, + server_type=ServerType.off + ) + LOGGER.info(f"Batch import results: {repr(import_results)}.") + + +def _generate_predictions_from_batch( + df: pd.DataFrame, + job_type: BatchJobType +) -> List[Prediction]: + """From a file imported from google storage, generate predictions depending on the job type. + + :param f: Readable object. Should be a parquet file. + :type f: io.BufferedReader + :param job_type: Batch job type. + :type job_type: BatchJobType + :rtype: Iterable[Prediction] + :yield: Predictions. 
+ :rtype: Iterator[Prediction] + """ + predictions = [] + features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type] + prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type] + for _, row in df.iterrows(): + predictions.append( + Prediction( + type=prediction_type, + value=row[features_dict["value"]], + value_tag=row[features_dict["value_tag"]], + barcode=row[features_dict["barcode"]], + predictor_version=PREDICTOR_VERSION, + predictor=PREDICTOR, + ) + ) + return predictions + diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index 655c8f0af9..278b62bf18 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -1,15 +1,8 @@ import pandas as pd + from robotoff.utils.buckets import GoogleStorageBucket from robotoff.batch import BatchJobType - - -BATCH_JOB_TYPE_TO_BUCKET = { - BatchJobType.ingredients_spellcheck: { - "bucket": "robotoff-spellcheck", - "suffix_preprocess": "data/preprocessed_data.parquet", - "suffix_postprocess": "data/postprocessed_data.parquet", - }, -} +from robotoff.batch.types import BATCH_JOB_TYPE_TO_BUCKET class GoogleStorageBucketForBatchJob(GoogleStorageBucket): diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index c0054c3b8e..5ca4d2f0e1 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -5,15 +5,12 @@ from robotoff import settings from robotoff.batch import BatchJobType +from robotoff.batch.types import BATCH_JOB_TYPE_TO_QUERY_FILE_PATH from robotoff.utils import get_logger LOGGER = get_logger(__name__) -BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = { - BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql", -} - class BatchExtraction: """Handle batch extraction from the dataset. diff --git a/robotoff/batch/importer.py b/robotoff/batch/importer.py deleted file mode 100644 index 5ebbc9dff0..0000000000 --- a/robotoff/batch/importer.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import List - -import pandas as pd - -from robotoff.insights.importer import import_insights -from robotoff.batch import BatchJobType, GoogleStorageBucketForBatchJob -from robotoff.types import Prediction, PredictionType -from robotoff.models import db -from robotoff.utils import get_logger -from robotoff.types import ServerType - - -LOGGER = get_logger(__name__) - -BATCH_JOB_TYPE_TO_FEATURES = { - BatchJobType.ingredients_spellcheck: { - "barcode": "code", - "value": "correction", - "value_tag": "lang", - }, -} - -BATCH_JOB_TYPE_TO_PREDICTION_TYPE = { - BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, -} - -PREDICTOR_VERSION = "2" - - -def import_batch_predictions(job_type: BatchJobType) -> None: - """Import predictions from remote storage. - """ - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - LOGGER.debug(f"Batch data downloaded from bucket {bucket_handler.bucket}/{bucket_handler.suffix_postprocess}") - df = bucket_handler.download_file() - predictions = _generate_predictions_from_batch(df, job_type) - with db: - import_results = import_insights( - predictions=predictions, - server_type=ServerType.off - ) - LOGGER.info(f"Batch import results: {repr(import_results)}.") - - -def _generate_predictions_from_batch( - df: pd.DataFrame, - job_type: BatchJobType -) -> List[Prediction]: - """From a file imported from google storage, generate predictions depending on the job type. - - :param f: Readable object. Should be a parquet file. - :type f: io.BufferedReader - :param job_type: Batch job type. 
- :type job_type: BatchJobType - :rtype: Iterable[Prediction] - :yield: Predictions. - :rtype: Iterator[Prediction] - """ - predictions = [] - features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type] - prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type] - for _, row in df.iterrows(): - predictions.append( - Prediction( - type=prediction_type, - value=row[features_dict["value"]], - value_tag=row[features_dict["value_tag"]], - barcode=row[features_dict["barcode"]], - predictor_version=PREDICTOR_VERSION, - predictor="llm", - ) - ) - return predictions diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index cdd17b8b6f..a5ca69132c 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,6 +1,5 @@ import abc from typing import List, Optional -import enum import yaml import datetime import re @@ -9,19 +8,8 @@ from pydantic import BaseModel, Field, ConfigDict from robotoff import settings - - -@enum.unique -class BatchJobType(enum.Enum): - """Each job type correspond to a task that will be executed in the batch job.""" - - ingredients_spellcheck = "ingredients-spellcheck" - - -# Paths batch job config files -BATCH_JOB_TYPE_TO_CONFIG_PATH = { - BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml", -} +from robotoff.types import BatchJobType +from robotoff.batch.types import BATCH_JOB_TYPE_TO_CONFIG_PATH class GoogleBatchJobConfig(BaseModel): diff --git a/robotoff/batch/types.py b/robotoff/batch/types.py new file mode 100644 index 0000000000..c0c452cefd --- /dev/null +++ b/robotoff/batch/types.py @@ -0,0 +1,35 @@ +from robotoff.types import BatchJobType, PredictionType +from robotoff import settings + + +# Bucket structure to enable the batch job to load and upload data +BATCH_JOB_TYPE_TO_BUCKET = { + BatchJobType.ingredients_spellcheck: { + "bucket": "robotoff-spellcheck", + "suffix_preprocess": "data/preprocessed_data.parquet", + "suffix_postprocess": "data/postprocessed_data.parquet", + }, +} + +# Paths batch job config files +BATCH_JOB_TYPE_TO_CONFIG_PATH = { + BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml", +} + +BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = { + BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql", +} + +# Mapping between batch job type and prediction type +BATCH_JOB_TYPE_TO_PREDICTION_TYPE = { + BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, +} + +# Column names in the processed batch of data +BATCH_JOB_TYPE_TO_FEATURES = { + BatchJobType.ingredients_spellcheck: { + "barcode": "code", + "value": "correction", + "value_tag": "lang", + }, +} diff --git a/robotoff/types.py b/robotoff/types.py index 8105d2030a..52704e0ec5 100644 --- a/robotoff/types.py +++ b/robotoff/types.py @@ -359,39 +359,8 @@ class PackagingElementProperty(enum.Enum): InsightAnnotation = Literal[-1, 0, 1, 2] - - - - @enum.unique -class Lang(str, enum.Enum): - english = "en" - french = "fr" - german = "de" - spanish = "es" - italian = "it" - portuguese = "pt" - dutch = "nl" - polish = "pl" - russian = "ru" - japanese = "ja" - chinese = "zh" - arabic = "ar" - turkish = "tr" - vietnamese = "vi" - thai = "th" - korean = "ko" - ukrainian = "uk" - indonesian = "id" - hungarian = "hu" - greek = "el" - romanian = "ro" - danish = "da" - swedish = "sv" - norwegian = "no" - finnish = "fi" - bulgarian = "bg" - czech = "cs" - slovak = "sk" - croatian = "hr" - \ No newline at end of file +class BatchJobType(enum.Enum): + """Each job 
type correspond to a task that will be executed in the batch job. + """ + ingredients_spellcheck = "ingredients-spellcheck" \ No newline at end of file From 92cb5f34a0e3e1b0024fa958fb7cf1608cfbe53f Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 28 Aug 2024 14:19:13 +0200 Subject: [PATCH 09/22] feat: :sparkles: Change batch job launch from api endpoint to CLI There's no reason to configure the launch from endpoint. So we put in CLI instead of manual launch --- robotoff/app/api.py | 21 --------------------- robotoff/batch/__init__.py | 4 ++-- robotoff/cli/main.py | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 1e761b10f8..0c60637090 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -1760,26 +1760,6 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.status = falcon.HTTP_200 -class BatchJobLaunchResource: - def on_post(self, req: falcon.Request, resp: falcon.Response): - job_type_str: str = req.get_param("job_type", required=True) - - try: - job_type = BatchJobType[job_type_str] - except KeyError: - raise falcon.HTTPBadRequest( - description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}" - ) - # Batch extraction can take some time, so we queue it - enqueue_job( - launch_batch_job, - queue=low_queue, - job_type=job_type, - job_kwargs={"timeout": "10m"}, - ) - logger.info("Batch job launch %s has been queued.", job_type) - - class BatchJobImportResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) @@ -1866,5 +1846,4 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) api.add_route("/robots.txt", RobotsTxtResource()) -api.add_route("/api/v1/batch/launch", BatchJobLaunchResource()) api.add_route("/api/v1/batch/import", BatchJobImportResource()) diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 87e45fe69a..855a4cebe1 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -79,8 +79,8 @@ def _generate_predictions_from_batch( ) -> List[Prediction]: """From a file imported from google storage, generate predictions depending on the job type. - :param f: Readable object. Should be a parquet file. - :type f: io.BufferedReader + :param df: Post-processed dataset + :type df: pd.DataFrame :param job_type: Batch job type. :type job_type: BatchJobType :rtype: Iterable[Prediction] diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index 4d69a00cc0..481b5bb57c 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -998,5 +998,22 @@ def create_migration( router.create(name, auto=auto) +@app.command() +def launch_batch_job( + job_type: str = typer.Argument(..., help="Type of job to launch. Ex: 'ingredients_spellcheck'"), +) -> None: + """Launch a batch job.""" + from robotoff.batch import launch_batch_job + from robotoff.utils import get_logger + from robotoff.types import BatchJobType + + if job_type not in BatchJobType.__members__: + raise ValueError(f"Invalid job type: {job_type}. 
Must be one of those: {[job.name for job in BatchJobType]}") + + get_logger() + job_type = BatchJobType[job_type] + launch_batch_job(job_type) + + def main() -> None: app() From 54f1734306b9b1ee0383b1b138b52958bde4d556 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 28 Aug 2024 16:26:12 +0200 Subject: [PATCH 10/22] feat: :lock: Secure Batch Data Import endpoint with a token key --- docker-compose.yml | 1 + robotoff/app/api.py | 45 +++++++++++++++++++++++++++++++++++++------- robotoff/app/auth.py | 20 ++++++++++++++++++++ 3 files changed, 59 insertions(+), 7 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index e1de341f05..1fe85dacd2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,6 +57,7 @@ x-robotoff-base-env: NUM_RQ_WORKERS: 4 # Update worker service command accordingly if you change this settings GOOGLE_APPLICATION_CREDENTIALS: /opt/credentials/google/application_default_credentials.json GOOGLE_CLOUD_PROJECT: "robotoff" + BATCH_JOB_KEY: # Secure Batch job import with a token key x-robotoff-worker-base: &robotoff-worker diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 0c60637090..ca09dddbc3 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -25,7 +25,11 @@ from robotoff import settings from robotoff.app import schema -from robotoff.app.auth import BasicAuthDecodeError, basic_decode +from robotoff.app.auth import ( + BasicAuthDecodeError, + basic_decode, + validate_token, +) from robotoff.app.core import ( SkipVotedOn, SkipVotedType, @@ -302,6 +306,29 @@ def parse_auth(req: falcon.Request) -> Optional[OFFAuthentication]: ) +def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: + """Parse and validate authentification token from request. + + :param req: Request. + :type req: falcon.Request + :param ref_token_name: Secret environment variable name. + :type ref_token_name: str + :return: Token valid or not. + :rtype: bool + """ + auth_header = req.get_header("Authorization", required=True) + + scheme, token = auth_header.split() + if scheme.lower() != 'bearer': + raise falcon.HTTPUnauthorized('Invalid authentication scheme.') + + is_token_valid = validate_token(token, ref_token_name) + if not is_token_valid: + raise falcon.HTTPUnauthorized('Invalid token.') + else: + return True + + def device_id_from_request(req: falcon.Request) -> str: """Returns the 'device_id' from the request parameters, or a hash of the access route (which should be the IPs of the proxies and the client).""" @@ -1770,12 +1797,16 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): raise falcon.HTTPBadRequest( description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}" ) - enqueue_job( - import_batch_predictions, - job_type=job_type, - queue=low_queue, - job_kwargs={"timeout": "10m"}, - ) + # We secure the endpoint + if parse_valid_token(req, "batch_job_key"): + enqueue_job( + import_batch_predictions, + job_type=job_type, + queue=low_queue, + job_kwargs={"timeout": "10m"}, + ) + else: + raise falcon.HTTPForbidden(description="Invalid batch_job_key. 
Be sure to indicate the authentification key in the request.") logger.info("Batch import %s has been queued.", job_type) diff --git a/robotoff/app/auth.py b/robotoff/app/auth.py index 8fbf84c0d6..899b62470d 100644 --- a/robotoff/app/auth.py +++ b/robotoff/app/auth.py @@ -1,3 +1,4 @@ +import os from base64 import b64decode from urllib.parse import unquote @@ -6,6 +7,10 @@ class BasicAuthDecodeError(Exception): pass +class APITokenError(Exception): + pass + + def basic_decode(encoded_str: str) -> tuple[str, str]: """Decode an encrypted HTTP basic authentication string. Returns a tuple of the form (username, password), and raises a BasicAuthDecodeError exception @@ -39,3 +44,18 @@ def basic_decode(encoded_str: str) -> tuple[str, str]: raise BasicAuthDecodeError() return unquote(username), unquote(password) + + +def validate_token(token: str, ref_token_name: str) -> bool: + """Validate token. + + :param token: Authentification token + :type token: str + :param api_token_name: Validation token, stored in environment variables. + :type api_token_name: str + :rtype: bool + """ + api_token = os.getenv(ref_token_name.upper()) + if not api_token: + raise APITokenError("API token not set in environment variables.") + return token == api_token From 4aabf4ba0780fc6eed2c7cdaf27ec22fc1a02d1c Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 28 Aug 2024 16:33:29 +0200 Subject: [PATCH 11/22] feat: :art: Add key during request by the batch job --- batch/spellcheck/main.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 0960765fef..3f4972628e 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -1,3 +1,4 @@ +import os import argparse import tempfile import logging @@ -78,7 +79,7 @@ def main(): bucket_name=args.data_bucket, suffix=args.post_data_suffix ) - + LOGGER.info("Request Robotoff API batch import endpoint.") run_robotoff_endpoint_batch_import() @@ -159,14 +160,20 @@ def run_robotoff_endpoint_batch_import(): """ url = "https://robotoff.openfoodfacts.org/api/v1/batch/import" data = {"job_type": "ingredients_spellcheck"} - + headers = { + "Authorization": f"Bearer {os.getenv("BATCH_JOB_KEY")}", + "Content-Type": "application/json" + } try: - response = requests.post(url, data=data) + response = requests.post( + url, + data=data, + headers=headers, + ) + LOGGER.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") except requests.exceptions.RequestException as e: raise SystemExit(e) - - LOGGER.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") - + if __name__ == "__main__": main() From 01d884a02f2bfff2e41d6e955b716212b22b9fab Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Mon, 2 Sep 2024 17:28:31 +0200 Subject: [PATCH 12/22] feat: :sparkles: Implemenation reviews Simplify abstractions - Change data in insights instead of value - Other small changes --- batch/spellcheck/main.py | 18 ++-- poetry.lock | 2 +- pyproject.toml | 12 +-- robotoff/app/api.py | 40 +++++---- robotoff/app/auth.py | 1 - robotoff/batch/__init__.py | 142 +++++++++++++++-------------- robotoff/batch/buckets.py | 84 +++++++---------- robotoff/batch/extraction.py | 112 ++++++++++------------- robotoff/batch/launch.py | 170 +++++++++++++++-------------------- robotoff/batch/types.py | 35 -------- robotoff/cli/main.py | 4 +- robotoff/utils/buckets.py | 39 -------- tests/unit/test_batch.py | 46 +++++----- 13 files changed, 283 insertions(+), 422 deletions(-) 
delete mode 100644 robotoff/batch/types.py delete mode 100644 robotoff/utils/buckets.py diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 3f4972628e..6c73648c7e 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -11,7 +11,7 @@ from google.cloud import storage -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -45,12 +45,12 @@ def main(): We use vLLM to process the batch optimaly. The model is loaded from the Open Food Facts Hugging Face model repository. """ - LOGGER.info("Starting batch processing job.") + logger.info("Starting batch processing job.") args = parse() - LOGGER.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}") + logger.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}") data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix) - LOGGER.info(f"Feature in uploaded data: {data.columns}") + logger.info(f"Feature in uploaded data: {data.columns}") if not all(feature in data.columns for feature in FEATURES_VALIDATION): raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. Current features: {data.columns}") @@ -66,10 +66,10 @@ def main(): max_tokens=args.max_tokens ) - LOGGER.info(f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}") + logger.info(f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}") data["correction"] = batch_inference(instructions, llm=llm, sampling_params=sampling_params) - LOGGER.info(f"Uploading data to GCS: {args.data_bucket}/{args.post_data_suffix}") + logger.info(f"Uploading data to GCS: {args.data_bucket}/{args.post_data_suffix}") # Save DataFrame as Parquet to a temporary file with tempfile.NamedTemporaryFile(delete=True, suffix='.parquet') as temp_file: data.to_parquet(temp_file.name) @@ -80,10 +80,10 @@ def main(): suffix=args.post_data_suffix ) - LOGGER.info("Request Robotoff API batch import endpoint.") + logger.info("Request Robotoff API batch import endpoint.") run_robotoff_endpoint_batch_import() - LOGGER.info("Batch processing job completed.") + logger.info("Batch processing job completed.") def prepare_instruction(text: str) -> str: @@ -170,7 +170,7 @@ def run_robotoff_endpoint_batch_import(): data=data, headers=headers, ) - LOGGER.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") + logger.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") except requests.exceptions.RequestException as e: raise SystemExit(e) diff --git a/poetry.lock b/poetry.lock index c0680af5a8..43da024f22 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4420,4 +4420,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "07551d5c2b36e7b3321ac361d384ef02a74de7686051a450ecd692a0b832929b" +content-hash = "9a02871efced91ed473a7af971e0f46acd4209165cb7597ca5d9b417267992a6" diff --git a/pyproject.toml b/pyproject.toml index 133f61c850..43669c7848 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ jsonschema = "~4.4.0" orjson = ">=3.8.2,<3.10.0" Pillow = ">=9.3,<10.4" numpy = "~1.26.4" -protobuf = "^3.19.0" +protobuf = "~3.20.3" Pint = "0.22" APScheduler = "~3.10.1" more-itertools = "~8.9.0" @@ -78,11 +78,11 @@ openfoodfacts = "1.1.1" imagehash = "~4.3.1" peewee-migrate = "~1.12.2" diskcache = "~5.6.3" -google-cloud-batch = "^0.17.26" 
-duckdb = "1.0.0" -google-cloud-storage = "<2.18.2" -pandas = "^2.2.2" -pyarrow = "^17.0.0" +google-cloud-batch = "~0.17.26" +duckdb = "~1.0.0" +google-cloud-storage = "~2.14.0" +pandas = "~2.2.2" +pyarrow = "~17.0.0" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index ca09dddbc3..e9003929ee 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -26,7 +26,8 @@ from robotoff import settings from robotoff.app import schema from robotoff.app.auth import ( - BasicAuthDecodeError, + BasicAuthDecodeError, + APITokenError, basic_decode, validate_token, ) @@ -92,7 +93,6 @@ from robotoff.workers.tasks import download_product_dataset_job from robotoff.batch import ( BatchJobType, - launch_batch_job, import_batch_predictions, ) @@ -314,14 +314,15 @@ def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: :param ref_token_name: Secret environment variable name. :type ref_token_name: str :return: Token valid or not. - :rtype: bool """ auth_header = req.get_header("Authorization", required=True) - - scheme, token = auth_header.split() - if scheme.lower() != 'bearer': - raise falcon.HTTPUnauthorized('Invalid authentication scheme.') + try: + scheme, token = auth_header.split() + except APITokenError: + raise falcon.HTTPUnauthorized("Invalid authentication scheme.") + if scheme.lower() != 'bearer': + raise falcon.HTTPUnauthorized("Invalid authentication scheme: 'Bearer Token' expected.") is_token_valid = validate_token(token, ref_token_name) if not is_token_valid: raise falcon.HTTPUnauthorized('Invalid token.') @@ -1778,14 +1779,6 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.media = response -class RobotsTxtResource: - def on_get(self, req: falcon.Request, resp: falcon.Response): - # Disallow completely indexation: otherwise web crawlers send millions - # of requests to Robotoff (420k requests/day by Google alone) - resp.body = "User-agent: *\nDisallow: /\n" - resp.content_type = falcon.MEDIA_TEXT - resp.status = falcon.HTTP_200 - class BatchJobImportResource: def on_post(self, req: falcon.Request, resp: falcon.Response): @@ -1803,12 +1796,23 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): import_batch_predictions, job_type=job_type, queue=low_queue, - job_kwargs={"timeout": "10m"}, + job_kwargs={"timeout": "30m"}, ) else: - raise falcon.HTTPForbidden(description="Invalid batch_job_key. Be sure to indicate the authentification key in the request.") + raise falcon.HTTPForbidden( + description="Invalid batch_job_key. Be sure to indicate the authentification key in the request." 
+ ) logger.info("Batch import %s has been queued.", job_type) + +class RobotsTxtResource: + def on_get(self, req: falcon.Request, resp: falcon.Response): + # Disallow completely indexation: otherwise web crawlers send millions + # of requests to Robotoff (420k requests/day by Google alone) + resp.body = "User-agent: *\nDisallow: /\n" + resp.content_type = falcon.MEDIA_TEXT + resp.status = falcon.HTTP_200 + def custom_handle_uncaught_exception( req: falcon.Request, resp: falcon.Response, ex: Exception, params @@ -1876,5 +1880,5 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/users/statistics/{username}", UserStatisticsResource()) api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) -api.add_route("/robots.txt", RobotsTxtResource()) api.add_route("/api/v1/batch/import", BatchJobImportResource()) +api.add_route("/robots.txt", RobotsTxtResource()) diff --git a/robotoff/app/auth.py b/robotoff/app/auth.py index 899b62470d..5eef036497 100644 --- a/robotoff/app/auth.py +++ b/robotoff/app/auth.py @@ -53,7 +53,6 @@ def validate_token(token: str, ref_token_name: str) -> bool: :type token: str :param api_token_name: Validation token, stored in environment variables. :type api_token_name: str - :rtype: bool """ api_token = os.getenv(ref_token_name.upper()) if not api_token: diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 855a4cebe1..27e4b5ae80 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,105 +1,103 @@ +import os import tempfile -from typing import List - -import pandas as pd from robotoff.utils import get_logger from robotoff.types import ( BatchJobType, Prediction, - ServerType + ServerType, ) from robotoff.models import db from robotoff.insights.importer import import_insights +from robotoff import settings +from robotoff.types import PredictionType -from .launch import ( - GoogleBatchJob, - GoogleBatchJobConfig, -) -from .extraction import ( - BatchExtraction, -) -from .buckets import ( - GoogleStorageBucketForBatchJob, -) -from .types import ( - BATCH_JOB_TYPE_TO_FEATURES, - BATCH_JOB_TYPE_TO_PREDICTION_TYPE, -) - - -LOGGER = get_logger(__name__) +from .launch import launch_job, GoogleBatchJobConfig +from .extraction import extract_from_dataset +from .buckets import upload_file_to_gcs, fetch_dataframe_from_gcs -PREDICTOR_VERSION = "1" #TODO: shard HF model version? instead of manual change? -PREDICTOR = "llm" +logger = get_logger(__name__) def launch_batch_job(job_type: BatchJobType) -> None: """Launch a batch job. + Need to be updated if different batch jobs are added. + """ + if job_type is BatchJobType.ingredients_spellcheck: + launch_spellcheck_batch_job() + else: + raise NotImplementedError(f"Batch job type {job_type} not implemented.") + + +def import_batch_predictions(job_type: BatchJobType) -> None: + """Import batch predictions once the job finished. + Need to be updated if different batch jobs are added. + """ + if job_type is BatchJobType.ingredients_spellcheck: + import_spellcheck_batch_predictions() + else: + raise NotImplementedError(f"Batch job type {job_type} not implemented.") + + +def launch_spellcheck_batch_job() -> None: + """Launch spellcheck batch job. 
""" + # Init + JOB_NAME = "ingredients-spellcheck" + QUERY_FILE_PATH = settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql" + BATCH_JOB_CONFIG_PATH = settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" + BUCKET_NAME = "robotoff-spellcheck" + SUFFIX_PREPROCESS = "data/preprocessed_data.parquet" + + # Extract data from dataset with tempfile.TemporaryDirectory() as tmp_dir: - BatchExtraction.extract_from_dataset( - job_type=job_type, - output_dir=tmp_dir, - ) - if not BatchExtraction.extracted_file_path: - raise ValueError("The extracted file was not found.") - LOGGER.debug(f"Batch data succesfully extracted and saved at {BatchExtraction.extracted_file_path}") + file_path = os.path.join(tmp_dir, "batch_data.parquet") + extract_from_dataset(QUERY_FILE_PATH, file_path) # Upload the extracted file to the bucket - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) - LOGGER.debug(f"File uploaded to the bucket {bucket_handler.bucket}/{bucket_handler.suffix_preprocess}") + upload_file_to_gcs(file_path=file_path, bucket_name=BUCKET_NAME, suffix=SUFFIX_PREPROCESS) + logger.debug(f"File uploaded to the bucket {BUCKET_NAME}/{SUFFIX_PREPROCESS}") # Launch batch job - batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) - batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) - LOGGER.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") + batch_job_config = GoogleBatchJobConfig.init(job_name=JOB_NAME, config_path=BATCH_JOB_CONFIG_PATH) + batch_job = launch_job(batch_job_config=batch_job_config) + logger.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") -def import_batch_predictions(job_type: BatchJobType) -> None: - """Import predictions from remote storage. - """ - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - LOGGER.debug(f"Batch data downloaded from bucket {bucket_handler.bucket}/{bucket_handler.suffix_postprocess}") - df = bucket_handler.download_file() - predictions = _generate_predictions_from_batch(df, job_type) - with db: - import_results = import_insights( - predictions=predictions, - server_type=ServerType.off - ) - LOGGER.info(f"Batch import results: {repr(import_results)}.") - - -def _generate_predictions_from_batch( - df: pd.DataFrame, - job_type: BatchJobType -) -> List[Prediction]: - """From a file imported from google storage, generate predictions depending on the job type. - - :param df: Post-processed dataset - :type df: pd.DataFrame - :param job_type: Batch job type. - :type job_type: BatchJobType - :rtype: Iterable[Prediction] - :yield: Predictions. - :rtype: Iterator[Prediction] +def import_spellcheck_batch_predictions() -> None: + """Import spellcheck predictions from remote storage. """ + # Init + BUCKET_NAME = "robotoff-spellcheck" + SUFFIX_POSTPROCESS = "data/postprocessed_data.parquet" + PREDICTION_TYPE = PredictionType.ingredient_spellcheck + PREDICTOR_VERSION = "1" #TODO: shard HF model version instead of manual change? 
+ PREDICTOR = "fine-tuned-mistral-7b" + SERVER_TYPE = ServerType.off + + df = fetch_dataframe_from_gcs(bucket_name=BUCKET_NAME, suffix_postprocess=SUFFIX_POSTPROCESS) + logger.debug(f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}") + + + # Generate predictions predictions = [] - features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type] - prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type] for _, row in df.iterrows(): predictions.append( Prediction( - type=prediction_type, - value=row[features_dict["value"]], - value_tag=row[features_dict["value_tag"]], - barcode=row[features_dict["barcode"]], + type=PREDICTION_TYPE, + data={"original": row["text"], "correction": row["correction"]}, + value_tag=row["lang"], + barcode=row["code"], predictor_version=PREDICTOR_VERSION, predictor=PREDICTOR, + automatic_processing=False, ) ) - return predictions - + # Store predictions and insights + with db: + import_results = import_insights( + predictions=predictions, + server_type=SERVER_TYPE + ) + logger.info("Batch import results: %s", import_results) diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index 278b62bf18..77ae4f4ba0 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -1,63 +1,39 @@ import pandas as pd +from google.cloud import storage -from robotoff.utils.buckets import GoogleStorageBucket -from robotoff.batch import BatchJobType -from robotoff.batch.types import BATCH_JOB_TYPE_TO_BUCKET +def upload_file_to_gcs(file_path: str, bucket_name: str, suffix: str) -> None: + """Upload file to Google Storage Bucket. -class GoogleStorageBucketForBatchJob(GoogleStorageBucket): - """Class to handle the Google Storage bucket for depending on the batch job. - - :param bucket: Bucket name - :type bucket: str - :param suffix_preprocess: Path inside the bucket before batch processing. - :type suffix_preprocess: str - :param suffix_postprocess: Path inside the bucket after batch processing. - :type suffix_postprocess: str + :param file_path: File where the data is stored + :type file_path: str + :param bucket_name: Bucket name in GCP storage + :type bucket_name: str + :param suffix: Path inside the bucket + :type suffix: str """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + blob.upload_from_filename(filename=file_path) - def __init__( - self, - bucket: str, - suffix_preprocess: str, - suffix_postprocess: str, - ) -> None: - self.bucket = bucket - self.suffix_preprocess = suffix_preprocess - self.suffix_postprocess = suffix_postprocess - - @classmethod - def from_job_type(cls, job_type: BatchJobType) -> "GoogleStorageBucketForBatchJob": - """Initialize the class with the bucket and suffix names corresponding to the batch job type. - Used to adapt bucket upload and download during the batch job process. - - :param job_type: Batch job type. - :type job_type: BatchJobType - :return: Instantiated class. - :rtype: GoogleStorageBucketForBatchJob - """ - try: - bucket_dict = BATCH_JOB_TYPE_TO_BUCKET[job_type] - except KeyError: - raise ValueError(f"Batch job type {job_type} not found in the configuration. Expected {BATCH_JOB_TYPE_TO_BUCKET}.") - return cls(**bucket_dict) - def upload_file(self, file_path: str): - """Upload file to the bucket. +def fetch_dataframe_from_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: + """Download parquet file from Google Storage Bucket. - :param file_path: File path to upload. 
- :type file_path: str - """ - self.upload_gcs( - file_path=file_path, - bucket_name=self.bucket, - suffix=self.suffix_preprocess, - ) - def download_file(self) -> pd.DataFrame: - """Download file from bucket - """ - return self.download_gcs( - bucket_name=self.bucket, - suffix=self.suffix_postprocess, - ) + :param bucket_name: Bucket name in GCP storage + :type bucket_name: str + :param suffix: Path inside the bucket. Should lead to a parquet file. + :type suffix: str + :return: Dataframe + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + with blob.open("rb") as f: + try: + df = pd.read_parquet(f) + except Exception as e: + raise ValueError(f"Could not read parquet file from {bucket_name}/{suffix}. Error: {e}") + return df diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index 5ca4d2f0e1..2013e3fa26 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -1,82 +1,64 @@ -import os from pathlib import Path import duckdb from robotoff import settings -from robotoff.batch import BatchJobType -from robotoff.batch.types import BATCH_JOB_TYPE_TO_QUERY_FILE_PATH from robotoff.utils import get_logger -LOGGER = get_logger(__name__) +logger = get_logger(__name__) -class BatchExtraction: - """Handle batch extraction from the dataset. - Extraction varies regarding the batch job. - """ +def extract_from_dataset( + query_file_path: Path, + output_file_path: str, + dataset_path: Path = settings.JSONL_DATASET_PATH, +) -> None: + """Using SQL queries, extract data from the dataset and save it as a parquet file. - file_name: str = "batch.parquet" - extracted_file_path: str = None + :param query_file_path: Path to the SQL file relative to the job. + :type query_file_path: Path + :param output_file_path: Path to save the extracted data. + :type output_file_path: str + :param dataset_path: Compressed jsonl database, defaults to settings.JSONL_DATASET_PATH + :type dataset_path: Path, optional + """ + if not dataset_path.exists(): + raise FileNotFoundError(f"Dataset path {str(dataset_path)} not found.") + query = _load_query(query_file_path=query_file_path, dataset_path=dataset_path) + _extract_and_save_batch_data(query=query, output_file_path=output_file_path) + logger.debug(f"Batch data succesfully extracted and saved at {output_file_path}") - @classmethod - def extract_from_dataset( - cls, - job_type: BatchJobType, - output_dir: str, - dataset_path: str = str(settings.JSONL_DATASET_PATH), - ) -> None: - """Using SQL queries, extract data from the dataset and save it as a parquet file. - :param job_type: Batch job type. - :type job_type: BatchJobType - :param output_dir: Directory to save the extracted data as a parquet file. - :type output_dir: str - :param dataset_path: Path to the jsonl.gz dataset. - :type dataset_path: Path, optional. Default to settings.JSONL_DATASET_PATH. Mainly used for testing. - """ - if not isinstance(dataset_path, str): - raise ValueError(f"The dataset path should be a string. 
Current type {type(dataset_path)}") - - query_file_path = BATCH_JOB_TYPE_TO_QUERY_FILE_PATH[job_type] - query = cls._load_query(query_file_path=query_file_path, dataset_path=dataset_path) - cls._extract_and_save_batch_data(query=query, output_dir=output_dir) - # We save the file path for later usage in the pipeline - cls.extracted_file_path = os.path.join(output_dir, cls.file_name) - @staticmethod - def _load_query(query_file_path: Path, dataset_path: str) -> str: - """Load the SQL query from a corresponding file. +def _load_query(query_file_path: Path, dataset_path: Path) -> str: + """Load the SQL query from a corresponding file. - :param query_file_path: File path containing the SQL query. - :type query_file_path: Path - :param dataset_path: Path to the jsonl.gz dataset. - :type dataset_path: Path - :raises ValueError: In case the Dataset path is not found in the SQL query. - :return: the SQL/DuckDB query. - :rtype: str - """ - query = query_file_path.read_text() - if "DATASET_PATH" not in query: - raise ValueError( - "The SQL query should contain the string 'DATASET_PATH' to replace it with the dataset path." - ) - query = query.replace("DATASET_PATH", dataset_path) - LOGGER.debug(f"Query used to extract batch from dataset: {query}") - return query + :param query_file_path: Path to the SQL file relative to the job. + :type query_file_path: Path + :param dataset_path: Path to the dataset. + :type dataset_path: Path + :return: SQL query. + """ + query = query_file_path.read_text() + if "DATASET_PATH" not in query: + raise ValueError( + "The SQL query should contain the string 'DATASET_PATH' to replace it with the dataset path." + ) + query = query.replace("DATASET_PATH", str(dataset_path)) + logger.debug(f"Query used to extract batch from dataset: {query}") + return query - @classmethod - def _extract_and_save_batch_data(cls, query: str, output_dir: str) -> None: - """Query and save the data. +def _extract_and_save_batch_data(query: str, output_file_path: str) -> None: + """Query and save the data. - :param query: DuckDB/SQL query. - :type query: str - :param output_dir: Extracted data directory - :type output_dir: str - """ - ( - duckdb - .sql(query) - .write_parquet(os.path.join(output_dir, cls.file_name)) - ) + :param query: SQL query. + :type query: str + :param output_file_path: Path to save the extracted data. + :type output_file_path: str + """ + ( + duckdb + .sql(query) + .write_parquet(output_file_path) + ) diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index a5ca69132c..247f7a6c05 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,15 +1,13 @@ -import abc from typing import List, Optional import yaml import datetime import re +from pathlib import Path from google.cloud import batch_v1 from pydantic import BaseModel, Field, ConfigDict from robotoff import settings -from robotoff.types import BatchJobType -from robotoff.batch.types import BATCH_JOB_TYPE_TO_CONFIG_PATH class GoogleBatchJobConfig(BaseModel): @@ -88,7 +86,7 @@ class GoogleBatchJobConfig(BaseModel): ) @classmethod - def init(cls, job_type: BatchJobType): + def init(cls, job_name: str, config_path: Path) -> "GoogleBatchJobConfig": """Initialize the class with the configuration file corresponding to the job type. :param job_type: Batch job type. 
@@ -96,110 +94,88 @@ def init(cls, job_type: BatchJobType): """ # Batch job name should respect a specific pattern, or returns an error pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$" - if not re.match(pattern, job_type.value): - raise ValueError(f"Job name should respect the pattern: {pattern}. Current job name: {job_type.value}") + if not re.match(pattern, job_name): + raise ValueError(f"Job name should respect the pattern: {pattern}. Current job name: {job_name}") # Generate unique id for the job unique_job_name = ( - job_type.value + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") ) # Load config file from job_type - config_path = BATCH_JOB_TYPE_TO_CONFIG_PATH[job_type] with open(config_path, "r") as f: config = yaml.safe_load(f) return cls(job_name=unique_job_name, **config) -class BatchJob(abc.ABC): - """Abstract class to launch and manage batch jobs: Google, AWS, Azure, Triton...""" +def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: + """This method creates a Batch Job on GCP. - @staticmethod - @abc.abstractmethod - def launch_job() -> str: - """Launch batch job.""" - pass + Sources: + * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create + * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types + + :param google_batch_launch_config: Config to run a job on Google Batch. + :type google_batch_launch_config: GoogleBatchLaunchConfig + :param batch_job_config: Config to run a specific job on Google Batch. + :type batch_job_config: BatchJobConfig + :return: Batch job information. - -class GoogleBatchJob(BatchJob): - """GCP Batch class. It uses the Google Cloud Batch API to launch and manage jobs. - - More information on: - https://cloud.google.com/batch/docs/get-started + Returns: + Batch job information. """ - @staticmethod - def launch_job( - batch_job_config: GoogleBatchJobConfig, - ) -> batch_v1.Job: - """This method creates a Batch Job on GCP. - - Sources: - * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create - * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types - - :param google_batch_launch_config: Config to run a job on Google Batch. - :type google_batch_launch_config: GoogleBatchLaunchConfig - :param batch_job_config: Config to run a specific job on Google Batch. - :type batch_job_config: BatchJobConfig - :return: Batch job information. - :rtype: batch_v1.Job - - Returns: - Batch job information. - """ - - client = batch_v1.BatchServiceClient() - - # Define what will be done as part of the job. - runnable = batch_v1.Runnable() - runnable.container = batch_v1.Runnable.Container() - runnable.container.image_uri = batch_job_config.container_image_uri - runnable.container.entrypoint = batch_job_config.entrypoint - runnable.container.commands = batch_job_config.commands - - # Jobs can be divided into tasks. In this case, we have only one task. - task = batch_v1.TaskSpec() - task.runnables = [runnable] - - # We can specify what resources are requested by each task. 
- resources = batch_v1.ComputeResource() - resources.cpu_milli = batch_job_config.cpu_milli - resources.memory_mib = batch_job_config.memory_mib - resources.boot_disk_mib = batch_job_config.boot_disk_mib - task.compute_resource = resources - - task.max_retry_count = batch_job_config.max_retry_count - task.max_run_duration = batch_job_config.max_run_duration - - # Tasks are grouped inside a job using TaskGroups. - group = batch_v1.TaskGroup() - group.task_count = batch_job_config.task_count - group.task_spec = task - - # Policies are used to define on what kind of virtual machines the tasks will run on. - policy = batch_v1.AllocationPolicy.InstancePolicy() - policy.machine_type = batch_job_config.machine_type - instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate() - instances.install_gpu_drivers = batch_job_config.install_gpu_drivers - instances.policy = policy - allocation_policy = batch_v1.AllocationPolicy() - allocation_policy.instances = [instances] - - accelerator = batch_v1.AllocationPolicy.Accelerator() - accelerator.type_ = batch_job_config.accelerators_type - accelerator.count = batch_job_config.accelerators_count - - job = batch_v1.Job() - job.task_groups = [group] - job.allocation_policy = allocation_policy - # We use Cloud Logging as it's an out of the box available option - job.logs_policy = batch_v1.LogsPolicy() - job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING - - create_request = batch_v1.CreateJobRequest() - create_request.job = job - create_request.job_id = batch_job_config.job_name - # The job's parent is the region in which the job will run - create_request.parent = f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" - - return client.create_job(create_request) + client = batch_v1.BatchServiceClient() + + # Define what will be done as part of the job. + runnable = batch_v1.Runnable() + runnable.container = batch_v1.Runnable.Container() + runnable.container.image_uri = batch_job_config.container_image_uri + runnable.container.entrypoint = batch_job_config.entrypoint + runnable.container.commands = batch_job_config.commands + + # Jobs can be divided into tasks. In this case, we have only one task. + task = batch_v1.TaskSpec() + task.runnables = [runnable] + + # We can specify what resources are requested by each task. + resources = batch_v1.ComputeResource() + resources.cpu_milli = batch_job_config.cpu_milli + resources.memory_mib = batch_job_config.memory_mib + resources.boot_disk_mib = batch_job_config.boot_disk_mib + task.compute_resource = resources + + task.max_retry_count = batch_job_config.max_retry_count + task.max_run_duration = batch_job_config.max_run_duration + + # Tasks are grouped inside a job using TaskGroups. + group = batch_v1.TaskGroup() + group.task_count = batch_job_config.task_count + group.task_spec = task + + # Policies are used to define on what kind of virtual machines the tasks will run on. 
+ policy = batch_v1.AllocationPolicy.InstancePolicy() + policy.machine_type = batch_job_config.machine_type + instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate() + instances.install_gpu_drivers = batch_job_config.install_gpu_drivers + instances.policy = policy + allocation_policy = batch_v1.AllocationPolicy() + allocation_policy.instances = [instances] + + accelerator = batch_v1.AllocationPolicy.Accelerator() + accelerator.type_ = batch_job_config.accelerators_type + accelerator.count = batch_job_config.accelerators_count + + job = batch_v1.Job() + job.task_groups = [group] + job.allocation_policy = allocation_policy + # We use Cloud Logging as it's an out of the box available option + job.logs_policy = batch_v1.LogsPolicy() + job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING + + create_request = batch_v1.CreateJobRequest() + create_request.job = job + create_request.job_id = batch_job_config.job_name + # The job's parent is the region in which the job will run + create_request.parent = f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" + + return client.create_job(create_request) diff --git a/robotoff/batch/types.py b/robotoff/batch/types.py deleted file mode 100644 index c0c452cefd..0000000000 --- a/robotoff/batch/types.py +++ /dev/null @@ -1,35 +0,0 @@ -from robotoff.types import BatchJobType, PredictionType -from robotoff import settings - - -# Bucket structure to enable the batch job to load and upload data -BATCH_JOB_TYPE_TO_BUCKET = { - BatchJobType.ingredients_spellcheck: { - "bucket": "robotoff-spellcheck", - "suffix_preprocess": "data/preprocessed_data.parquet", - "suffix_postprocess": "data/postprocessed_data.parquet", - }, -} - -# Paths batch job config files -BATCH_JOB_TYPE_TO_CONFIG_PATH = { - BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml", -} - -BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = { - BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql", -} - -# Mapping between batch job type and prediction type -BATCH_JOB_TYPE_TO_PREDICTION_TYPE = { - BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, -} - -# Column names in the processed batch of data -BATCH_JOB_TYPE_TO_FEATURES = { - BatchJobType.ingredients_spellcheck: { - "barcode": "code", - "value": "correction", - "value_tag": "lang", - }, -} diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index 481b5bb57c..47fe2a0c24 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -1003,7 +1003,7 @@ def launch_batch_job( job_type: str = typer.Argument(..., help="Type of job to launch. Ex: 'ingredients_spellcheck'"), ) -> None: """Launch a batch job.""" - from robotoff.batch import launch_batch_job + from robotoff.batch import launch_batch_job as _launch_batch_job from robotoff.utils import get_logger from robotoff.types import BatchJobType @@ -1012,7 +1012,7 @@ def launch_batch_job( get_logger() job_type = BatchJobType[job_type] - launch_batch_job(job_type) + _launch_batch_job(job_type) def main() -> None: diff --git a/robotoff/utils/buckets.py b/robotoff/utils/buckets.py deleted file mode 100644 index cc92cadfda..0000000000 --- a/robotoff/utils/buckets.py +++ /dev/null @@ -1,39 +0,0 @@ -import pandas as pd -from google.cloud import storage - - -class GoogleStorageBucket: - - @staticmethod - def download_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: - """Download parquet file from Google Storage Bucket. 
- - :param bucket_name: Bucket name - :type bucket_name: str - :param suffix: Path inside the bucket - :type suffix: str - :return: - :rtype: Any - """ - client = storage.Client() - bucket = client.get_bucket(bucket_name) - blob = bucket.blob(suffix) - with blob.open("rb") as f: - return pd.read_parquet(f) - - - @staticmethod - def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: - """Upload file to Google Storage Bucket. - - :param file_path: File path. - :type file_path: str - :param bucket_name: Bucket name. - :type bucket_name: str - :param suffix: Path inside the bucket. - :type suffix: str - """ - client = storage.Client() - bucket = client.get_bucket(bucket_name) - blob = bucket.blob(suffix) - blob.upload_from_filename(filename=file_path) diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index 25f8c94e66..d42297e51f 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -1,43 +1,43 @@ +import os import pytest import tempfile from pathlib import Path -from robotoff.batch import ( - GoogleBatchJobConfig, - BatchJobType, - BatchExtraction, -) +from robotoff.batch import GoogleBatchJobConfig +from robotoff.batch.extraction import extract_from_dataset +from robotoff import settings DIR = Path(__file__).parent -JOB_TYPES = [ - "ingredients_spellcheck", -] +SPELLCHECK_QUERY_FILE_PATH = settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql" +SPELLCHECK_BATCH_JOB_CONFIG_PATH = settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" -# Add future job types here for testing. @pytest.mark.parametrize( - "job_type_str", - JOB_TYPES, + "inputs", + [ + ("ingredients-spellcheck", SPELLCHECK_BATCH_JOB_CONFIG_PATH), + ], ) -def test_batch_job_config_file(job_type_str): +def test_batch_job_config_file(inputs): "Test indirectly the batch job config file by validating with the Pydantic class model." - job_type = BatchJobType[job_type_str] - GoogleBatchJobConfig.init(job_type) + job_name, config_path = inputs + GoogleBatchJobConfig.init(job_name, config_path) -# Add future job types here for testing. @pytest.mark.parametrize( - "job_type_str", - JOB_TYPES, + "query_file_path", + [ + SPELLCHECK_QUERY_FILE_PATH, + ] ) -def test_batch_extraction(job_type_str): +def test_batch_extraction(query_file_path): """Test extraction of a batch of data from the dataset depending on the job type. 
""" - job_type_str = BatchJobType[job_type_str] with tempfile.TemporaryDirectory() as tmp_dir: - BatchExtraction.extract_from_dataset( - job_type=job_type_str, - output_dir=tmp_dir, - dataset_path=str(DIR / "data/dataset_sample.jsonl.gz"), + file_path = os.path.join(tmp_dir, "data.parquet") + extract_from_dataset( + output_file_path=file_path, + query_file_path=SPELLCHECK_QUERY_FILE_PATH, + dataset_path=DIR / "data/dataset_sample.jsonl.gz", ) From fda7b5d50ba95e05ac3034dd712df7b710e75060 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Mon, 2 Sep 2024 17:33:42 +0200 Subject: [PATCH 13/22] style: :sparkles: make lint --- robotoff/app/api.py | 24 ++++++++-------- robotoff/batch/__init__.py | 52 +++++++++++++++++------------------ robotoff/batch/buckets.py | 6 ++-- robotoff/batch/extraction.py | 9 ++---- robotoff/batch/launch.py | 19 ++++++++----- robotoff/cli/main.py | 12 +++++--- robotoff/insights/importer.py | 9 +++--- robotoff/settings.py | 2 +- robotoff/types.py | 7 +++-- tests/unit/test_batch.py | 15 +++++----- 10 files changed, 80 insertions(+), 75 deletions(-) diff --git a/robotoff/app/api.py b/robotoff/app/api.py index e9003929ee..dd71a32f6e 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -26,9 +26,9 @@ from robotoff import settings from robotoff.app import schema from robotoff.app.auth import ( + APITokenError, BasicAuthDecodeError, - APITokenError, - basic_decode, + basic_decode, validate_token, ) from robotoff.app.core import ( @@ -45,6 +45,7 @@ validate_params, ) from robotoff.app.middleware import DBConnectionMiddleware +from robotoff.batch import BatchJobType, import_batch_predictions from robotoff.elasticsearch import get_es_client from robotoff.insights.extraction import ( DEFAULT_OCR_PREDICTION_TYPES, @@ -91,10 +92,6 @@ from robotoff.utils.text import get_tag from robotoff.workers.queues import enqueue_job, get_high_queue, low_queue from robotoff.workers.tasks import download_product_dataset_job -from robotoff.batch import ( - BatchJobType, - import_batch_predictions, -) logger = get_logger() @@ -311,7 +308,7 @@ def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: :param req: Request. :type req: falcon.Request - :param ref_token_name: Secret environment variable name. + :param ref_token_name: Secret environment variable name. :type ref_token_name: str :return: Token valid or not. """ @@ -321,11 +318,13 @@ def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: scheme, token = auth_header.split() except APITokenError: raise falcon.HTTPUnauthorized("Invalid authentication scheme.") - if scheme.lower() != 'bearer': - raise falcon.HTTPUnauthorized("Invalid authentication scheme: 'Bearer Token' expected.") + if scheme.lower() != "bearer": + raise falcon.HTTPUnauthorized( + "Invalid authentication scheme: 'Bearer Token' expected." + ) is_token_valid = validate_token(token, ref_token_name) if not is_token_valid: - raise falcon.HTTPUnauthorized('Invalid token.') + raise falcon.HTTPUnauthorized("Invalid token.") else: return True @@ -1779,14 +1778,13 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.media = response - class BatchJobImportResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) try: job_type = BatchJobType[job_type_str] - except KeyError: + except KeyError: raise falcon.HTTPBadRequest( description=f"invalid job_type: {job_type_str}. 
Valid job_types are: {[elt.value for elt in BatchJobType]}" ) @@ -1804,7 +1802,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): ) logger.info("Batch import %s has been queued.", job_type) - + class RobotsTxtResource: def on_get(self, req: falcon.Request, resp: falcon.Response): # Disallow completely indexation: otherwise web crawlers send millions diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 27e4b5ae80..c78b839315 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,21 +1,15 @@ import os import tempfile -from robotoff.utils import get_logger -from robotoff.types import ( - BatchJobType, - Prediction, - ServerType, -) -from robotoff.models import db -from robotoff.insights.importer import import_insights from robotoff import settings -from robotoff.types import PredictionType +from robotoff.insights.importer import import_insights +from robotoff.models import db +from robotoff.types import BatchJobType, Prediction, PredictionType, ServerType +from robotoff.utils import get_logger -from .launch import launch_job, GoogleBatchJobConfig +from .buckets import fetch_dataframe_from_gcs, upload_file_to_gcs from .extraction import extract_from_dataset -from .buckets import upload_file_to_gcs, fetch_dataframe_from_gcs - +from .launch import GoogleBatchJobConfig, launch_job logger = get_logger(__name__) @@ -28,7 +22,7 @@ def launch_batch_job(job_type: BatchJobType) -> None: launch_spellcheck_batch_job() else: raise NotImplementedError(f"Batch job type {job_type} not implemented.") - + def import_batch_predictions(job_type: BatchJobType) -> None: """Import batch predictions once the job finished. @@ -41,12 +35,13 @@ def import_batch_predictions(job_type: BatchJobType) -> None: def launch_spellcheck_batch_job() -> None: - """Launch spellcheck batch job. - """ + """Launch spellcheck batch job.""" # Init JOB_NAME = "ingredients-spellcheck" QUERY_FILE_PATH = settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql" - BATCH_JOB_CONFIG_PATH = settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" + BATCH_JOB_CONFIG_PATH = ( + settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" + ) BUCKET_NAME = "robotoff-spellcheck" SUFFIX_PREPROCESS = "data/preprocessed_data.parquet" @@ -56,29 +51,35 @@ def launch_spellcheck_batch_job() -> None: extract_from_dataset(QUERY_FILE_PATH, file_path) # Upload the extracted file to the bucket - upload_file_to_gcs(file_path=file_path, bucket_name=BUCKET_NAME, suffix=SUFFIX_PREPROCESS) + upload_file_to_gcs( + file_path=file_path, bucket_name=BUCKET_NAME, suffix=SUFFIX_PREPROCESS + ) logger.debug(f"File uploaded to the bucket {BUCKET_NAME}/{SUFFIX_PREPROCESS}") # Launch batch job - batch_job_config = GoogleBatchJobConfig.init(job_name=JOB_NAME, config_path=BATCH_JOB_CONFIG_PATH) + batch_job_config = GoogleBatchJobConfig.init( + job_name=JOB_NAME, config_path=BATCH_JOB_CONFIG_PATH + ) batch_job = launch_job(batch_job_config=batch_job_config) logger.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") def import_spellcheck_batch_predictions() -> None: - """Import spellcheck predictions from remote storage. - """ + """Import spellcheck predictions from remote storage.""" # Init BUCKET_NAME = "robotoff-spellcheck" SUFFIX_POSTPROCESS = "data/postprocessed_data.parquet" PREDICTION_TYPE = PredictionType.ingredient_spellcheck - PREDICTOR_VERSION = "1" #TODO: shard HF model version instead of manual change? 
+ PREDICTOR_VERSION = "1" # TODO: shard HF model version instead of manual change? PREDICTOR = "fine-tuned-mistral-7b" SERVER_TYPE = ServerType.off - - df = fetch_dataframe_from_gcs(bucket_name=BUCKET_NAME, suffix_postprocess=SUFFIX_POSTPROCESS) - logger.debug(f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}") + df = fetch_dataframe_from_gcs( + bucket_name=BUCKET_NAME, suffix_postprocess=SUFFIX_POSTPROCESS + ) + logger.debug( + f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}" + ) # Generate predictions predictions = [] @@ -97,7 +98,6 @@ def import_spellcheck_batch_predictions() -> None: # Store predictions and insights with db: import_results = import_insights( - predictions=predictions, - server_type=SERVER_TYPE + predictions=predictions, server_type=SERVER_TYPE ) logger.info("Batch import results: %s", import_results) diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index 77ae4f4ba0..8a90d657c8 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -32,8 +32,10 @@ def fetch_dataframe_from_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: bucket = client.get_bucket(bucket_name) blob = bucket.blob(suffix) with blob.open("rb") as f: - try: + try: df = pd.read_parquet(f) except Exception as e: - raise ValueError(f"Could not read parquet file from {bucket_name}/{suffix}. Error: {e}") + raise ValueError( + f"Could not read parquet file from {bucket_name}/{suffix}. Error: {e}" + ) return df diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index 2013e3fa26..3f3637e9a8 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -5,7 +5,6 @@ from robotoff import settings from robotoff.utils import get_logger - logger = get_logger(__name__) @@ -30,7 +29,6 @@ def extract_from_dataset( logger.debug(f"Batch data succesfully extracted and saved at {output_file_path}") - def _load_query(query_file_path: Path, dataset_path: Path) -> str: """Load the SQL query from a corresponding file. @@ -49,6 +47,7 @@ def _load_query(query_file_path: Path, dataset_path: Path) -> str: logger.debug(f"Query used to extract batch from dataset: {query}") return query + def _extract_and_save_batch_data(query: str, output_file_path: str) -> None: """Query and save the data. @@ -57,8 +56,4 @@ def _extract_and_save_batch_data(query: str, output_file_path: str) -> None: :param output_file_path: Path to save the extracted data. :type output_file_path: str """ - ( - duckdb - .sql(query) - .write_parquet(output_file_path) - ) + (duckdb.sql(query).write_parquet(output_file_path)) diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index 247f7a6c05..22428991c6 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,17 +1,18 @@ -from typing import List, Optional -import yaml import datetime import re from pathlib import Path +from typing import List, Optional +import yaml from google.cloud import batch_v1 -from pydantic import BaseModel, Field, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from robotoff import settings class GoogleBatchJobConfig(BaseModel): """Batch job configuration class.""" + # By default, extra fields are just ignored. We raise an error in case of extra fields. 
model_config: ConfigDict = {"extra": "forbid"} @@ -95,8 +96,10 @@ def init(cls, job_name: str, config_path: Path) -> "GoogleBatchJobConfig": # Batch job name should respect a specific pattern, or returns an error pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$" if not re.match(pattern, job_name): - raise ValueError(f"Job name should respect the pattern: {pattern}. Current job name: {job_name}") - + raise ValueError( + f"Job name should respect the pattern: {pattern}. Current job name: {job_name}" + ) + # Generate unique id for the job unique_job_name = ( job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") @@ -113,7 +116,7 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: Sources: * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types - + :param google_batch_launch_config: Config to run a job on Google Batch. :type google_batch_launch_config: GoogleBatchLaunchConfig :param batch_job_config: Config to run a specific job on Google Batch. @@ -176,6 +179,8 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: create_request.job = job create_request.job_id = batch_job_config.job_name # The job's parent is the region in which the job will run - create_request.parent = f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" + create_request.parent = ( + f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" + ) return client.create_job(create_request) diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index 47fe2a0c24..ef742bb9dc 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -1000,16 +1000,20 @@ def create_migration( @app.command() def launch_batch_job( - job_type: str = typer.Argument(..., help="Type of job to launch. Ex: 'ingredients_spellcheck'"), + job_type: str = typer.Argument( + ..., help="Type of job to launch. Ex: 'ingredients_spellcheck'" + ), ) -> None: """Launch a batch job.""" from robotoff.batch import launch_batch_job as _launch_batch_job - from robotoff.utils import get_logger from robotoff.types import BatchJobType + from robotoff.utils import get_logger if job_type not in BatchJobType.__members__: - raise ValueError(f"Invalid job type: {job_type}. Must be one of those: {[job.name for job in BatchJobType]}") - + raise ValueError( + f"Invalid job type: {job_type}. 
Must be one of those: {[job.name for job in BatchJobType]}" + ) + get_logger() job_type = BatchJobType[job_type] _launch_batch_job(job_type) diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index 029f9aefcd..5c7b142524 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1480,7 +1480,7 @@ class IngredientSpellcheckImporter(InsightImporter): @staticmethod def get_type() -> InsightType: return InsightType.ingredient_spellcheck - + @classmethod def get_required_prediction_types(cls) -> set[PredictionType]: return {PredictionType.ingredient_spellcheck} @@ -1495,15 +1495,14 @@ def generate_candidates( # Only one prediction for candidate in predictions: yield ProductInsight(**candidate.to_dict()) - + @classmethod def is_conflicting_insight( - cls, - candidate: ProductInsight, - reference: ProductInsight + cls, candidate: ProductInsight, reference: ProductInsight ) -> bool: candidate.value_tag == reference.value_tag + class PackagingElementTaxonomyException(Exception): pass diff --git a/robotoff/settings.py b/robotoff/settings.py index 4db6f20126..be5669f406 100644 --- a/robotoff/settings.py +++ b/robotoff/settings.py @@ -359,4 +359,4 @@ def get_package_version() -> str: CROP_ALLOWED_DOMAINS = os.environ.get("CROP_ALLOWED_DOMAINS", "").split(",") # Batch jobs -GOOGLE_PROJECT_NAME= "robotoff" \ No newline at end of file +GOOGLE_PROJECT_NAME = "robotoff" diff --git a/robotoff/types.py b/robotoff/types.py index 52704e0ec5..9f15b1a1fd 100644 --- a/robotoff/types.py +++ b/robotoff/types.py @@ -359,8 +359,9 @@ class PackagingElementProperty(enum.Enum): InsightAnnotation = Literal[-1, 0, 1, 2] + @enum.unique class BatchJobType(enum.Enum): - """Each job type correspond to a task that will be executed in the batch job. - """ - ingredients_spellcheck = "ingredients-spellcheck" \ No newline at end of file + """Each job type correspond to a task that will be executed in the batch job.""" + + ingredients_spellcheck = "ingredients-spellcheck" diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index d42297e51f..0698835c6c 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -1,16 +1,18 @@ import os -import pytest import tempfile from pathlib import Path +import pytest + +from robotoff import settings from robotoff.batch import GoogleBatchJobConfig from robotoff.batch.extraction import extract_from_dataset -from robotoff import settings - DIR = Path(__file__).parent SPELLCHECK_QUERY_FILE_PATH = settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql" -SPELLCHECK_BATCH_JOB_CONFIG_PATH = settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" +SPELLCHECK_BATCH_JOB_CONFIG_PATH = ( + settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" +) @pytest.mark.parametrize( @@ -29,11 +31,10 @@ def test_batch_job_config_file(inputs): "query_file_path", [ SPELLCHECK_QUERY_FILE_PATH, - ] + ], ) def test_batch_extraction(query_file_path): - """Test extraction of a batch of data from the dataset depending on the job type. 
- """ + """Test extraction of a batch of data from the dataset depending on the job type.""" with tempfile.TemporaryDirectory() as tmp_dir: file_path = os.path.join(tmp_dir, "data.parquet") extract_from_dataset( From f8ed76aa222b6d3e7d57e05b20e1764b22ea0ebd Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 3 Sep 2024 10:16:05 +0200 Subject: [PATCH 14/22] fix: :bug: Fixed bug & Better error handling with Falcon --- robotoff/app/api.py | 35 +++++++++++++++++------------------ robotoff/app/auth.py | 6 +----- robotoff/batch/__init__.py | 2 +- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/robotoff/app/api.py b/robotoff/app/api.py index dd71a32f6e..ab27b0483e 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -26,7 +26,6 @@ from robotoff import settings from robotoff.app import schema from robotoff.app.auth import ( - APITokenError, BasicAuthDecodeError, basic_decode, validate_token, @@ -45,7 +44,7 @@ validate_params, ) from robotoff.app.middleware import DBConnectionMiddleware -from robotoff.batch import BatchJobType, import_batch_predictions +from robotoff.batch import import_batch_predictions from robotoff.elasticsearch import get_es_client from robotoff.insights.extraction import ( DEFAULT_OCR_PREDICTION_TYPES, @@ -86,6 +85,7 @@ PredictionType, ProductIdentifier, ServerType, + BatchJobType, ) from robotoff.utils import get_image_from_url, get_logger, http_session from robotoff.utils.i18n import TranslationStore @@ -315,18 +315,18 @@ def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: auth_header = req.get_header("Authorization", required=True) try: - scheme, token = auth_header.split() - except APITokenError: + scheme, token = auth_header.strip().split() + if scheme.lower() != "bearer": + raise falcon.HTTPUnauthorized( + "Invalid authentication scheme: 'Bearer Token' expected." + ) + is_token_valid = validate_token(token, ref_token_name) + if not is_token_valid: + raise falcon.HTTPUnauthorized("Invalid token.") + else: + return True + except ValueError: raise falcon.HTTPUnauthorized("Invalid authentication scheme.") - if scheme.lower() != "bearer": - raise falcon.HTTPUnauthorized( - "Invalid authentication scheme: 'Bearer Token' expected." - ) - is_token_valid = validate_token(token, ref_token_name) - if not is_token_valid: - raise falcon.HTTPUnauthorized("Invalid token.") - else: - return True def device_id_from_request(req: falcon.Request) -> str: @@ -1788,7 +1788,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): raise falcon.HTTPBadRequest( description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}" ) - # We secure the endpoint + # We secure the endpoint. if parse_valid_token(req, "batch_job_key"): enqueue_job( import_batch_predictions, @@ -1796,12 +1796,11 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): queue=low_queue, job_kwargs={"timeout": "30m"}, ) - else: - raise falcon.HTTPForbidden( - description="Invalid batch_job_key. Be sure to indicate the authentification key in the request." - ) logger.info("Batch import %s has been queued.", job_type) + resp.media = {"status": "Request successful. 
Importing processed data."} + resp.status = falcon.HTTP_200 + class RobotsTxtResource: def on_get(self, req: falcon.Request, resp: falcon.Response): diff --git a/robotoff/app/auth.py b/robotoff/app/auth.py index 5eef036497..c95e2365b1 100644 --- a/robotoff/app/auth.py +++ b/robotoff/app/auth.py @@ -7,10 +7,6 @@ class BasicAuthDecodeError(Exception): pass -class APITokenError(Exception): - pass - - def basic_decode(encoded_str: str) -> tuple[str, str]: """Decode an encrypted HTTP basic authentication string. Returns a tuple of the form (username, password), and raises a BasicAuthDecodeError exception @@ -56,5 +52,5 @@ def validate_token(token: str, ref_token_name: str) -> bool: """ api_token = os.getenv(ref_token_name.upper()) if not api_token: - raise APITokenError("API token not set in environment variables.") + raise ValueError("API token not set in environment variables.") return token == api_token diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index c78b839315..2e6d130d58 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -75,7 +75,7 @@ def import_spellcheck_batch_predictions() -> None: SERVER_TYPE = ServerType.off df = fetch_dataframe_from_gcs( - bucket_name=BUCKET_NAME, suffix_postprocess=SUFFIX_POSTPROCESS + bucket_name=BUCKET_NAME, suffix=SUFFIX_POSTPROCESS ) logger.debug( f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}" From 85b7bfb672cef349be306af255db021543393a4f Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 3 Sep 2024 15:55:12 +0200 Subject: [PATCH 15/22] feat: :ambulance: Changes Enhance batch extraction with popularity_key - Add env variables to batch job - Add make deploy Spellcheck job to Artifact registry --- Makefile | 21 ++++++++++- batch/spellcheck/main.py | 2 +- robotoff/app/api.py | 8 ++--- robotoff/batch/__init__.py | 13 ++++--- .../batch/configs/job_configs/spellcheck.yaml | 2 +- robotoff/batch/configs/sql/spellcheck.sql | 13 +++---- robotoff/batch/launch.py | 36 ++++++++++++++++--- tests/unit/test_batch.py | 24 +++++++------ 8 files changed, 84 insertions(+), 35 deletions(-) diff --git a/Makefile b/Makefile index 4c67f91389..9b16983839 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,12 @@ DOCKER_COMPOSE=docker compose --env-file=${ENV_FILE} DOCKER_COMPOSE_TEST=COMPOSE_PROJECT_NAME=robotoff_test COMMON_NET_NAME=po_test docker compose --env-file=${ENV_FILE} ML_OBJECT_DETECTION_MODELS := tf-universal-logo-detector tf-nutrition-table tf-nutriscore +# Spellcheck +IMAGE_NAME = spellcheck-batch-vllm +TAG = latest +GCLOUD_LOCATION = europe-west9-docker.pkg.dev +REGISTRY = ${GCLOUD_LOCATION}/robotoff/gcf-artifacts + .DEFAULT_GOAL := dev # avoid target corresponding to file names, to depends on them .PHONY: * @@ -290,4 +296,17 @@ create-migration: guard-args # create network if not exists create-po-default-network: - docker network create po_default || true + docker network create po_default || true + +# Spellcheck +build-spellcheck: + docker build -f batch/spellcheck/Dockerfile -t $(IMAGE_NAME):$(TAG) batch/spellcheck + +# Push the image to the registry +push-spellcheck: + docker tag $(IMAGE_NAME):$(TAG) $(REGISTRY)/$(IMAGE_NAME):$(TAG) + docker push $(REGISTRY)/$(IMAGE_NAME):$(TAG) + +# Build and push in one command +deploy-spellcheck: + build-spellcheck push-spellcheck diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 6c73648c7e..34c7b98c1d 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -161,7 +161,7 @@ def run_robotoff_endpoint_batch_import(): 
url = "https://robotoff.openfoodfacts.org/api/v1/batch/import" data = {"job_type": "ingredients_spellcheck"} headers = { - "Authorization": f"Bearer {os.getenv("BATCH_JOB_KEY")}", + "Authorization": f"Bearer {os.getenv('BATCH_JOB_KEY')}", "Content-Type": "application/json" } try: diff --git a/robotoff/app/api.py b/robotoff/app/api.py index ab27b0483e..8066bf6034 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -25,11 +25,7 @@ from robotoff import settings from robotoff.app import schema -from robotoff.app.auth import ( - BasicAuthDecodeError, - basic_decode, - validate_token, -) +from robotoff.app.auth import BasicAuthDecodeError, basic_decode, validate_token from robotoff.app.core import ( SkipVotedOn, SkipVotedType, @@ -79,13 +75,13 @@ from robotoff.products import get_image_id, get_product, get_product_dataset_etag from robotoff.taxonomy import is_prefixed_value, match_taxonomized_value from robotoff.types import ( + BatchJobType, InsightType, JSONType, NeuralCategoryClassifierModel, PredictionType, ProductIdentifier, ServerType, - BatchJobType, ) from robotoff.utils import get_image_from_url, get_logger, http_session from robotoff.utils.i18n import TranslationStore diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 2e6d130d58..4a05303193 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -44,7 +44,9 @@ def launch_spellcheck_batch_job() -> None: ) BUCKET_NAME = "robotoff-spellcheck" SUFFIX_PREPROCESS = "data/preprocessed_data.parquet" + ENV_NAMES = ["BATCH_JOB_KEY"] + logger.info("Extract batch from dataset.") # Extract data from dataset with tempfile.TemporaryDirectory() as tmp_dir: file_path = os.path.join(tmp_dir, "batch_data.parquet") @@ -58,10 +60,13 @@ def launch_spellcheck_batch_job() -> None: # Launch batch job batch_job_config = GoogleBatchJobConfig.init( - job_name=JOB_NAME, config_path=BATCH_JOB_CONFIG_PATH + job_name=JOB_NAME, + config_path=BATCH_JOB_CONFIG_PATH, + env_names=ENV_NAMES, ) + logger.info("Batch job config: %s", batch_job_config) batch_job = launch_job(batch_job_config=batch_job_config) - logger.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") + logger.info("Batch job succesfully launched. 
Batch job %s", batch_job) def import_spellcheck_batch_predictions() -> None: @@ -74,9 +79,7 @@ def import_spellcheck_batch_predictions() -> None: PREDICTOR = "fine-tuned-mistral-7b" SERVER_TYPE = ServerType.off - df = fetch_dataframe_from_gcs( - bucket_name=BUCKET_NAME, suffix=SUFFIX_POSTPROCESS - ) + df = fetch_dataframe_from_gcs(bucket_name=BUCKET_NAME, suffix=SUFFIX_POSTPROCESS) logger.debug( f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}" ) diff --git a/robotoff/batch/configs/job_configs/spellcheck.yaml b/robotoff/batch/configs/job_configs/spellcheck.yaml index 18562f6f09..0fc435fdaf 100644 --- a/robotoff/batch/configs/job_configs/spellcheck.yaml +++ b/robotoff/batch/configs/job_configs/spellcheck.yaml @@ -3,7 +3,7 @@ cpu_milli: 1000 memory_mib: 32000 boot_disk_mib: 100000 max_retry_count: 1 -max_run_duration: "3600s" +max_run_duration: "54000s" # 15 hours task_count: "1" parallelism: "1" machine_type: "g2-standard-8" diff --git a/robotoff/batch/configs/sql/spellcheck.sql b/robotoff/batch/configs/sql/spellcheck.sql index 0cfebcb09a..f3ca49970c 100644 --- a/robotoff/batch/configs/sql/spellcheck.sql +++ b/robotoff/batch/configs/sql/spellcheck.sql @@ -1,12 +1,13 @@ -SELECT -code, -ingredients_text AS text, -product_name, +SELECT +code, +ingredients_text AS text, +product_name, lang, +popularity_key, (CAST(unknown_ingredients_n AS FLOAT) / CAST(ingredients_n AS FLOAT)) AS fraction FROM read_ndjson('DATASET_PATH', ignore_errors=True) WHERE ingredients_text NOT LIKE '' AND fraction > 0 AND fraction <= 0.4 -ORDER BY random() -LIMIT 100 +ORDER BY popularity_key DESC +LIMIT 10000 ; \ No newline at end of file diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index 22428991c6..b61d8a430f 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,7 +1,8 @@ import datetime +import os import re from pathlib import Path -from typing import List, Optional +from typing import Dict, Iterable, List, Optional import yaml from google.cloud import batch_v1 @@ -85,13 +86,26 @@ class GoogleBatchJobConfig(BaseModel): default=True, description="Required if GPUs.", ) + env_variables: Dict[str, str] = Field( + description="Environment variables to pass during the batch job.", + default_factory=dict, + ) @classmethod - def init(cls, job_name: str, config_path: Path) -> "GoogleBatchJobConfig": + def init( + cls, + job_name: str, + config_path: Path, + env_names: Optional[Iterable[str]] = None, + ) -> "GoogleBatchJobConfig": """Initialize the class with the configuration file corresponding to the job type. - :param job_type: Batch job type. - :type job_type: BatchJobType + :param job_name: Name of the job. + :type job_name: str + :param config_path: Path to the configuration file. + :type config_path: Path + :param env_variables: List of environment variables to add to the job, defaults to None. 
+ :type env_variables: Optional[Iterable[str]], optional """ # Batch job name should respect a specific pattern, or returns an error pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$" @@ -104,10 +118,17 @@ def init(cls, job_name: str, config_path: Path) -> "GoogleBatchJobConfig": unique_job_name = ( job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") ) + + # Environment variables + if not env_names: + env_variables = {} + else: + env_variables = {var_name: os.getenv(var_name) for var_name in env_names} + # Load config file from job_type with open(config_path, "r") as f: config = yaml.safe_load(f) - return cls(job_name=unique_job_name, **config) + return cls(job_name=unique_job_name, env_variables=env_variables, **config) def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: @@ -140,6 +161,11 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: task = batch_v1.TaskSpec() task.runnables = [runnable] + # Environment variables. + envable = batch_v1.Environment() + envable.variables = batch_job_config.env_variables + task.environment = envable + # We can specify what resources are requested by each task. resources = batch_v1.ComputeResource() resources.cpu_milli = batch_job_config.cpu_milli diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index 0698835c6c..7f4adbdaa3 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -14,26 +14,30 @@ settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" ) +os.environ["KEY"] = "value" + @pytest.mark.parametrize( "inputs", [ - ("ingredients-spellcheck", SPELLCHECK_BATCH_JOB_CONFIG_PATH), + ( + "ingredients-spellcheck", + SPELLCHECK_BATCH_JOB_CONFIG_PATH, + ["KEY"] + ), ], ) def test_batch_job_config_file(inputs): "Test indirectly the batch job config file by validating with the Pydantic class model." - job_name, config_path = inputs - GoogleBatchJobConfig.init(job_name, config_path) + job_name, config_path, env_names = inputs + GoogleBatchJobConfig.init( + job_name=job_name, + config_path=config_path, + env_names=env_names, + ) -@pytest.mark.parametrize( - "query_file_path", - [ - SPELLCHECK_QUERY_FILE_PATH, - ], -) -def test_batch_extraction(query_file_path): +def test_batch_extraction(): """Test extraction of a batch of data from the dataset depending on the job type.""" with tempfile.TemporaryDirectory() as tmp_dir: file_path = os.path.join(tmp_dir, "data.parquet") From 31ce875cf78da563fa7d55a43f8bbea696ae490f Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 3 Sep 2024 20:10:57 +0200 Subject: [PATCH 16/22] feat: :ambulance: Credential + Importer --- robotoff/batch/launch.py | 3 ++- robotoff/insights/importer.py | 42 +++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index b61d8a430f..1b4a597f4a 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -147,7 +147,8 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: Returns: Batch job information. """ - + # https://cloud.google.com/iam/docs/service-account-overview + # batch_v1.BatchServiceClient.from_service_account_info(info=json.loads(os.getenv("GOOGLE_CREDENTIALS"))) client = batch_v1.BatchServiceClient() # Define what will be done as part of the job. 
diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index 5c7b142524..668d0b5599 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1484,7 +1484,7 @@ def get_type() -> InsightType: @classmethod def get_required_prediction_types(cls) -> set[PredictionType]: return {PredictionType.ingredient_spellcheck} - + @classmethod def generate_candidates( cls, @@ -1492,16 +1492,50 @@ def generate_candidates( predictions: list[Prediction], product_id: ProductIdentifier, ) -> Iterator[ProductInsight]: - # Only one prediction - for candidate in predictions: - yield ProductInsight(**candidate.to_dict()) + yield from ( + ProductInsight(**prediction.to_dict()) + for prediction in predictions + if cls._keep_prediction(prediction, product_id) + ) @classmethod def is_conflicting_insight( cls, candidate: ProductInsight, reference: ProductInsight ) -> bool: + # Same language candidate.value_tag == reference.value_tag + @classmethod + def _keep_prediction( + cls, + prediction: Prediction, + product_id: ProductIdentifier + ) -> bool: + conditions = [ + prediction.data["original"] != prediction.data["correction"], + cls._has_changed(prediction, product_id), + ] + return all(conditions) + + @staticmethod + def _has_changed( + prediction: Prediction, + product_id: ProductIdentifier + ) -> bool: + """Check if the lists of ingredients has changed since the last insight.""" + if not ProductInsight.select().where( + ProductInsight.barcode == product_id.barcode, + ProductInsight.server_type == product_id.server_type.name, + ).exists(): + return True + else: + return ProductInsight.select().where( + ProductInsight.barcode == product_id.barcode, + ProductInsight.server_type == product_id.server_type.name, + ProductInsight.type == InsightType.ingredient_spellcheck, + ProductInsight.data["original"] != prediction.data["original"], + ).exists() + class PackagingElementTaxonomyException(Exception): pass From 7c92836e61954b5d8339377c8c69b8ea5f270e95 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 4 Sep 2024 12:19:24 +0200 Subject: [PATCH 17/22] feat: :ambulance: Credentials + Importer + Test --- .gitignore | 2 -- credentials/.gitkeep | 0 docker-compose.yml | 4 ++-- robotoff/batch/__init__.py | 12 ++++++++++-- robotoff/batch/launch.py | 22 ++++++++++++++++++++-- robotoff/insights/importer.py | 33 +++++++-------------------------- robotoff/products.py | 4 ++++ tests/unit/test_batch.py | 6 +----- 8 files changed, 44 insertions(+), 39 deletions(-) delete mode 100644 credentials/.gitkeep diff --git a/.gitignore b/.gitignore index 0443dcd510..3a4dd3e70a 100644 --- a/.gitignore +++ b/.gitignore @@ -43,5 +43,3 @@ site/ gh_pages/ doc/README.md doc/references/cli.md - -credentials \ No newline at end of file diff --git a/credentials/.gitkeep b/credentials/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docker-compose.yml b/docker-compose.yml index 1fe85dacd2..1b2edebd5b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,7 +4,6 @@ x-robotoff-base-volumes: - ./cache:/opt/robotoff/cache - ./datasets:/opt/robotoff/datasets - ./models:/opt/robotoff/models - - ./credentials:/opt/credentials - robotoff_tmp:/tmp x-robotoff-base: @@ -55,8 +54,9 @@ x-robotoff-base-env: IMAGE_MODERATION_SERVICE_URL: CROP_ALLOWED_DOMAINS: NUM_RQ_WORKERS: 4 # Update worker service command accordingly if you change this settings - GOOGLE_APPLICATION_CREDENTIALS: /opt/credentials/google/application_default_credentials.json + GOOGLE_APPLICATION_CREDENTIALS: 
/opt/robotoff/credentials/google/credentials.json GOOGLE_CLOUD_PROJECT: "robotoff" + GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable BATCH_JOB_KEY: # Secure Batch job import with a token key x-robotoff-worker-base: diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 4a05303193..937e538d1b 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,3 +1,4 @@ +import datetime import os import tempfile @@ -9,7 +10,7 @@ from .buckets import fetch_dataframe_from_gcs, upload_file_to_gcs from .extraction import extract_from_dataset -from .launch import GoogleBatchJobConfig, launch_job +from .launch import GoogleBatchJobConfig, check_google_credentials, launch_job logger = get_logger(__name__) @@ -46,6 +47,8 @@ def launch_spellcheck_batch_job() -> None: SUFFIX_PREPROCESS = "data/preprocessed_data.parquet" ENV_NAMES = ["BATCH_JOB_KEY"] + check_google_credentials() + logger.info("Extract batch from dataset.") # Extract data from dataset with tempfile.TemporaryDirectory() as tmp_dir: @@ -75,10 +78,15 @@ def import_spellcheck_batch_predictions() -> None: BUCKET_NAME = "robotoff-spellcheck" SUFFIX_POSTPROCESS = "data/postprocessed_data.parquet" PREDICTION_TYPE = PredictionType.ingredient_spellcheck - PREDICTOR_VERSION = "1" # TODO: shard HF model version instead of manual change? + # We increment to allow import_insights to create a new version + PREDICTOR_VERSION = ( + "batch-job" + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + ) PREDICTOR = "fine-tuned-mistral-7b" SERVER_TYPE = ServerType.off + check_google_credentials() + df = fetch_dataframe_from_gcs(bucket_name=BUCKET_NAME, suffix=SUFFIX_POSTPROCESS) logger.debug( f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}" diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index 1b4a597f4a..33fff6154c 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,4 +1,5 @@ import datetime +import json import os import re from pathlib import Path @@ -9,6 +10,25 @@ from pydantic import BaseModel, ConfigDict, Field from robotoff import settings +from robotoff.utils import get_logger + +logger = get_logger(__name__) + + +def check_google_credentials() -> None: + """Create google credentials from variable if doesn't exist""" + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if not credentials_path: + raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is not set") + if not os.path.exists(credentials_path): + logger.info( + "No google credentials found at %s. Creating credentials from GOOGLE_CREDENTIALS.", + credentials_path, + ) + os.makedirs(os.path.dirname(credentials_path), exist_ok=True) + credentials = json.loads(os.getenv("GOOGLE_CREDENTIALS")) + with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS"), "w") as f: + json.dump(credentials, f, indent=4) class GoogleBatchJobConfig(BaseModel): @@ -147,8 +167,6 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: Returns: Batch job information. """ - # https://cloud.google.com/iam/docs/service-account-overview - # batch_v1.BatchServiceClient.from_service_account_info(info=json.loads(os.getenv("GOOGLE_CREDENTIALS"))) client = batch_v1.BatchServiceClient() # Define what will be done as part of the job. 
diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index 668d0b5599..6ca442e304 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1484,7 +1484,7 @@ def get_type() -> InsightType: @classmethod def get_required_prediction_types(cls) -> set[PredictionType]: return {PredictionType.ingredient_spellcheck} - + @classmethod def generate_candidates( cls, @@ -1495,7 +1495,7 @@ def generate_candidates( yield from ( ProductInsight(**prediction.to_dict()) for prediction in predictions - if cls._keep_prediction(prediction, product_id) + if cls._keep_prediction(prediction=prediction, product=product) ) @classmethod @@ -1507,35 +1507,16 @@ def is_conflicting_insight( @classmethod def _keep_prediction( - cls, - prediction: Prediction, - product_id: ProductIdentifier + cls, prediction: Prediction, product: Optional[Product] ) -> bool: conditions = [ + # Spellcheck didn't correct prediction.data["original"] != prediction.data["correction"], - cls._has_changed(prediction, product_id), + # Modification of the original ingredients between two dataset dumps (24-hour period) + product is None or prediction.data["original"] != product.ingredients_text, ] return all(conditions) - - @staticmethod - def _has_changed( - prediction: Prediction, - product_id: ProductIdentifier - ) -> bool: - """Check if the lists of ingredients has changed since the last insight.""" - if not ProductInsight.select().where( - ProductInsight.barcode == product_id.barcode, - ProductInsight.server_type == product_id.server_type.name, - ).exists(): - return True - else: - return ProductInsight.select().where( - ProductInsight.barcode == product_id.barcode, - ProductInsight.server_type == product_id.server_type.name, - ProductInsight.type == InsightType.ingredient_spellcheck, - ProductInsight.data["original"] != prediction.data["original"], - ).exists() - + class PackagingElementTaxonomyException(Exception): pass diff --git a/robotoff/products.py b/robotoff/products.py index efaec716f9..b7c1dae89c 100644 --- a/robotoff/products.py +++ b/robotoff/products.py @@ -417,6 +417,7 @@ class Product: "image_ids", "packagings", "lang", + "ingredients_text", ) def __init__(self, product: JSONType): @@ -439,6 +440,7 @@ def __init__(self, product: JSONType): else list(key for key in self.images.keys() if key.isdigit()) ) self.lang: Optional[str] = product.get("lang") + self.ingredients_text: Optional[str] = product.get("ingredients_text") @staticmethod def get_fields(): @@ -454,6 +456,8 @@ def get_fields(): "stores_tags", "unique_scans_n", "images", + "lang", + "ingredients_text", } diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index 7f4adbdaa3..89a440fff0 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -20,11 +20,7 @@ @pytest.mark.parametrize( "inputs", [ - ( - "ingredients-spellcheck", - SPELLCHECK_BATCH_JOB_CONFIG_PATH, - ["KEY"] - ), + ("ingredients-spellcheck", SPELLCHECK_BATCH_JOB_CONFIG_PATH, ["KEY"]), ], ) def test_batch_job_config_file(inputs): From be475bdb3f8eea8be1ff4d888f720b100e2437ee Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 4 Sep 2024 12:41:01 +0200 Subject: [PATCH 18/22] feat: :bug: Forgot a return --- robotoff/insights/importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index 6ca442e304..533f5a5e0b 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1503,7 +1503,7 @@ def is_conflicting_insight( cls, candidate: 
ProductInsight, reference: ProductInsight ) -> bool: # Same language - candidate.value_tag == reference.value_tag + return candidate.value_tag == reference.value_tag @classmethod def _keep_prediction( From 762722f9fb3168abeea9173748d041fbafc6ac21 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 4 Sep 2024 12:44:19 +0200 Subject: [PATCH 19/22] style: :sparkles: Black on spellcheck script --- batch/spellcheck/main.py | 112 ++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 37 deletions(-) diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 34c7b98c1d..0ae9adeff9 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -22,18 +22,53 @@ def parse() -> argparse.Namespace: - """Parse command line arguments. - """ + """Parse command line arguments.""" parser = argparse.ArgumentParser(description="Spellcheck module.") - parser.add_argument("--data_bucket", type=str, default="robotoff-spellcheck", help="Bucket name.") - parser.add_argument("--pre_data_suffix", type=str, default="data/preprocessed_data.parquet", help="Dataset suffix containing the data to be processed.") - parser.add_argument("--post_data_suffix", type=str, default="data/postprocessed_data.parquet", help="Dataset suffix containing the processed data.") - parser.add_argument("--model_path", default="openfoodfacts/spellcheck-mistral-7b", type=str, help="HF model path.") - parser.add_argument("--max_model_len", default=1024, type=int, help="Maximum model context length. A lower max context length reduces the memory footprint and accelerate the inference.") - parser.add_argument("--temperature", default=0, type=float, help="Sampling temperature.") - parser.add_argument("--max_tokens", default=1024, type=int, help="Maximum number of tokens to generate.") - parser.add_argument("--quantization", default="fp8", type=str, help="Quantization type.") - parser.add_argument("--dtype", default="auto", type=str, help="Model weights precision. Default corresponds to the modle config (float16 here)") + parser.add_argument( + "--data_bucket", type=str, default="robotoff-spellcheck", help="Bucket name." + ) + parser.add_argument( + "--pre_data_suffix", + type=str, + default="data/preprocessed_data.parquet", + help="Dataset suffix containing the data to be processed.", + ) + parser.add_argument( + "--post_data_suffix", + type=str, + default="data/postprocessed_data.parquet", + help="Dataset suffix containing the processed data.", + ) + parser.add_argument( + "--model_path", + default="openfoodfacts/spellcheck-mistral-7b", + type=str, + help="HF model path.", + ) + parser.add_argument( + "--max_model_len", + default=1024, + type=int, + help="Maximum model context length. A lower max context length reduces the memory footprint and accelerate the inference.", + ) + parser.add_argument( + "--temperature", default=0, type=float, help="Sampling temperature." + ) + parser.add_argument( + "--max_tokens", + default=1024, + type=int, + help="Maximum number of tokens to generate.", + ) + parser.add_argument( + "--quantization", default="fp8", type=str, help="Quantization type." + ) + parser.add_argument( + "--dtype", + default="auto", + type=str, + help="Model weights precision. Default corresponds to the modle config (float16 here)", + ) return parser.parse_args() @@ -43,7 +78,7 @@ def main(): Original lists of ingredients are stored in a gs bucket before being loaded then processed by the model. The corrected lists of ingredients are then stored back in gs. 
- We use vLLM to process the batch optimaly. The model is loaded from the Open Food Facts Hugging Face model repository. + We use vLLM to process the batch optimaly. The model is loaded from the Open Food Facts Hugging Face model repository. """ logger.info("Starting batch processing job.") args = parse() @@ -52,32 +87,35 @@ def main(): data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix) logger.info(f"Feature in uploaded data: {data.columns}") if not all(feature in data.columns for feature in FEATURES_VALIDATION): - raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. Current features: {data.columns}") + raise ValueError( + f"Data should contain the following features: {FEATURES_VALIDATION}. Current features: {data.columns}" + ) instructions = [prepare_instruction(text) for text in data["text"]] llm = LLM( - model=args.model_path, - max_model_len=args.max_model_len, + model=args.model_path, + max_model_len=args.max_model_len, dtype=args.dtype, quantization=args.quantization, ) sampling_params = SamplingParams( - temperature=args.temperature, - max_tokens=args.max_tokens + temperature=args.temperature, max_tokens=args.max_tokens ) - logger.info(f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}") - data["correction"] = batch_inference(instructions, llm=llm, sampling_params=sampling_params) + logger.info( + f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}" + ) + data["correction"] = batch_inference( + instructions, llm=llm, sampling_params=sampling_params + ) logger.info(f"Uploading data to GCS: {args.data_bucket}/{args.post_data_suffix}") # Save DataFrame as Parquet to a temporary file - with tempfile.NamedTemporaryFile(delete=True, suffix='.parquet') as temp_file: + with tempfile.NamedTemporaryFile(delete=True, suffix=".parquet") as temp_file: data.to_parquet(temp_file.name) temp_file_name = temp_file.name upload_gcs( - temp_file_name, - bucket_name=args.data_bucket, - suffix=args.post_data_suffix + temp_file_name, bucket_name=args.data_bucket, suffix=args.post_data_suffix ) logger.info("Request Robotoff API batch import endpoint.") @@ -96,18 +134,14 @@ def prepare_instruction(text: str) -> str: str: Instruction. """ instruction = ( - "###Correct the list of ingredients:\n" - + text - + "\n\n###Correction:\n" + "###Correct the list of ingredients:\n" + text + "\n\n###Correction:\n" ) return instruction def batch_inference( - texts: List[str], - llm: LLM, - sampling_params: SamplingParams - ) -> List[str]: + texts: List[str], llm: LLM, sampling_params: SamplingParams +) -> List[str]: """Process batch of texts with vLLM. Args: @@ -118,7 +152,10 @@ def batch_inference( Returns: List[str]: Processed batch of texts """ - outputs = llm.generate(texts, sampling_params,) + outputs = llm.generate( + texts, + sampling_params, + ) corrections = [output.outputs[0].text for output in outputs] return corrections @@ -127,7 +164,7 @@ def load_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: """Load data from Google Cloud Storage bucket. Args: - bucket_name (str): + bucket_name (str): suffix (str): Path inside the bucket Returns: @@ -156,13 +193,12 @@ def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: def run_robotoff_endpoint_batch_import(): - """Run Robotoff api endpoint to import batch data into tables. 
-    """
+    """Run Robotoff API endpoint to import batch data into tables."""
     url = "https://robotoff.openfoodfacts.org/api/v1/batch/import"
     data = {"job_type": "ingredients_spellcheck"}
     headers = {
         "Authorization": f"Bearer {os.getenv('BATCH_JOB_KEY')}",
-        "Content-Type": "application/json"
+        "Content-Type": "application/json",
     }
     try:
         response = requests.post(
@@ -170,10 +206,12 @@ def run_robotoff_endpoint_batch_import():
             data=data,
             headers=headers,
         )
-        logger.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}")
+        logger.info(
+            f"Import batch Robotoff API endpoint successfully requested: {response.text}"
+        )
     except requests.exceptions.RequestException as e:
         raise SystemExit(e)
-    
+
 
 if __name__ == "__main__":
     main()

From 10791e7fa59db8f46d18d3394aac790b1f0791d9 Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Wed, 4 Sep 2024 18:35:58 +0200
Subject: [PATCH 20/22] docs: :memo: Add batch/import api endpoint to doc

---
 doc/references/api.yml | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/doc/references/api.yml b/doc/references/api.yml
index c01664cc2a..74a7f5c920 100644
--- a/doc/references/api.yml
+++ b/doc/references/api.yml
@@ -1105,6 +1105,28 @@ paths:
         "400":
           description: "An HTTP 400 is returned if the provided parameters are invalid"
 
+  /batch/import:
+    post:
+      tags:
+        - Batch Job
+      summary: Import batch-processed data into the Robotoff database.
+      security:
+        - batch_job_key: []
+      description:
+        Trigger import of the batch-processed data into the Robotoff database. A `BATCH_JOB_KEY` is expected in the authorization header.
+        This endpoint is mainly used by the batch job once the job is finished.
+      parameters:
+        - $ref: "#/components/parameters/job_type"
+      responses:
+        "200":
+          description: Data successfully imported.
+          content:
+            application/json:
+              status:
+                type: string
+                description: Request successful. Importing processed data.
+        "400":
+          description: "An HTTP 400 is returned if the authentication key is invalid or if the job_type is not supported."
 
 components:
   schemas:
@@ -1391,6 +1413,21 @@ components:
       schema:
         type: integer
        example: 5410041040807
+    job_type:
+      name: job_type
+      in: query
+      required: true
+      description: The type of batch job launched.
+      schema:
+        type: string
+        enum:
+          - ingredients_spellcheck
+
+  securitySchemes:
+    batch_job_key:
+      type: http
+      scheme: bearer
+
 tags:
   - name: Questions
   - name: Insights
@@ -1398,4 +1435,4 @@ tags:
       An insight is a fact about a product that has been either extracted or inferred from the product pictures, characteristics,...
       If the insight is correct, the Openfoodfacts DB can be updated accordingly.
 
-      Current insight types and their description can be found in [robotoff/insights/dataclass.py](https://github.com/openfoodfacts/robotoff/blob/main/robotoff/insights/dataclass.py).
+      Current insight types and their description can be found in [robotoff/insights/dataclass.py](https://github.com/openfoodfacts/robotoff/blob/main/robotoff/insights/dataclass.py). 
\ No newline at end of file

From 400818b9de7cca87b353a960a1c6b19996ddc73e Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Wed, 4 Sep 2024 18:41:18 +0200
Subject: [PATCH 21/22] docs: :memo: Because perfection

---
 doc/references/api.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/references/api.yml b/doc/references/api.yml
index 74a7f5c920..d339bd5311 100644
--- a/doc/references/api.yml
+++ b/doc/references/api.yml
@@ -1435,4 +1435,4 @@ tags:
       An insight is a fact about a product that has been either extracted or inferred from the product pictures, characteristics,...
       If the insight is correct, the Openfoodfacts DB can be updated accordingly.
 
-      Current insight types and their description can be found in [robotoff/insights/dataclass.py](https://github.com/openfoodfacts/robotoff/blob/main/robotoff/insights/dataclass.py). 
\ No newline at end of file
+      Current insight types and their description can be found in [robotoff/insights/dataclass.py](https://github.com/openfoodfacts/robotoff/blob/main/robotoff/insights/dataclass.py).

From 4ebfd87b5f5bac9299f4f241de295ea3c69aec8b Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Wed, 4 Sep 2024 18:48:31 +0200
Subject: [PATCH 22/22] fix: :art: Change predictor version to also track...
 the predictor version

We concluded that PREDICTOR_VERSION will be used to track batch jobs and allow
new data predictions to be imported. In the future, we'll detect already
processed data differently, for example during the extraction stage, before
the batch job runs.

---
 robotoff/batch/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py
index 937e538d1b..738a994605 100644
--- a/robotoff/batch/__init__.py
+++ b/robotoff/batch/__init__.py
@@ -80,7 +80,7 @@ def import_spellcheck_batch_predictions() -> None:
     PREDICTION_TYPE = PredictionType.ingredient_spellcheck
     # We increment to allow import_insights to create a new version
     PREDICTOR_VERSION = (
-        "batch-job" + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+        "llm-v1" + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    )
     PREDICTOR = "fine-tuned-mistral-7b"
     SERVER_TYPE = ServerType.off
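For context on the last commit: the new `PREDICTOR_VERSION` combines a model identifier with a per-run timestamp, so each batch run is imported as a new predictor version. A minimal sketch of that scheme follows; `build_predictor_version` is a hypothetical helper for illustration only, not a function in the Robotoff codebase.

```python
import datetime


def build_predictor_version(prefix: str = "llm-v1") -> str:
    """Mirror the PREDICTOR_VERSION expression added in robotoff/batch/__init__.py.

    Each batch run embeds its own timestamp, so import_insights sees a new
    predictor version and imports the freshly generated spellcheck predictions.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    return f"{prefix}-{timestamp}"


if __name__ == "__main__":
    print(build_predictor_version())  # e.g. llm-v1-20240904184831
```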