From 75ebf64de1dba87ac06b5de42afc227b144de2f2 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 21 Aug 2024 17:23:49 +0200 Subject: [PATCH 01/22] feat(Batch job - Spellcheck): :zap: --- batch/spellcheck/Dockerfile | 15 ++ batch/spellcheck/README.md | 40 ++++ batch/spellcheck/job.py | 150 +++++++++++++++ batch/spellcheck/requirements.txt | 2 + poetry.lock | 252 ++++++++++++++++++++++--- pyproject.toml | 3 +- robotoff/app/api.py | 25 ++- robotoff/batch/__init__.py | 11 ++ robotoff/batch/batch.py | 206 ++++++++++++++++++++ robotoff/batch/configs/spellcheck.yaml | 13 ++ robotoff/settings.py | 4 + robotoff/types.py | 40 +++- tests/unit/test_batch.py | 18 ++ 13 files changed, 747 insertions(+), 32 deletions(-) create mode 100644 batch/spellcheck/Dockerfile create mode 100644 batch/spellcheck/README.md create mode 100644 batch/spellcheck/job.py create mode 100644 batch/spellcheck/requirements.txt create mode 100644 robotoff/batch/__init__.py create mode 100644 robotoff/batch/batch.py create mode 100644 robotoff/batch/configs/spellcheck.yaml create mode 100644 tests/unit/test_batch.py diff --git a/batch/spellcheck/Dockerfile b/batch/spellcheck/Dockerfile new file mode 100644 index 0000000000..61b73b5b1b --- /dev/null +++ b/batch/spellcheck/Dockerfile @@ -0,0 +1,15 @@ +FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=on + +WORKDIR /app + +COPY job.py /app +COPY requirements.txt /app + +RUN pip install --no-cache-dir -r requirements.txt + +# Set the entrypoint to the batch job script +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/batch/spellcheck/README.md b/batch/spellcheck/README.md new file mode 100644 index 0000000000..975f63733e --- /dev/null +++ b/batch/spellcheck/README.md @@ -0,0 +1,40 @@ +# Google Batch job + +## Notes + +* Netherland (europe-west4) has GPUs (A100, L4) +* Check [CLOUD-LOGGING](https://console.cloud.google.com/logs/query;query=SEARCH%2528%22spellcheck%22%2529;cursorTimestamp=2024-08-14T11:21:32.485988660Z;duration=PT1H?referrer=search&project=robotoff) for logs +* Require deep learning image to run: [deep learning containers list](https://cloud.google.com/deep-learning-containers/docs/choosing-container#pytorch) +* Custom storage capacity to host the heavy docker image (~24GB) by adding BootDisk +* 1000 products processed: 1:30min (g2-instance-with 8) (overall batch job: 3:25min) + * L4: g2-instance-8 hourly cost: $0.896306 -> ~ 0.05$ to process batch of 1000 + * A100: a2-highgpu-1g: $3.748064 +* A100/Cuda doesn't support FP8 +* A100 has less availability than L4: need to wait for batch job (can be long) + +## Links + +* [GPU availability per region](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) +* [Batch job with GPU](https://cloud.google.com/batch/docs/create-run-job-gpus#create-job-gpu-examples) +* [VM Instance pricing](https://cloud.google.com/compute/vm-instance-pricing#vm-instance-pricing) +* [Trigger cloud function with bucket updates](https://cloud.google.com/functions/docs/calling/storage) +* [Python Google Batch](https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch) + +## Commands + +### List GPUs per region +```bash +gcloud compute accelerator-types list +``` + +### List deep learning images +```bash +gcloud compute images list \ +--project deeplearning-platform-release \ +--format="value(NAME)" \ +--no-standard-images +``` + +## Workflow / Orchestration + +* 
[Workflow](https://cloud.google.com/workflows/docs/overview)
diff --git a/batch/spellcheck/job.py b/batch/spellcheck/job.py
new file mode 100644
index 0000000000..6c629e4c56
--- /dev/null
+++ b/batch/spellcheck/job.py
@@ -0,0 +1,150 @@
+import argparse
+import tempfile
+import logging
+from typing import List
+
+import pandas as pd
+from vllm import LLM, SamplingParams
+from google.cloud import storage
+
+
+LOGGER = logging.getLogger(__name__)
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+
+FEATURES_VALIDATION = ["code", "text"]
+
+
+def parse() -> argparse.Namespace:
+    """Parse command line arguments.
+    """
+    parser = argparse.ArgumentParser(description="Spellcheck module.")
+    parser.add_argument("--data_bucket", type=str, default="robotoff-spellcheck", help="Bucket name.")
+    parser.add_argument("--pre_data_suffix", type=str, default="data/test_data.parquet", help="Dataset suffix containing the data to be processed.")
+    parser.add_argument("--post_data_suffix", type=str, default="data/test_processed_data.parquet", help="Dataset suffix containing the processed data.")
+    parser.add_argument("--model_path", default="openfoodfacts/spellcheck-mistral-7b", type=str, help="HF model path.")
+    parser.add_argument("--max_model_len", default=1024, type=int, help="Maximum model context length. A lower max context length reduces the memory footprint and accelerates inference.")
+    parser.add_argument("--temperature", default=0, type=float, help="Sampling temperature.")
+    parser.add_argument("--max_tokens", default=1024, type=int, help="Maximum number of tokens to generate.")
+    parser.add_argument("--quantization", default="fp8", type=str, help="Quantization type.")
+    parser.add_argument("--dtype", default="auto", type=str, help="Model weights precision. Default corresponds to the model config (float16 here).")
+    return parser.parse_args()
+
+
+def main():
+    """Batch processing job.
+
+    Original lists of ingredients are stored in a GCS bucket, then loaded and processed by the model.
+    The corrected lists of ingredients are stored back in GCS.
+
+    We use vLLM to process the batch optimally. The model is loaded from the Open Food Facts Hugging Face model repository.
+    """
+    LOGGER.info("Starting batch processing job.")
+    args = parse()
+
+    LOGGER.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}")
+    data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix)
+    LOGGER.info(f"Features in uploaded data: {data.columns}")
+    if not all(feature in data.columns for feature in FEATURES_VALIDATION):
+        raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. 
Current features: {data.columns}") + + instructions = [prepare_instruction(text) for text in data["text"]] + llm = LLM( + model=args.model_path, + max_model_len=args.max_model_len, + dtype=args.dtype, + quantization=args.quantization, + ) + sampling_params = SamplingParams( + temperature=args.temperature, + max_tokens=args.max_tokens + ) + + LOGGER.info(f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}") + data["correction"] = batch_inference(instructions, llm=llm, sampling_params=sampling_params) + + LOGGER.info(f"Uploading data to GCS: {args.data_bucket}/{args.post_data_suffix}") + # Save DataFrame as Parquet to a temporary file + with tempfile.NamedTemporaryFile(delete=True, suffix='.parquet') as temp_file: + data.to_parquet(temp_file.name) + temp_file_name = temp_file.name + upload_gcs( + temp_file_name, + bucket_name=args.data_bucket, + suffix=args.post_data_suffix + ) + LOGGER.info("Batch processing job completed.") + + +def prepare_instruction(text: str) -> str: + """Prepare instruction prompt for fine-tuning and inference. + + Args: + text (str): List of ingredients + + Returns: + str: Instruction. + """ + instruction = ( + "###Correct the list of ingredients:\n" + + text + + "\n\n###Correction:\n" + ) + return instruction + + +def batch_inference( + texts: List[str], + llm: LLM, + sampling_params: SamplingParams + ) -> List[str]: + """Process batch of texts with vLLM. + + Args: + texts (List[str]): Batch + llm (LLM): Model engine optimized with vLLM + sampling_params (SamplingParams): Generation parameters + + Returns: + List[str]: Processed batch of texts + """ + outputs = llm.generate(texts, sampling_params,) + corrections = [output.outputs[0].text for output in outputs] + return corrections + + +def load_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: + """Load data from Google Cloud Storage bucket. + + Args: + bucket_name (str): + suffix (str): Path inside the bucket + + Returns: + pd.DataFrame: Df from parquet file. + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + with blob.open("rb") as f: + df = pd.read_parquet(f) + return df + + +def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: + """Upload data to GCS. + + Args: + filepath (str): File path to export. + bucket_name (str): Bucket name. + suffix (str): Path inside the bucket. 
+ """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + blob.upload_from_filename(filename=file_path) + +if __name__ == "__main__": + main() diff --git a/batch/spellcheck/requirements.txt b/batch/spellcheck/requirements.txt new file mode 100644 index 0000000000..0ab3046f20 --- /dev/null +++ b/batch/spellcheck/requirements.txt @@ -0,0 +1,2 @@ +vllm==0.5.4 +google-cloud-storage==2.18.0 \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 2c370798fb..c6ddc1774b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -987,6 +987,87 @@ python-dateutil = ">=2.8.1" [package.extras] dev = ["flake8", "markdown", "twine", "wheel"] +[[package]] +name = "google-api-core" +version = "1.34.1" +description = "Google API client core library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-api-core-1.34.1.tar.gz", hash = "sha256:3399c92887a97d33038baa4bfd3bf07acc05d474b0171f333e1f641c1364e552"}, + {file = "google_api_core-1.34.1-py3-none-any.whl", hash = "sha256:52bcc9d9937735f8a3986fa0bbf9135ae9cf5393a722387e5eced520e39c774a"}, +] + +[package.dependencies] +google-auth = ">=1.25.0,<3.0dev" +googleapis-common-protos = ">=1.56.2,<2.0dev" +grpcio = {version = ">=1.33.2,<2.0dev", optional = true, markers = "extra == \"grpc\""} +grpcio-status = {version = ">=1.33.2,<2.0dev", optional = true, markers = "extra == \"grpc\""} +protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.0.0dev" +requests = ">=2.18.0,<3.0.0dev" + +[package.extras] +grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio-status (>=1.33.2,<2.0dev)"] +grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] +grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] + +[[package]] +name = "google-auth" +version = "2.34.0" +description = "Google Authentication Library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google_auth-2.34.0-py2.py3-none-any.whl", hash = "sha256:72fd4733b80b6d777dcde515628a9eb4a577339437012874ea286bca7261ee65"}, + {file = "google_auth-2.34.0.tar.gz", hash = "sha256:8eb87396435c19b20d32abd2f984e31c191a15284af72eb922f10e5bde9c04cc"}, +] + +[package.dependencies] +cachetools = ">=2.0.0,<6.0" +pyasn1-modules = ">=0.2.1" +rsa = ">=3.1.4,<5" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] +enterprise-cert = ["cryptography", "pyopenssl"] +pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] +reauth = ["pyu2f (>=0.1.5)"] +requests = ["requests (>=2.20.0,<3.0.0.dev0)"] + +[[package]] +name = "google-cloud-batch" +version = "0.17.26" +description = "Google Cloud Batch API client library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google_cloud_batch-0.17.26-py2.py3-none-any.whl", hash = "sha256:2cbed78f6fe612b540c08f92e01cca22fa66c38505b4a084d0c6e4da88dea335"}, + {file = "google_cloud_batch-0.17.26.tar.gz", hash = "sha256:9d86f703ed990d223c386883047c83a70ecab2378e1a686c8f67b113b00644cf"}, +] + +[package.dependencies] +google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]} +google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev" +proto-plus = ">=1.22.3,<2.0.0dev" +protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev" + +[[package]] +name = "googleapis-common-protos" +version = "1.63.2" +description = "Common protobufs used in Google APIs" +optional = false +python-versions = ">=3.7" +files = [ + {file = 
"googleapis-common-protos-1.63.2.tar.gz", hash = "sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87"}, + {file = "googleapis_common_protos-1.63.2-py2.py3-none-any.whl", hash = "sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945"}, +] + +[package.dependencies] +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" + +[package.extras] +grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] + [[package]] name = "grpcio" version = "1.65.1" @@ -1045,6 +1126,22 @@ files = [ [package.extras] protobuf = ["grpcio-tools (>=1.65.1)"] +[[package]] +name = "grpcio-status" +version = "1.48.2" +description = "Status proto mapping for gRPC" +optional = false +python-versions = ">=3.6" +files = [ + {file = "grpcio-status-1.48.2.tar.gz", hash = "sha256:53695f45da07437b7c344ee4ef60d370fd2850179f5a28bb26d8e2aa1102ec11"}, + {file = "grpcio_status-1.48.2-py3-none-any.whl", hash = "sha256:2c33bbdbe20188b2953f46f31af669263b6ee2a9b2d38fa0d36ee091532e21bf"}, +] + +[package.dependencies] +googleapis-common-protos = ">=1.5.5" +grpcio = ">=1.48.2" +protobuf = ">=3.12.0" + [[package]] name = "gunicorn" version = "22.0.0" @@ -2117,38 +2214,52 @@ pyyaml = ">=5.1" toml = "*" virtualenv = ">=20.0.8" +[[package]] +name = "proto-plus" +version = "1.24.0" +description = "Beautiful, Pythonic protocol buffers." +optional = false +python-versions = ">=3.7" +files = [ + {file = "proto-plus-1.24.0.tar.gz", hash = "sha256:30b72a5ecafe4406b0d339db35b56c4059064e69227b8c3bda7462397f966445"}, + {file = "proto_plus-1.24.0-py3-none-any.whl", hash = "sha256:402576830425e5f6ce4c2a6702400ac79897dab0b4343821aa5188b0fab81a12"}, +] + +[package.dependencies] +protobuf = ">=3.19.0,<6.0.0dev" + +[package.extras] +testing = ["google-api-core (>=1.31.5)"] + [[package]] name = "protobuf" -version = "3.19.6" +version = "3.20.3" description = "Protocol Buffers" optional = false -python-versions = ">=3.5" +python-versions = ">=3.7" files = [ - {file = "protobuf-3.19.6-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:010be24d5a44be7b0613750ab40bc8b8cedc796db468eae6c779b395f50d1fa1"}, - {file = "protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11478547958c2dfea921920617eb457bc26867b0d1aa065ab05f35080c5d9eb6"}, - {file = "protobuf-3.19.6-cp310-cp310-win32.whl", hash = "sha256:559670e006e3173308c9254d63facb2c03865818f22204037ab76f7a0ff70b5f"}, - {file = "protobuf-3.19.6-cp310-cp310-win_amd64.whl", hash = "sha256:347b393d4dd06fb93a77620781e11c058b3b0a5289262f094379ada2920a3730"}, - {file = "protobuf-3.19.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a8ce5ae0de28b51dff886fb922012dad885e66176663950cb2344c0439ecb473"}, - {file = "protobuf-3.19.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b0d02163c4e67279ddb6dc25e063db0130fc299aefabb5d481053509fae5c8"}, - {file = "protobuf-3.19.6-cp36-cp36m-win32.whl", hash = "sha256:30f5370d50295b246eaa0296533403961f7e64b03ea12265d6dfce3a391d8992"}, - {file = "protobuf-3.19.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0c0714b025ec057b5a7600cb66ce7c693815f897cfda6d6efb58201c472e3437"}, - {file = "protobuf-3.19.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5057c64052a1f1dd7d4450e9aac25af6bf36cfbfb3a1cd89d16393a036c49157"}, - {file = "protobuf-3.19.6-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:bb6776bd18f01ffe9920e78e03a8676530a5d6c5911934c6a1ac6eb78973ecb6"}, - {file = 
"protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a04134866861b11556a82dd91ea6daf1f4925746b992f277b84013a7cc1229"}, - {file = "protobuf-3.19.6-cp37-cp37m-win32.whl", hash = "sha256:4bc98de3cdccfb5cd769620d5785b92c662b6bfad03a202b83799b6ed3fa1fa7"}, - {file = "protobuf-3.19.6-cp37-cp37m-win_amd64.whl", hash = "sha256:aa3b82ca1f24ab5326dcf4ea00fcbda703e986b22f3d27541654f749564d778b"}, - {file = "protobuf-3.19.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2b2d2913bcda0e0ec9a784d194bc490f5dc3d9d71d322d070b11a0ade32ff6ba"}, - {file = "protobuf-3.19.6-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:d0b635cefebd7a8a0f92020562dead912f81f401af7e71f16bf9506ff3bdbb38"}, - {file = "protobuf-3.19.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a552af4dc34793803f4e735aabe97ffc45962dfd3a237bdde242bff5a3de684"}, - {file = "protobuf-3.19.6-cp38-cp38-win32.whl", hash = "sha256:0469bc66160180165e4e29de7f445e57a34ab68f49357392c5b2f54c656ab25e"}, - {file = "protobuf-3.19.6-cp38-cp38-win_amd64.whl", hash = "sha256:91d5f1e139ff92c37e0ff07f391101df77e55ebb97f46bbc1535298d72019462"}, - {file = "protobuf-3.19.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c0ccd3f940fe7f3b35a261b1dd1b4fc850c8fde9f74207015431f174be5976b3"}, - {file = "protobuf-3.19.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:30a15015d86b9c3b8d6bf78d5b8c7749f2512c29f168ca259c9d7727604d0e39"}, - {file = "protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:878b4cd080a21ddda6ac6d1e163403ec6eea2e206cf225982ae04567d39be7b0"}, - {file = "protobuf-3.19.6-cp39-cp39-win32.whl", hash = "sha256:5a0d7539a1b1fb7e76bf5faa0b44b30f812758e989e59c40f77a7dab320e79b9"}, - {file = "protobuf-3.19.6-cp39-cp39-win_amd64.whl", hash = "sha256:bbf5cea5048272e1c60d235c7bd12ce1b14b8a16e76917f371c718bd3005f045"}, - {file = "protobuf-3.19.6-py2.py3-none-any.whl", hash = "sha256:14082457dc02be946f60b15aad35e9f5c69e738f80ebbc0900a19bc83734a5a4"}, - {file = "protobuf-3.19.6.tar.gz", hash = "sha256:5f5540d57a43042389e87661c6eaa50f47c19c6176e8cf1c4f287aeefeccb5c4"}, + {file = "protobuf-3.20.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:f4bd856d702e5b0d96a00ec6b307b0f51c1982c2bf9c0052cf9019e9a544ba99"}, + {file = "protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9aae4406ea63d825636cc11ffb34ad3379335803216ee3a856787bcf5ccc751e"}, + {file = "protobuf-3.20.3-cp310-cp310-win32.whl", hash = "sha256:28545383d61f55b57cf4df63eebd9827754fd2dc25f80c5253f9184235db242c"}, + {file = "protobuf-3.20.3-cp310-cp310-win_amd64.whl", hash = "sha256:67a3598f0a2dcbc58d02dd1928544e7d88f764b47d4a286202913f0b2801c2e7"}, + {file = "protobuf-3.20.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:899dc660cd599d7352d6f10d83c95df430a38b410c1b66b407a6b29265d66469"}, + {file = "protobuf-3.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e64857f395505ebf3d2569935506ae0dfc4a15cb80dc25261176c784662cdcc4"}, + {file = "protobuf-3.20.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d9e4432ff660d67d775c66ac42a67cf2453c27cb4d738fc22cb53b5d84c135d4"}, + {file = "protobuf-3.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:74480f79a023f90dc6e18febbf7b8bac7508420f2006fabd512013c0c238f454"}, + {file = "protobuf-3.20.3-cp37-cp37m-win32.whl", hash = "sha256:b6cc7ba72a8850621bfec987cb72623e703b7fe2b9127a161ce61e61558ad905"}, + {file = "protobuf-3.20.3-cp37-cp37m-win_amd64.whl", hash 
= "sha256:8c0c984a1b8fef4086329ff8dd19ac77576b384079247c770f29cc8ce3afa06c"}, + {file = "protobuf-3.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:de78575669dddf6099a8a0f46a27e82a1783c557ccc38ee620ed8cc96d3be7d7"}, + {file = "protobuf-3.20.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f4c42102bc82a51108e449cbb32b19b180022941c727bac0cfd50170341f16ee"}, + {file = "protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:44246bab5dd4b7fbd3c0c80b6f16686808fab0e4aca819ade6e8d294a29c7050"}, + {file = "protobuf-3.20.3-cp38-cp38-win32.whl", hash = "sha256:c02ce36ec760252242a33967d51c289fd0e1c0e6e5cc9397e2279177716add86"}, + {file = "protobuf-3.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:447d43819997825d4e71bf5769d869b968ce96848b6479397e29fc24c4a5dfe9"}, + {file = "protobuf-3.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:398a9e0c3eaceb34ec1aee71894ca3299605fa8e761544934378bbc6c97de23b"}, + {file = "protobuf-3.20.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bf01b5720be110540be4286e791db73f84a2b721072a3711efff6c324cdf074b"}, + {file = "protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:daa564862dd0d39c00f8086f88700fdbe8bc717e993a21e90711acfed02f2402"}, + {file = "protobuf-3.20.3-cp39-cp39-win32.whl", hash = "sha256:819559cafa1a373b7096a482b504ae8a857c89593cf3a25af743ac9ecbd23480"}, + {file = "protobuf-3.20.3-cp39-cp39-win_amd64.whl", hash = "sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7"}, + {file = "protobuf-3.20.3-py2.py3-none-any.whl", hash = "sha256:a7ca6d488aa8ff7f329d4c545b2dbad8ac31464f1d8b1c87ad1346717731e4db"}, + {file = "protobuf-3.20.3.tar.gz", hash = "sha256:2e3427429c9cffebf259491be0af70189607f365c2f41c7c3764af6f337105f2"}, ] [[package]] @@ -2158,6 +2269,7 @@ description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false python-versions = ">=3.7" files = [ + {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"}, {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"}, {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"}, {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"}, @@ -2183,6 +2295,7 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = 
"psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2191,10 +2304,43 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"}, + {file = 
"psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"}, ] [[package]] @@ -2211,6 +2357,31 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "pyasn1" +version = "0.6.0" +description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyasn1-0.6.0-py2.py3-none-any.whl", hash = "sha256:cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473"}, + {file = "pyasn1-0.6.0.tar.gz", hash = "sha256:3a35ab2c4b5ef98e17dfdec8ab074046fbda76e281c5a706ccd82328cfc8f64c"}, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.0" +description = "A 
collection of ASN.1-based protocols modules" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyasn1_modules-0.4.0-py3-none-any.whl", hash = "sha256:be04f15b66c206eed667e0bb5ab27e2b1855ea54a842e5037738099e8ca4ae0b"}, + {file = "pyasn1_modules-0.4.0.tar.gz", hash = "sha256:831dbcea1b177b28c9baddf4c6d1013c24c3accd14a1873fffaa6a2e905f17b6"}, +] + +[package.dependencies] +pyasn1 = ">=0.4.6,<0.7.0" + [[package]] name = "pycodestyle" version = "2.8.0" @@ -2808,6 +2979,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -2815,8 +2987,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = 
"PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2833,6 +3013,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -2840,6 +3021,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -3047,6 +3229,20 @@ redis = "*" redis-sentinel-url = "*" rq = ">=1.0" +[[package]] +name = "rsa" +version = "4.9" +description = "Pure-Python RSA implementation" +optional = false +python-versions = ">=3.6,<4" +files = [ + {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, + {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, +] + +[package.dependencies] +pyasn1 = ">=0.1.3" + [[package]] name = "safetensors" version = "0.4.3" @@ -3908,4 +4104,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "f3a66804cfb8e94e6e7e8c149a9921e590942da2e03c526041906c5b473ca6fd" +content-hash = "080e1a8ef09819c49742c8270b5c2da81ff49469d77a8cd304567aeba79e0741" diff --git a/pyproject.toml b/pyproject.toml index 7f36bb121f..7b5d309f9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ 
jsonschema = "~4.4.0" orjson = ">=3.8.2,<3.10.0" Pillow = ">=9.3,<10.4" numpy = "~1.26.4" -protobuf = "~3.19.0" +protobuf = "^3.19.0" Pint = "0.22" APScheduler = "~3.10.1" more-itertools = "~8.9.0" @@ -78,6 +78,7 @@ openfoodfacts = "1.1.1" imagehash = "~4.3.1" peewee-migrate = "~1.12.2" diskcache = "~5.6.3" +google-cloud-batch = "^0.17.26" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index ea3531c0a9..64b23fa83e 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -86,6 +86,11 @@ from robotoff.utils.text import get_tag from robotoff.workers.queues import enqueue_job, get_high_queue, low_queue from robotoff.workers.tasks import download_product_dataset_job +from robotoff.batch import ( + BatchJobType, + GoogleBatchJob, + GoogleBatchJobConfig +) logger = get_logger() @@ -1748,6 +1753,24 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.status = falcon.HTTP_200 +class BatchJobResource: + def on_post(self, req: falcon.Request, resp: falcon.Response): + job_type_str: str = req.get_param("job_type", required=True) + + # Batch extraction + + # Launch Batch job + logger.info(f"Start batch with job_type: {job_type_str}") + try: + job_type = BatchJobType[job_type_str] + except KeyError: + raise falcon.HTTPBadRequest(description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}") + + batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) + batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) + resp.media = {"batch_job_details": batch_job} + + def custom_handle_uncaught_exception( req: falcon.Request, resp: falcon.Response, ex: Exception, params ): @@ -1785,7 +1808,7 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predict/nutrition", NutritionPredictorResource()) api.add_route("/api/v1/predict/ocr_prediction", OCRPredictionPredictorResource()) api.add_route("/api/v1/predict/category", CategoryPredictorResource()) -api.add_route("/api/v1/predict/ingredient_list", IngredientListPredictorResource()) +api.add_route("/api/v1/predict/ ", IngredientListPredictorResource()) api.add_route("/api/v1/predict/lang", LanguagePredictorResource()) api.add_route("/api/v1/predict/lang/product", ProductLanguagePredictorResource()) api.add_route("/api/v1/products/dataset", UpdateDatasetResource()) diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py new file mode 100644 index 0000000000..3febe071f0 --- /dev/null +++ b/robotoff/batch/__init__.py @@ -0,0 +1,11 @@ +from .batch import ( + GoogleBatchJob, + GoogleBatchJobConfig, + BatchJobType, +) + +__all__ = [ + "GoogleBatchJob", + "GoogleBatchJobConfig", + "BatchJobType", +] \ No newline at end of file diff --git a/robotoff/batch/batch.py b/robotoff/batch/batch.py new file mode 100644 index 0000000000..a21468aea6 --- /dev/null +++ b/robotoff/batch/batch.py @@ -0,0 +1,206 @@ +import abc +from typing import List, Optional +import enum +import yaml +import datetime + +from google.cloud import batch_v1 +from pydantic import BaseModel, Field + +from robotoff import settings + + +@enum.unique +class BatchJobType(enum.Enum): + """Each job type correspond to a task that will be executed in the batch job.""" + + ingredients_spellcheck = "ingredients_spellcheck" + + +# Paths batch job config files +BATCH_JOB_TYPE_TO_CONFIG_PATH = { + BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR + / "spellcheck.yaml", +} + + +class GoogleBatchJobConfig(BaseModel): + """Batch job 
configuration class.""" + + job_name: str = Field( + description="The name of the job. It needs to be unique amongst exisiting batch job names.", + ) + location: str = Field( + pattern=r"^europe-west\d{1,2}$", + description="The region in which the job will run. Regions that are available for Batch are listed on: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones. We restrict to Europe-West for now.", + ) + entrypoint: Optional[str] = Field( + default=None, + description="The entrypoint for the container. If None, use default entrypoint.", + examples=["python main.py"], + ) + commands: List[str] = Field( + default_factory=list, + description="Commands to run in the container. If None, use default commands. Can be used to add arguments to the job script.", + examples=[["--max_tokens", "1024"]], + ) + cpu_milli: int = Field( + default=1000, + description="The number of CPU milliseconds to allocate to the job. 1000 corresponds to 1 CPU core.", + ge=1000, + ) + memory_mib: int = Field( + default=8000, # 8GB + description="The amount of RAM in MiB to allocate to each CPU core.", + le=64000, + ) + boot_disk_mib: Optional[int] = Field( + default=None, + description="The size of the boot disk in MiB. It is deleted once the job finished. If None, no bootDisk is added.", + le=200000, # 200 GB + ) + max_retry_count: int = Field( + default=1, + ge=1, + description="The maximum number of times a task should be retried in case of failure.", + ) + max_run_duration: str = Field( + pattern=r"^\d{1,5}s$", + default="3600s", + description="The maximum duration of the job in seconds.", + ) + task_count: str = Field( + pattern=r"^\d+$", + default="1", + description="The number of tasks to run in the job.", + ) + parallelism: str = Field( + pattern=r"^\d+$", + default="1", + description="The number of tasks to run in parallel.", + ) + machine_type: str = Field( + description="The machine type to use for the job. Read more about machine types here: https://cloud.google.com/compute/docs/general-purpose-machines", + ) + accelerators_type: str = Field( + description="The type of accelerator to use for the job. Depends on the machine type. Read more about accelerators here: https://cloud.google.com/compute/docs/gpus", + ) + accelerators_count: int = Field( + ge=1, + description="The number of accelerators to use for the job.", + ) + install_gpu_drivers: bool = Field( + default=True, + description="Required if GPUs.", + ) + + @classmethod + def init(cls, job_type: BatchJobType): + """Initialize the class with the configuration file corresponding to the job type. + + :param job_type: Batch job type. + :type job_type: BatchJobType + """ + # Generate unique id for the job + unique_job_name = ( + job_type.name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + ) + + # Load config from job_type + config_path = BATCH_JOB_TYPE_TO_CONFIG_PATH[job_type] + with open(config_path, "r") as f: + config = yaml.safe_load(f) + return cls(job_name=unique_job_name, **config) + + +class BatchJob(abc.ABC): + """Abstract class to launch and manage batch jobs: Google, AWS, Azure, Triton...""" + + @staticmethod + @abc.abstractmethod + def launch_job() -> str: + """Launch batch job.""" + pass + + +class GoogleBatchJob(BatchJob): + """GCP Batch class. It uses the Google Cloud Batch API to launch and manage jobs. 
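+    A typical launch (sketch, mirroring the usage in the BatchJobResource API endpoint added in this change):
+
+        config = GoogleBatchJobConfig.init(job_type=BatchJobType.ingredients_spellcheck)
+        job = GoogleBatchJob.launch_job(batch_job_config=config)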
+ + More information on: + https://cloud.google.com/batch/docs/get-started + """ + + @staticmethod + def launch_job( + batch_job_config: GoogleBatchJobConfig, + ) -> batch_v1.Job: + """This method creates a Batch Job on GCP. + + Method copied from https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create + + :param google_batch_launch_config: Config to run a job on Google Batch. + :type google_batch_launch_config: GoogleBatchLaunchConfig + :param batch_job_config: Config to run a specific job on Google Batch. + :type batch_job_config: BatchJobConfig + :return: Batch job information. + :rtype: batch_v1.Job + + Returns: + A job object representing the job created. + """ + + client = batch_v1.BatchServiceClient() + + # Define what will be done as part of the job. + runnable = batch_v1.Runnable() + runnable.container = batch_v1.Runnable.Container() + runnable.container.image_uri = batch_job_config.container_image_uri + runnable.container.entrypoint = batch_job_config.entrypoint + runnable.container.commands = batch_job_config.commands + + # Jobs can be divided into tasks. In this case, we have only one task. + task = batch_v1.TaskSpec() + task.runnables = [runnable] + + # We can specify what resources are requested by each task. + resources = batch_v1.ComputeResource() + resources.cpu_milli = batch_job_config.cpu_milli + resources.memory_mib = batch_job_config.memory_mib + resources.boot_disk_mib = batch_job_config.boot_disk_mib + task.compute_resource = resources + + task.max_retry_count = batch_job_config.max_retry_count + task.max_run_duration = batch_job_config.max_run_duration + + # Tasks are grouped inside a job using TaskGroups. + group = batch_v1.TaskGroup() + group.task_count = batch_job_config.task_count + group.task_spec = task + + # Policies are used to define on what kind of virtual machines the tasks will run on. 
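+        # The machine type must be compatible with the requested accelerator (the spellcheck
+        # config pairs a g2-standard-8 VM with a single nvidia-l4 GPU), and install_gpu_drivers
+        # asks Batch to install the NVIDIA drivers on the VM before the container starts.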
+ policy = batch_v1.AllocationPolicy.InstancePolicy() + policy.machine_type = batch_job_config.machine_type + instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate() + instances.install_gpu_drivers = batch_job_config.install_gpu_drivers + instances.policy = policy + allocation_policy = batch_v1.AllocationPolicy() + allocation_policy.instances = [instances] + + accelerator = batch_v1.AllocationPolicy.Accelerator() + accelerator.type_ = batch_job_config.accelerators_type + accelerator.count = batch_job_config.accelerators_count + + job = batch_v1.Job() + job.task_groups = [group] + job.allocation_policy = allocation_policy + # We use Cloud Logging as it's an out of the box available option + job.logs_policy = batch_v1.LogsPolicy() + job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING + + create_request = batch_v1.CreateJobRequest() + create_request.job = job + create_request.job_id = batch_job_config.job_name + # The job's parent is the region in which the job will run + create_request.parent = f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" + + return client.create_job(create_request) diff --git a/robotoff/batch/configs/spellcheck.yaml b/robotoff/batch/configs/spellcheck.yaml new file mode 100644 index 0000000000..18562f6f09 --- /dev/null +++ b/robotoff/batch/configs/spellcheck.yaml @@ -0,0 +1,13 @@ +container_image_uri: "europe-west9-docker.pkg.dev/robotoff/gcf-artifacts/spellcheck-batch-vllm" +cpu_milli: 1000 +memory_mib: 32000 +boot_disk_mib: 100000 +max_retry_count: 1 +max_run_duration: "3600s" +task_count: "1" +parallelism: "1" +machine_type: "g2-standard-8" +accelerators_type: "nvidia-l4" +accelerators_count: "1" +install_gpu_drivers: true +location: "europe-west4" diff --git a/robotoff/settings.py b/robotoff/settings.py index 20a6ad5bd8..4db6f20126 100644 --- a/robotoff/settings.py +++ b/robotoff/settings.py @@ -133,6 +133,7 @@ def event_api() -> str: JSONL_DATASET_ETAG_PATH = DATASET_DIR / "products-etag.txt" JSONL_MIN_DATASET_PATH = DATASET_DIR / "products-min.jsonl.gz" DATASET_CHECK_MIN_PRODUCT_COUNT = 2_800_000 +BATCH_JOB_CONFIG_DIR = PROJECT_DIR / "robotoff/batch/configs" # Products JSONL @@ -356,3 +357,6 @@ def get_package_version() -> str: # Domains allowed to be used as image sources while cropping CROP_ALLOWED_DOMAINS = os.environ.get("CROP_ALLOWED_DOMAINS", "").split(",") + +# Batch jobs +GOOGLE_PROJECT_NAME= "robotoff" \ No newline at end of file diff --git a/robotoff/types.py b/robotoff/types.py index 99db6dce6f..8105d2030a 100644 --- a/robotoff/types.py +++ b/robotoff/types.py @@ -74,8 +74,7 @@ class InsightType(str, enum.Enum): """InsightType defines the type of the insight.""" # The 'ingredient spellcheck' insight corrects the spelling in the given - # ingredients list. NOTE: this insight is deprecated until a new spellcheck - # method is developed + # ingredients list. 
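+    # Corrections can now be generated in bulk by the spellcheck batch job (see robotoff/batch).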
ingredient_spellcheck = "ingredient_spellcheck" # The 'packager code' insight extracts the packager code using regex from @@ -359,3 +358,40 @@ class PackagingElementProperty(enum.Enum): LogoLabelType = tuple[str, Optional[str]] InsightAnnotation = Literal[-1, 0, 1, 2] + + + + + +@enum.unique +class Lang(str, enum.Enum): + english = "en" + french = "fr" + german = "de" + spanish = "es" + italian = "it" + portuguese = "pt" + dutch = "nl" + polish = "pl" + russian = "ru" + japanese = "ja" + chinese = "zh" + arabic = "ar" + turkish = "tr" + vietnamese = "vi" + thai = "th" + korean = "ko" + ukrainian = "uk" + indonesian = "id" + hungarian = "hu" + greek = "el" + romanian = "ro" + danish = "da" + swedish = "sv" + norwegian = "no" + finnish = "fi" + bulgarian = "bg" + czech = "cs" + slovak = "sk" + croatian = "hr" + \ No newline at end of file diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py new file mode 100644 index 0000000000..4501ad1277 --- /dev/null +++ b/tests/unit/test_batch.py @@ -0,0 +1,18 @@ +import pytest + +from robotoff.batch import ( + GoogleBatchJobConfig, + BatchJobType, +) + +# Add future job types here for testing. +@pytest.mark.parametrize( + "job_type_str", + [ + "ingredients_spellcheck", + ], +) +def test_batch_job_config_file(job_type_str): + "Test indirectly the batch job config file by validating with the Pydantic class model." + job_type = BatchJobType[job_type_str] + GoogleBatchJobConfig.init(job_type) From eb15bab345d1489cf02a261e66d7f1cc0543d5ed Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 21 Aug 2024 18:14:39 +0200 Subject: [PATCH 02/22] fix(batch-spellcheck): :lipstick: Fix Spellcheck Batch job file name for Dockerfile ENTRYPOINT --- batch/spellcheck/{job.py => main.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename batch/spellcheck/{job.py => main.py} (100%) diff --git a/batch/spellcheck/job.py b/batch/spellcheck/main.py similarity index 100% rename from batch/spellcheck/job.py rename to batch/spellcheck/main.py From d36648b5a9d1227f16ba08525290aa6d45eb9188 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Thu, 22 Aug 2024 19:51:58 +0200 Subject: [PATCH 03/22] feat(batch-spellcheck): :zap: Batch extraction from database before Batch processing operational --- batch/spellcheck/Dockerfile | 2 +- batch/spellcheck/main.py | 8 +- poetry.lock | 195 +++++++++++++++++- pyproject.toml | 2 + robotoff/app/api.py | 28 ++- robotoff/batch/__init__.py | 14 +- robotoff/batch/buckets.py | 71 +++++++ .../configs/{ => job_configs}/spellcheck.yaml | 0 robotoff/batch/configs/sql/spellcheck.sql | 7 + robotoff/batch/extraction.py | 81 ++++++++ robotoff/batch/{batch.py => launch.py} | 3 +- robotoff/utils/buckets.py | 41 ++++ tests/unit/data/dataset_sample.jsonl.gz | 3 + tests/unit/test_batch.py | 31 ++- 14 files changed, 461 insertions(+), 25 deletions(-) create mode 100644 robotoff/batch/buckets.py rename robotoff/batch/configs/{ => job_configs}/spellcheck.yaml (100%) create mode 100644 robotoff/batch/configs/sql/spellcheck.sql create mode 100644 robotoff/batch/extraction.py rename robotoff/batch/{batch.py => launch.py} (99%) create mode 100644 robotoff/utils/buckets.py create mode 100644 tests/unit/data/dataset_sample.jsonl.gz diff --git a/batch/spellcheck/Dockerfile b/batch/spellcheck/Dockerfile index 61b73b5b1b..0c9f31dad7 100644 --- a/batch/spellcheck/Dockerfile +++ b/batch/spellcheck/Dockerfile @@ -6,7 +6,7 @@ ENV PYTHONUNBUFFERED=1 \ WORKDIR /app -COPY job.py /app +COPY main.py /app COPY requirements.txt /app RUN pip install --no-cache-dir -r 
requirements.txt diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 6c629e4c56..3b4d0339a9 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -1,6 +1,7 @@ import argparse import tempfile import logging +import sys from typing import List import pandas as pd @@ -12,6 +13,7 @@ logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler(sys.stdout)], ) FEATURES_VALIDATION = ["code", "text"] @@ -22,8 +24,8 @@ def parse() -> argparse.Namespace: """ parser = argparse.ArgumentParser(description="Spellcheck module.") parser.add_argument("--data_bucket", type=str, default="robotoff-spellcheck", help="Bucket name.") - parser.add_argument("--pre_data_suffix", type=str, default="data/test_data.parquet", help="Dataset suffix containing the data to be processed.") - parser.add_argument("--post_data_suffix", type=str, default="data/test_processed_data.parquet", help="Dataset suffix containing the processed data.") + parser.add_argument("--pre_data_suffix", type=str, default="data/preprocessed_data.parquet", help="Dataset suffix containing the data to be processed.") + parser.add_argument("--post_data_suffix", type=str, default="data/postprocessed_data.parquet", help="Dataset suffix containing the processed data.") parser.add_argument("--model_path", default="openfoodfacts/spellcheck-mistral-7b", type=str, help="HF model path.") parser.add_argument("--max_model_len", default=1024, type=int, help="Maximum model context length. A lower max context length reduces the memory footprint and accelerate the inference.") parser.add_argument("--temperature", default=0, type=float, help="Sampling temperature.") @@ -47,7 +49,7 @@ def main(): LOGGER.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}") data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix) LOGGER.info(f"Feature in uploaded data: {data.columns}") - if not all(feature in data.columns for feature in FEATURES_VALIDATION): + if not all(feature in FEATURES_VALIDATION for feature in data.columns): raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. 
Current features: {data.columns}") instructions = [prepare_instruction(text) for text in data["text"]] diff --git a/poetry.lock b/poetry.lock index c6ddc1774b..5dad07cdf6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -654,6 +654,61 @@ typing-extensions = ">=3.7.4.1" all = ["pytz (>=2019.1)"] dates = ["pytz (>=2019.1)"] +[[package]] +name = "duckdb" +version = "1.0.0" +description = "DuckDB in-process database" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4a8ce2d1f9e1c23b9bab3ae4ca7997e9822e21563ff8f646992663f66d050211"}, + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:19797670f20f430196e48d25d082a264b66150c264c1e8eae8e22c64c2c5f3f5"}, + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:b71c342090fe117b35d866a91ad6bffce61cd6ff3e0cff4003f93fc1506da0d8"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25dd69f44ad212c35ae2ea736b0e643ea2b70f204b8dff483af1491b0e2a4cec"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8da5f293ecb4f99daa9a9352c5fd1312a6ab02b464653a0c3a25ab7065c45d4d"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3207936da9967ddbb60644ec291eb934d5819b08169bc35d08b2dedbe7068c60"}, + {file = "duckdb-1.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1128d6c9c33e883b1f5df6b57c1eb46b7ab1baf2650912d77ee769aaa05111f9"}, + {file = "duckdb-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:02310d263474d0ac238646677feff47190ffb82544c018b2ff732a4cb462c6ef"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:75586791ab2702719c284157b65ecefe12d0cca9041da474391896ddd9aa71a4"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:83bb415fc7994e641344f3489e40430ce083b78963cb1057bf714ac3a58da3ba"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:bee2e0b415074e84c5a2cefd91f6b5ebeb4283e7196ba4ef65175a7cef298b57"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa5a4110d2a499312609544ad0be61e85a5cdad90e5b6d75ad16b300bf075b90"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fa389e6a382d4707b5f3d1bc2087895925ebb92b77e9fe3bfb23c9b98372fdc"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7ede6f5277dd851f1a4586b0c78dc93f6c26da45e12b23ee0e88c76519cbdbe0"}, + {file = "duckdb-1.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0b88cdbc0d5c3e3d7545a341784dc6cafd90fc035f17b2f04bf1e870c68456e5"}, + {file = "duckdb-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd1693cdd15375156f7fff4745debc14e5c54928589f67b87fb8eace9880c370"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c65a7fe8a8ce21b985356ee3ec0c3d3b3b2234e288e64b4cfb03356dbe6e5583"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:e5a8eda554379b3a43b07bad00968acc14dd3e518c9fbe8f128b484cf95e3d16"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:a1b6acdd54c4a7b43bd7cb584975a1b2ff88ea1a31607a2b734b17960e7d3088"}, + {file = "duckdb-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a677bb1b6a8e7cab4a19874249d8144296e6e39dae38fce66a80f26d15e670df"}, + {file = 
"duckdb-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:752e9d412b0a2871bf615a2ede54be494c6dc289d076974eefbf3af28129c759"}, + {file = "duckdb-1.0.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3aadb99d098c5e32d00dc09421bc63a47134a6a0de9d7cd6abf21780b678663c"}, + {file = "duckdb-1.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83b7091d4da3e9301c4f9378833f5ffe934fb1ad2b387b439ee067b2c10c8bb0"}, + {file = "duckdb-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:6a8058d0148b544694cb5ea331db44f6c2a00a7b03776cc4dd1470735c3d5ff7"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e40cb20e5ee19d44bc66ec99969af791702a049079dc5f248c33b1c56af055f4"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7bce1bc0de9af9f47328e24e6e7e39da30093179b1c031897c042dd94a59c8e"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8355507f7a04bc0a3666958f4414a58e06141d603e91c0fa5a7c50e49867fb6d"}, + {file = "duckdb-1.0.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:39f1a46f5a45ad2886dc9b02ce5b484f437f90de66c327f86606d9ba4479d475"}, + {file = "duckdb-1.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a6d29ba477b27ae41676b62c8fae8d04ee7cbe458127a44f6049888231ca58fa"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:1bea713c1925918714328da76e79a1f7651b2b503511498ccf5e007a7e67d49e"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_universal2.whl", hash = "sha256:bfe67f3bcf181edbf6f918b8c963eb060e6aa26697d86590da4edc5707205450"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:dbc6093a75242f002be1d96a6ace3fdf1d002c813e67baff52112e899de9292f"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba1881a2b11c507cee18f8fd9ef10100be066fddaa2c20fba1f9a664245cd6d8"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:445d0bb35087c522705c724a75f9f1c13f1eb017305b694d2686218d653c8142"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:224553432e84432ffb9684f33206572477049b371ce68cc313a01e214f2fbdda"}, + {file = "duckdb-1.0.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d3914032e47c4e76636ad986d466b63fdea65e37be8a6dfc484ed3f462c4fde4"}, + {file = "duckdb-1.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:af9128a2eb7e1bb50cd2c2020d825fb2946fdad0a2558920cd5411d998999334"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:dd2659a5dbc0df0de68f617a605bf12fe4da85ba24f67c08730984a0892087e8"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_universal2.whl", hash = "sha256:ac5a4afb0bc20725e734e0b2c17e99a274de4801aff0d4e765d276b99dad6d90"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:2c5a53bee3668d6e84c0536164589d5127b23d298e4c443d83f55e4150fafe61"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b980713244d7708b25ee0a73de0c65f0e5521c47a0e907f5e1b933d79d972ef6"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21cbd4f9fe7b7a56eff96c3f4d6778770dd370469ca2212eddbae5dd63749db5"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed228167c5d49888c5ef36f6f9cbf65011c2daf9dcb53ea8aa7a041ce567b3e4"}, + {file = 
"duckdb-1.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:46d8395fbcea7231fd5032a250b673cc99352fef349b718a23dea2c0dd2b8dec"}, + {file = "duckdb-1.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:6ad1fc1a4d57e7616944166a5f9417bdbca1ea65c490797e3786e3a42e162d8a"}, + {file = "duckdb-1.0.0.tar.gz", hash = "sha256:a2a059b77bc7d5b76ae9d88e267372deff19c291048d59450c431e166233d453"}, +] + [[package]] name = "elastic-transport" version = "8.13.1" @@ -1051,6 +1106,144 @@ google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev" proto-plus = ">=1.22.3,<2.0.0dev" protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev" +[[package]] +name = "google-cloud-core" +version = "2.4.1" +description = "Google Cloud API client core library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-cloud-core-2.4.1.tar.gz", hash = "sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073"}, + {file = "google_cloud_core-2.4.1-py2.py3-none-any.whl", hash = "sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61"}, +] + +[package.dependencies] +google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-auth = ">=1.25.0,<3.0dev" + +[package.extras] +grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] + +[[package]] +name = "google-cloud-storage" +version = "2.14.0" +description = "Google Cloud Storage API client library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-cloud-storage-2.14.0.tar.gz", hash = "sha256:2d23fcf59b55e7b45336729c148bb1c464468c69d5efbaee30f7201dd90eb97e"}, + {file = "google_cloud_storage-2.14.0-py2.py3-none-any.whl", hash = "sha256:8641243bbf2a2042c16a6399551fbb13f062cbc9a2de38d6c0bb5426962e9dbd"}, +] + +[package.dependencies] +google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-auth = ">=2.23.3,<3.0dev" +google-cloud-core = ">=2.3.0,<3.0dev" +google-crc32c = ">=1.0,<2.0dev" +google-resumable-media = ">=2.6.0" +requests = ">=2.18.0,<3.0.0dev" + +[package.extras] +protobuf = ["protobuf (<5.0.0dev)"] + +[[package]] +name = "google-crc32c" +version = "1.5.0" +description = "A python wrapper of the C library 'Google CRC32C'" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-crc32c-1.5.0.tar.gz", hash = "sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7"}, + {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13"}, + {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:be82c3c8cfb15b30f36768797a640e800513793d6ae1724aaaafe5bf86f8f346"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:461665ff58895f508e2866824a47bdee72497b091c730071f2b7575d5762ab65"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2096eddb4e7c7bdae4bd69ad364e55e07b8316653234a56552d9c988bd2d61b"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:116a7c3c616dd14a3de8c64a965828b197e5f2d121fedd2f8c5585c547e87b02"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5829b792bf5822fd0a6f6eb34c5f81dd074f01d570ed7f36aa101d6fc7a0a6e4"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:64e52e2b3970bd891309c113b54cf0e4384762c934d5ae56e283f9a0afcd953e"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:02ebb8bf46c13e36998aeaad1de9b48f4caf545e91d14041270d9dca767b780c"}, + {file = "google_crc32c-1.5.0-cp310-cp310-win32.whl", hash = "sha256:2e920d506ec85eb4ba50cd4228c2bec05642894d4c73c59b3a2fe20346bd00ee"}, + {file = "google_crc32c-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:07eb3c611ce363c51a933bf6bd7f8e3878a51d124acfc89452a75120bc436289"}, + {file = "google_crc32c-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cae0274952c079886567f3f4f685bcaf5708f0a23a5f5216fdab71f81a6c0273"}, + {file = "google_crc32c-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1034d91442ead5a95b5aaef90dbfaca8633b0247d1e41621d1e9f9db88c36298"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c42c70cd1d362284289c6273adda4c6af8039a8ae12dc451dcd61cdabb8ab57"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8485b340a6a9e76c62a7dce3c98e5f102c9219f4cfbf896a00cf48caf078d438"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77e2fd3057c9d78e225fa0a2160f96b64a824de17840351b26825b0848022906"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f583edb943cf2e09c60441b910d6a20b4d9d626c75a36c8fcac01a6c96c01183"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:a1fd716e7a01f8e717490fbe2e431d2905ab8aa598b9b12f8d10abebb36b04dd"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:72218785ce41b9cfd2fc1d6a017dc1ff7acfc4c17d01053265c41a2c0cc39b8c"}, + {file = "google_crc32c-1.5.0-cp311-cp311-win32.whl", hash = "sha256:66741ef4ee08ea0b2cc3c86916ab66b6aef03768525627fd6a1b34968b4e3709"}, + {file = "google_crc32c-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:ba1eb1843304b1e5537e1fca632fa894d6f6deca8d6389636ee5b4797affb968"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:98cb4d057f285bd80d8778ebc4fde6b4d509ac3f331758fb1528b733215443ae"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19e0a019d2c4dcc5e598cd4a4bc7b008546b0358bd322537c74ad47a5386884f"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02c65b9817512edc6a4ae7c7e987fea799d2e0ee40c53ec573a692bee24de876"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6ac08d24c1f16bd2bf5eca8eaf8304812f44af5cfe5062006ec676e7e1d50afc"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3359fc442a743e870f4588fcf5dcbc1bf929df1fad8fb9905cd94e5edb02e84c"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e986b206dae4476f41bcec1faa057851f3889503a70e1bdb2378d406223994a"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:de06adc872bcd8c2a4e0dc51250e9e65ef2ca91be023b9d13ebd67c2ba552e1e"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-win32.whl", hash = "sha256:d3515f198eaa2f0ed49f8819d5732d70698c3fa37384146079b3799b97667a94"}, + {file = 
"google_crc32c-1.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:67b741654b851abafb7bc625b6d1cdd520a379074e64b6a128e3b688c3c04740"}, + {file = "google_crc32c-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c02ec1c5856179f171e032a31d6f8bf84e5a75c45c33b2e20a3de353b266ebd8"}, + {file = "google_crc32c-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:edfedb64740750e1a3b16152620220f51d58ff1b4abceb339ca92e934775c27a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84e6e8cd997930fc66d5bb4fde61e2b62ba19d62b7abd7a69920406f9ecca946"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:024894d9d3cfbc5943f8f230e23950cd4906b2fe004c72e29b209420a1e6b05a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:998679bf62b7fb599d2878aa3ed06b9ce688b8974893e7223c60db155f26bd8d"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:83c681c526a3439b5cf94f7420471705bbf96262f49a6fe546a6db5f687a3d4a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4c6fdd4fccbec90cc8a01fc00773fcd5fa28db683c116ee3cb35cd5da9ef6c37"}, + {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5ae44e10a8e3407dbe138984f21e536583f2bba1be9491239f942c2464ac0894"}, + {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37933ec6e693e51a5b07505bd05de57eee12f3e8c32b07da7e73669398e6630a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-win32.whl", hash = "sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4"}, + {file = "google_crc32c-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:74dea7751d98034887dbd821b7aae3e1d36eda111d6ca36c206c44478035709c"}, + {file = "google_crc32c-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c6c777a480337ac14f38564ac88ae82d4cd238bf293f0a22295b66eb89ffced7"}, + {file = "google_crc32c-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:759ce4851a4bb15ecabae28f4d2e18983c244eddd767f560165563bf9aefbc8d"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f13cae8cc389a440def0c8c52057f37359014ccbc9dc1f0827936bcd367c6100"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e560628513ed34759456a416bf86b54b2476c59144a9138165c9a1575801d0d9"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1674e4307fa3024fc897ca774e9c7562c957af85df55efe2988ed9056dc4e57"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:278d2ed7c16cfc075c91378c4f47924c0625f5fc84b2d50d921b18b7975bd210"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d5280312b9af0976231f9e317c20e4a61cd2f9629b7bfea6a693d1878a264ebd"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8b87e1a59c38f275c0e3676fc2ab6d59eccecfd460be267ac360cc31f7bcde96"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7c074fece789b5034b9b1404a1f8208fc2d4c6ce9decdd16e8220c5a793e6f61"}, + {file = "google_crc32c-1.5.0-cp39-cp39-win32.whl", hash = "sha256:7f57f14606cd1dd0f0de396e1e53824c371e9544a822648cd76c034d209b559c"}, + {file = "google_crc32c-1.5.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:a2355cba1f4ad8b6988a4ca3feed5bff33f6af2d7f134852cf279c2aebfde541"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f314013e7dcd5cf45ab1945d92e713eec788166262ae8deb2cfacd53def27325"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b747a674c20a67343cb61d43fdd9207ce5da6a99f629c6e2541aa0e89215bcd"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8f24ed114432de109aa9fd317278518a5af2d31ac2ea6b952b2f7782b43da091"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8667b48e7a7ef66afba2c81e1094ef526388d35b873966d8a9a447974ed9178"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:1c7abdac90433b09bad6c43a43af253e688c9cfc1c86d332aed13f9a7c7f65e2"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6f998db4e71b645350b9ac28a2167e6632c239963ca9da411523bb439c5c514d"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c99616c853bb585301df6de07ca2cadad344fd1ada6d62bb30aec05219c45d2"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ad40e31093a4af319dadf503b2467ccdc8f67c72e4bcba97f8c10cb078207b5"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd67cf24a553339d5062eff51013780a00d6f97a39ca062781d06b3a73b15462"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:398af5e3ba9cf768787eef45c803ff9614cc3e22a5b2f7d7ae116df8b11e3314"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b1f8133c9a275df5613a451e73f36c2aea4fe13c5c8997e22cf355ebd7bd0728"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ba053c5f50430a3fcfd36f75aff9caeba0440b2d076afdb79a318d6ca245f88"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:272d3892a1e1a2dbc39cc5cde96834c236d5327e2122d3aaa19f6614531bb6eb"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:635f5d4dd18758a1fbd1049a8e8d2fee4ffed124462d837d1a02a0e009c3ab31"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c672d99a345849301784604bfeaeba4db0c7aae50b95be04dd651fd2a7310b93"}, +] + +[package.extras] +testing = ["pytest"] + +[[package]] +name = "google-resumable-media" +version = "2.7.2" +description = "Utilities for Google Media Downloads and Resumable Uploads" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa"}, + {file = "google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0"}, +] + +[package.dependencies] +google-crc32c = ">=1.0,<2.0dev" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)", "google-auth (>=1.22.0,<2.0dev)"] +requests = ["requests (>=2.18.0,<3.0.0dev)"] + [[package]] name = "googleapis-common-protos" version = "1.63.2" @@ -4104,4 +4297,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = 
"080e1a8ef09819c49742c8270b5c2da81ff49469d77a8cd304567aeba79e0741" +content-hash = "5ee1d05103d9616c3968c619f716555dc4d151a7f02d0580bb11c06c26dd3612" diff --git a/pyproject.toml b/pyproject.toml index d115e16f22..9f99676272 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,8 @@ imagehash = "~4.3.1" peewee-migrate = "~1.12.2" diskcache = "~5.6.3" google-cloud-batch = "^0.17.26" +duckdb = "1.0.0" +google-cloud-storage = "<2.18.2" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index e8f7c67d95..4cf3c5eaa7 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -89,7 +89,9 @@ from robotoff.batch import ( BatchJobType, GoogleBatchJob, - GoogleBatchJobConfig + GoogleBatchJobConfig, + BatchExtraction, + GoogleStorageBucketForBatchJob, ) logger = get_logger() @@ -1763,16 +1765,25 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): class BatchJobResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) - - # Batch extraction - - # Launch Batch job - logger.info(f"Start batch with job_type: {job_type_str}") + try: job_type = BatchJobType[job_type_str] except KeyError: raise falcon.HTTPBadRequest(description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}") - + + # Batch extraction + with tempfile.TemporaryDirectory() as tmp_dir: + BatchExtraction.extract_from_dataset( + job_type=job_type, + output_dir=tmp_dir, + ) + if not BatchExtraction.extracted_file_path: + raise ValueError("The extracted file was not found.") + + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) + + # Launch batch job batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) resp.media = {"batch_job_details": batch_job} @@ -1815,7 +1826,7 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predict/nutrition", NutritionPredictorResource()) api.add_route("/api/v1/predict/ocr_prediction", OCRPredictionPredictorResource()) api.add_route("/api/v1/predict/category", CategoryPredictorResource()) -api.add_route("/api/v1/predict/ ", IngredientListPredictorResource()) +api.add_route("/api/v1/predict/ingredient_list", IngredientListPredictorResource()) api.add_route("/api/v1/predict/lang", LanguagePredictorResource()) api.add_route("/api/v1/predict/lang/product", ProductLanguagePredictorResource()) api.add_route("/api/v1/products/dataset", UpdateDatasetResource()) @@ -1845,3 +1856,4 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) api.add_route("/robots.txt", RobotsTxtResource()) +api.add_route("/api/v1/batch/launch", BatchJobResource()) \ No newline at end of file diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 3febe071f0..7bb0a17d87 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,11 +1,11 @@ -from .batch import ( +from .launch import ( GoogleBatchJob, GoogleBatchJobConfig, BatchJobType, ) - -__all__ = [ - "GoogleBatchJob", - "GoogleBatchJobConfig", - "BatchJobType", -] \ No newline at end of file +from .extraction import ( + BatchExtraction, +) +from .buckets import ( + GoogleStorageBucketForBatchJob, +) diff --git a/robotoff/batch/buckets.py 
b/robotoff/batch/buckets.py
new file mode 100644
index 0000000000..19c5ae1d02
--- /dev/null
+++ b/robotoff/batch/buckets.py
@@ -0,0 +1,71 @@
+import io
+
+from robotoff.utils.buckets import GoogleStorageBucket
+from robotoff.batch import BatchJobType
+
+
+BATCH_JOB_TYPE_TO_BUCKET = {
+    BatchJobType.ingredients_spellcheck: {
+        "bucket": "robotoff-spellcheck",
+        "suffix_preprocess": "data/preprocessed_data.parquet",
+        "suffix_postprocess": "data/postprocessed_data.parquet",
+    },
+}
+
+
+class GoogleStorageBucketForBatchJob(GoogleStorageBucket):
+    """Class to handle the Google Storage bucket for depending on the batch job.
+
+    :param bucket: Bucket name
+    :type bucket: str
+    :param suffix_preprocess: Path inside the bucket before batch processing.
+    :type suffix_preprocess: str
+    :param suffix_postprocess: Path inside the bucket after batch processing.
+    :type suffix_postprocess: str
+    """
+
+    def __init__(
+        self,
+        bucket: str,
+        suffix_preprocess: str,
+        suffix_postprocess: str,
+    ) -> None:
+        self.bucket = bucket
+        self.suffix_preprocess = suffix_preprocess
+        self.suffix_postprocess = suffix_postprocess
+
+    @classmethod
+    def from_job_type(cls, job_type:BatchJobType) -> "GoogleStorageBucketForBatchJob":
+        """Initialize the class with the configuration file corresponding to the batch job type.
+        Useful to adapt bucket upload and download during the batch job process.
+
+        :param job_type: Batch job type.
+        :type job_type: BatchJobType
+        :return: Instantiated class.
+        :rtype: GoogleStorageBucketForBatchJob
+        """
+        try:
+            bucket_dict = BATCH_JOB_TYPE_TO_BUCKET[job_type]
+        except KeyError:
+            raise ValueError(f"Batch job type {job_type} not found in the configuration. Expected {BATCH_JOB_TYPE_TO_BUCKET}.")
+        return cls(**bucket_dict)
+
+    def upload_file(self, file_path: str):
+        """Upload file to the bucket.
+
+        :param file_path: File path to upload.
+        :type file_path: str
+        """
+        self.upload_gcs(
+            file_path=file_path,
+            bucket_name=self.bucket,
+            suffix=self.suffix_preprocess,
+        )
+
+    def download_file(self) -> io.BufferedReader:
+        """Download file from bucket
+        """
+        return self.download_gcs(
+            bucket_name=self.bucket,
+            suffix=self.suffix_postprocess,
+        )
diff --git a/robotoff/batch/configs/spellcheck.yaml b/robotoff/batch/configs/job_configs/spellcheck.yaml
similarity index 100%
rename from robotoff/batch/configs/spellcheck.yaml
rename to robotoff/batch/configs/job_configs/spellcheck.yaml
diff --git a/robotoff/batch/configs/sql/spellcheck.sql b/robotoff/batch/configs/sql/spellcheck.sql
new file mode 100644
index 0000000000..5e53938a49
--- /dev/null
+++ b/robotoff/batch/configs/sql/spellcheck.sql
@@ -0,0 +1,7 @@
+SELECT code, ingredients_text AS text, product_name, (CAST(unknown_ingredients_n AS FLOAT) / CAST(ingredients_n AS FLOAT)) AS fraction
+FROM read_ndjson('DATASET_PATH', ignore_errors=True)
+WHERE ingredients_text NOT LIKE ''
+AND fraction > 0 AND fraction <= 0.4
+ORDER BY random()
+LIMIT 100
+;
\ No newline at end of file
diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py
new file mode 100644
index 0000000000..d4f517836f
--- /dev/null
+++ b/robotoff/batch/extraction.py
@@ -0,0 +1,81 @@
+import os
+from pathlib import Path
+
+import duckdb
+
+from robotoff import settings
+from robotoff.batch import BatchJobType
+
+
+BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = {
+    BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql",
+}
+
+
+class BatchExtraction:
+    """Handle batch extraction from the dataset.
+    Extraction varies regarding the batch job.
+    """
+
+    file_name: str = "batch.parquet"
+    extracted_file_path: str = None
+
+    @classmethod
+    def extract_from_dataset(
+        cls,
+        job_type: BatchJobType,
+        output_dir: str,
+        dataset_path: str = str(settings.JSONL_DATASET_PATH),
+    ) -> None:
+        """Using SQL queries, extract data from the dataset and save it as a parquet file.
+
+        :param job_type: Batch job type.
+        :type job_type: BatchJobType
+        :param output_dir: Directory to save the extracted data.
+        :type output_dir: str
+        :param dataset_path: Path to the jsonl.gz dataset.
+        :type dataset_path: Path, optional. Default to settings.JSONL_DATASET_PATH. Mainly used for testing.
+        """
+        if not isinstance(dataset_path, str):
+            raise ValueError(f"The dataset path should be a string. Current type {type(dataset_path)}")
+
+        query_file_path = BATCH_JOB_TYPE_TO_QUERY_FILE_PATH[job_type]
+        query = cls._load_query(query_file_path=query_file_path, dataset_path=dataset_path)
+        cls._extract_and_save_batch_data(query=query, output_dir=output_dir)
+        # We save the file path for later usage in the pipeline
+        cls.extracted_file_path = os.path.join(output_dir, cls.file_name)
+
+    @staticmethod
+    def _load_query(query_file_path: Path, dataset_path: str) -> str:
+        """Load the SQL query from a corresponding file.
+
+        :param query_file_path: File path containing the SQL query.
+        :type query_file_path: Path
+        :param dataset_path: Path to the jsonl.gz dataset.
+        :type dataset_path: Path
+        :raises ValueError: In case the Dataset path is not found in the SQL query.
+        :return: the SQL/DuckDB query.
+        :rtype: str
+        """
+        query = query_file_path.read_text()
+        if "DATASET_PATH" not in query:
+            raise ValueError(
+                "The SQL query should contain the string 'DATASET_PATH' to replace it with the dataset path."
+            )
+        query = query.replace("DATASET_PATH", dataset_path)
+        return query
+
+    @classmethod
+    def _extract_and_save_batch_data(cls, query: str, output_dir: str) -> None:
+        """Query and save the data.
+
+        :param query: DuckDB/SQL query.
+        :type query: str
+        :param output_dir: Extracted data directory
+        :type output_dir: str
+        """
+        (
+            duckdb
+            .sql(query)
+            .write_parquet(os.path.join(output_dir, cls.file_name))
+        )
diff --git a/robotoff/batch/batch.py b/robotoff/batch/launch.py
similarity index 99%
rename from robotoff/batch/batch.py
rename to robotoff/batch/launch.py
index a21468aea6..8b00f097aa 100644
--- a/robotoff/batch/batch.py
+++ b/robotoff/batch/launch.py
@@ -19,8 +19,7 @@ class BatchJobType(enum.Enum):
 
 # Paths batch job config files
 BATCH_JOB_TYPE_TO_CONFIG_PATH = {
-    BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR
-    / "spellcheck.yaml",
+    BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml",
 }
diff --git a/robotoff/utils/buckets.py b/robotoff/utils/buckets.py
new file mode 100644
index 0000000000..73c9295afa
--- /dev/null
+++ b/robotoff/utils/buckets.py
@@ -0,0 +1,41 @@
+import io
+from typing import Any
+
+from google.cloud import storage
+
+
+class GoogleStorageBucket:
+
+    @staticmethod
+    def download_gcs(bucket_name: str, suffix: str) -> io.BufferedReader:
+        """Download file from Google Storage Bucket.
+ + :param bucket_name: Bucket name + :type bucket_name: str + :param suffix: Path inside the bucket + :type suffix: str + :return: + :rtype: Any + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + with blob.open("rb") as f: + return f + + + @staticmethod + def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: + """Upload file to Google Storage Bucket. + + :param file_path: File path. + :type file_path: str + :param bucket_name: Bucket name. + :type bucket_name: str + :param suffix: Path inside the bucket. + :type suffix: str + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + blob.upload_from_filename(filename=file_path) diff --git a/tests/unit/data/dataset_sample.jsonl.gz b/tests/unit/data/dataset_sample.jsonl.gz new file mode 100644 index 0000000000..abb852f4d6 --- /dev/null +++ b/tests/unit/data/dataset_sample.jsonl.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7148cdb2415a8156e39cf0f876b998676d1b2489d5c3ce269407c93769e7151f +size 262180 diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index 4501ad1277..25f8c94e66 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -1,18 +1,43 @@ import pytest +import tempfile +from pathlib import Path from robotoff.batch import ( GoogleBatchJobConfig, BatchJobType, + BatchExtraction, ) + +DIR = Path(__file__).parent +JOB_TYPES = [ + "ingredients_spellcheck", +] + + # Add future job types here for testing. @pytest.mark.parametrize( "job_type_str", - [ - "ingredients_spellcheck", - ], + JOB_TYPES, ) def test_batch_job_config_file(job_type_str): "Test indirectly the batch job config file by validating with the Pydantic class model." job_type = BatchJobType[job_type_str] GoogleBatchJobConfig.init(job_type) + + +# Add future job types here for testing. +@pytest.mark.parametrize( + "job_type_str", + JOB_TYPES, +) +def test_batch_extraction(job_type_str): + """Test extraction of a batch of data from the dataset depending on the job type. 
+ """ + job_type_str = BatchJobType[job_type_str] + with tempfile.TemporaryDirectory() as tmp_dir: + BatchExtraction.extract_from_dataset( + job_type=job_type_str, + output_dir=tmp_dir, + dataset_path=str(DIR / "data/dataset_sample.jsonl.gz"), + ) From c14338d0ae9eff3ac26255db8b57c196dd9581aa Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Fri, 23 Aug 2024 11:38:00 +0200 Subject: [PATCH 04/22] refactor(batch-spellcheck): :green_heart: Fix some bugs: batch-extraction & batch-launch --- batch/spellcheck/README.md | 1 + robotoff/batch/buckets.py | 6 +++--- robotoff/batch/extraction.py | 2 +- robotoff/batch/launch.py | 26 +++++++++++++++++++------- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/batch/spellcheck/README.md b/batch/spellcheck/README.md index 975f63733e..fb2898d4d4 100644 --- a/batch/spellcheck/README.md +++ b/batch/spellcheck/README.md @@ -11,6 +11,7 @@ * A100: a2-highgpu-1g: $3.748064 * A100/Cuda doesn't support FP8 * A100 has less availability than L4: need to wait for batch job (can be long) +* Don't forget to enable **Batch & Storage API** if used without gcloud ## Links diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index 19c5ae1d02..d27b1c2d73 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -35,9 +35,9 @@ def __init__( self.suffix_postprocess = suffix_postprocess @classmethod - def from_job_type(cls, job_type:BatchJobType) -> "GoogleStorageBucketForBatchJob": - """Initialize the class with the configuration file corresponding to the batch job type. - Useful to adapt bucket upload and download during the batch job process. + def from_job_type(cls, job_type: BatchJobType) -> "GoogleStorageBucketForBatchJob": + """Initialize the class with the bucket and suffix names corresponding to the batch job type. + Used to adapt bucket upload and download during the batch job process. :param job_type: Batch job type. :type job_type: BatchJobType diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index d4f517836f..bc75be9d95 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -31,7 +31,7 @@ def extract_from_dataset( :param job_type: Batch job type. :type job_type: BatchJobType - :param output_dir: Directory to save the extracted data. + :param output_dir: Directory to save the extracted data as a parquet file. :type output_dir: str :param dataset_path: Path to the jsonl.gz dataset. :type dataset_path: Path, optional. Default to settings.JSONL_DATASET_PATH. Mainly used for testing. diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index 8b00f097aa..d8467c6e0c 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -3,9 +3,10 @@ import enum import yaml import datetime +import re from google.cloud import batch_v1 -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, ConfigDict from robotoff import settings @@ -14,7 +15,7 @@ class BatchJobType(enum.Enum): """Each job type correspond to a task that will be executed in the batch job.""" - ingredients_spellcheck = "ingredients_spellcheck" + ingredients_spellcheck = "ingredients-spellcheck" # Paths batch job config files @@ -25,6 +26,8 @@ class BatchJobType(enum.Enum): class GoogleBatchJobConfig(BaseModel): """Batch job configuration class.""" + # By default, extra fields are just ignored. We raise an error in case of extra fields. + model_config: ConfigDict = {"extra": "forbid"} job_name: str = Field( description="The name of the job. 
It needs to be unique amongst exisiting batch job names.", @@ -33,6 +36,9 @@ class GoogleBatchJobConfig(BaseModel): pattern=r"^europe-west\d{1,2}$", description="The region in which the job will run. Regions that are available for Batch are listed on: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones. We restrict to Europe-West for now.", ) + container_image_uri: str = Field( + description="The URI of the container image to use for the job. SHould be a valid Image URI.", + ) entrypoint: Optional[str] = Field( default=None, description="The entrypoint for the container. If None, use default entrypoint.", @@ -100,12 +106,16 @@ def init(cls, job_type: BatchJobType): :param job_type: Batch job type. :type job_type: BatchJobType """ + # Batch job name should respect a specific pattern, or returns an error + pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$" + if not re.match(pattern, job_type.value): + raise ValueError(f"Job name should respect the pattern: {pattern}. Current job name: {job_type.value}") + # Generate unique id for the job unique_job_name = ( - job_type.name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + job_type.value + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") ) - - # Load config from job_type + # Load config file from job_type config_path = BATCH_JOB_TYPE_TO_CONFIG_PATH[job_type] with open(config_path, "r") as f: config = yaml.safe_load(f) @@ -135,8 +145,10 @@ def launch_job( ) -> batch_v1.Job: """This method creates a Batch Job on GCP. - Method copied from https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create - + Sources: + * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create + * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types + :param google_batch_launch_config: Config to run a job on Google Batch. :type google_batch_launch_config: GoogleBatchLaunchConfig :param batch_job_config: Config to run a specific job on Google Batch. 
From 6c83b8c9141314305da52d20acb2b43122ec4db2 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Sat, 24 Aug 2024 18:03:52 +0200 Subject: [PATCH 05/22] feat(batch - spellcheck): :zap: From predictions to insights --- robotoff/app/api.py | 28 ++++++++++++++++-- robotoff/batch/__init__.py | 1 + robotoff/batch/importer.py | 55 +++++++++++++++++++++++++++++++++++ robotoff/insights/importer.py | 29 ++++++++++++++++++ 4 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 robotoff/batch/importer.py diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 4cf3c5eaa7..5daf219fb3 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -92,6 +92,8 @@ GoogleBatchJobConfig, BatchExtraction, GoogleStorageBucketForBatchJob, + generate_predictions_from_batch, + ) logger = get_logger() @@ -1762,7 +1764,7 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.status = falcon.HTTP_200 -class BatchJobResource: +class BatchJobLaunchResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) @@ -1779,7 +1781,6 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): ) if not BatchExtraction.extracted_file_path: raise ValueError("The extracted file was not found.") - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) @@ -1789,6 +1790,27 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): resp.media = {"batch_job_details": batch_job} +class BatchJobImportResource: + def on_post(self, req: falcon.Request, resp: falcon.Response): + job_type_str: str = req.get_param("job_type", required=True) + + from robotoff.insights.importer import import_insights + try: + job_type = BatchJobType[job_type_str] + except KeyError: + raise falcon.HTTPBadRequest( + description=f"invalid job_type: {job_type_str}. 
Valid job_types are: {[elt.value for elt in BatchJobType]}" + ) + + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + predictions = generate_predictions_from_batch( + bucket_handler.download_file, + job_type + ) + with db: + import_insights(predictions=predictions, server_type="off") + + def custom_handle_uncaught_exception( req: falcon.Request, resp: falcon.Response, ex: Exception, params ): @@ -1856,4 +1878,4 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) api.add_route("/robots.txt", RobotsTxtResource()) -api.add_route("/api/v1/batch/launch", BatchJobResource()) \ No newline at end of file +api.add_route("/api/v1/batch/launch", BatchJobLaunchResource()) \ No newline at end of file diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 7bb0a17d87..d9470f8e2b 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -9,3 +9,4 @@ from .buckets import ( GoogleStorageBucketForBatchJob, ) +from .importer import generate_predictions_from_batch diff --git a/robotoff/batch/importer.py b/robotoff/batch/importer.py new file mode 100644 index 0000000000..d8df8d48ec --- /dev/null +++ b/robotoff/batch/importer.py @@ -0,0 +1,55 @@ +import io +from typing import Iterator + +import pandas as pd + +from robotoff.batch import BatchJobType +from robotoff.types import Prediction, PredictionType + + +BATCH_JOB_TYPE_TO_FEATURES = { + BatchJobType.ingredients_spellcheck: { + "barcode": "code", + "value": "correction", + "value_tag": "lang", + }, +} + +BATCH_JOB_TYPE_TO_PREDICTION_TYPE = { + BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, +} + +PREDICTOR_VERSION = "1" + + +def generate_predictions_from_batch( + f: io.BufferedReader, + job_type: BatchJobType +) -> Iterator[Prediction]: + """From a file imported from google storage, generate predictions depending on the job type. + + :param f: Readable object. Should be a parquet file. + :type f: io.BufferedReader + :param job_type: Batch job type. + :type job_type: BatchJobType + :rtype: Iterable[Prediction] + :yield: Predictions. 
+ :rtype: Iterator[Prediction] + """ + features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type] + prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type] + + try: + df = pd.read_parquet(f) + except Exception as e: + raise ValueError(f"Failed to read parquet file: {e}") + + for _, row in df.iterrows(): + yield Prediction( + type=prediction_type, + value=row[features_dict["value"]], + value_tag=[features_dict["value_tag"]], + barcode=row[features_dict["barcode"]], + predictor_version=PREDICTOR_VERSION, + predictor="llm", + ) diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index bc5cdcebde..275393b9eb 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1475,6 +1475,35 @@ def compute_crop_bounding_box( return results +class IngredientsSpellcheckImporter(InsightImporter): + + @staticmethod + def get_type() -> InsightType: + return InsightType.ingredient_spellcheck + + @classmethod + def get_required_prediction_types(cls) -> set[PredictionType]: + return {PredictionType.ingredient_spellcheck} + + @classmethod + def generate_candidates( + cls, + product: Optional[Product], + predictions: list[Prediction], + product_id: ProductIdentifier, + ) -> Iterator[ProductInsight]: + # No reason to have different candidates for now + candidate = predictions[0] + yield ProductInsight(**candidate.to_dict()) + + @classmethod + def is_conflicting_insight( + cls, + candidate: ProductInsight, + reference: ProductInsight + ) -> bool: + candidate.value == reference.value + class PackagingElementTaxonomyException(Exception): pass From a369a5937cb174ec4fdbe8a6e62ff30680a6505e Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Mon, 26 Aug 2024 19:02:40 +0200 Subject: [PATCH 06/22] feat(batch - spellcheck): :zap: API endpoint batch/launch ok: Batch extraction with DuckDB and launch on GCP . --- batch/spellcheck/main.py | 2 +- credentials/.gitkeep | 0 docker-compose.yml | 5 ++- poetry.lock | 78 ++++++++++++++++++++++++++++++++++-- pyproject.toml | 1 + robotoff/app/api.py | 34 ++++++---------- robotoff/batch/__init__.py | 33 +++++++++++++++ robotoff/batch/extraction.py | 4 ++ robotoff/batch/launch.py | 2 +- 9 files changed, 132 insertions(+), 27 deletions(-) create mode 100644 credentials/.gitkeep diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 3b4d0339a9..6853ea288b 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -49,7 +49,7 @@ def main(): LOGGER.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}") data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix) LOGGER.info(f"Feature in uploaded data: {data.columns}") - if not all(feature in FEATURES_VALIDATION for feature in data.columns): + if not all(feature in data.columns for feature in FEATURES_VALIDATION): raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. 
Current features: {data.columns}") instructions = [prepare_instruction(text) for text in data["text"]] diff --git a/credentials/.gitkeep b/credentials/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docker-compose.yml b/docker-compose.yml index 60a914f0fb..83a5022945 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,7 @@ x-robotoff-base-volumes: - ./cache:/opt/robotoff/cache - ./datasets:/opt/robotoff/datasets - ./models:/opt/robotoff/models + - ./credentials:/opt/credentials x-robotoff-base: &robotoff-base @@ -53,7 +54,9 @@ x-robotoff-base-env: IMAGE_MODERATION_SERVICE_URL: CROP_ALLOWED_DOMAINS: NUM_RQ_WORKERS: 4 # Update worker service command accordingly if you change this settings - + GOOGLE_APPLICATION_CREDENTIALS: /opt/credentials/google/application_default_credentials.json + GOOGLE_CLOUD_PROJECT: "robotoff" + x-robotoff-worker-base: &robotoff-worker restart: $RESTART_POLICY diff --git a/poetry.lock b/poetry.lock index 5dad07cdf6..11fcef4863 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2117,8 +2117,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -2211,6 +2211,78 @@ files = [ {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] +[[package]] +name = "pandas" +version = "2.2.2" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = 
"pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"}, + {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", 
"xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + [[package]] name = "pathspec" version = "0.12.1" @@ -2612,8 +2684,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4297,4 +4369,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "5ee1d05103d9616c3968c619f716555dc4d151a7f02d0580bb11c06c26dd3612" +content-hash = "30d2c1f1df11a9fdbecd73ec5c64732a361053cf7b350350d433def955691df8" diff --git a/pyproject.toml b/pyproject.toml index 9f99676272..7926145882 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ diskcache = "~5.6.3" google-cloud-batch = "^0.17.26" duckdb = "1.0.0" google-cloud-storage = "<2.18.2" +pandas = "^2.2.2" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 5daf219fb3..531cde9e4c 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -87,10 +87,8 @@ from robotoff.workers.queues import enqueue_job, get_high_queue, low_queue from robotoff.workers.tasks import download_product_dataset_job from robotoff.batch import ( - BatchJobType, - GoogleBatchJob, - GoogleBatchJobConfig, - BatchExtraction, + BatchJobType, + launch_batch_job, GoogleStorageBucketForBatchJob, generate_predictions_from_batch, @@ -1771,23 +1769,17 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): try: job_type = BatchJobType[job_type_str] except KeyError: - raise falcon.HTTPBadRequest(description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}") - - # Batch extraction - with tempfile.TemporaryDirectory() as tmp_dir: - BatchExtraction.extract_from_dataset( - job_type=job_type, - output_dir=tmp_dir, + raise falcon.HTTPBadRequest( + description=f"invalid job_type: {job_type_str}. 
Valid job_types are: {[elt.value for elt in BatchJobType]}" ) - if not BatchExtraction.extracted_file_path: - raise ValueError("The extracted file was not found.") - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) - - # Launch batch job - batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) - batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) - resp.media = {"batch_job_details": batch_job} + # Batch extraction can take some time, so we queue it + enqueue_job( + launch_batch_job, + queue=low_queue, + job_type=job_type, + job_kwargs={"timeout": "10m"}, + ) + logger.info("Batch job %s queued", job_type) class BatchJobImportResource: @@ -1804,7 +1796,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) predictions = generate_predictions_from_batch( - bucket_handler.download_file, + bucket_handler.download_file(), job_type ) with db: diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index d9470f8e2b..c7e681bf5d 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,3 +1,5 @@ +import tempfile + from .launch import ( GoogleBatchJob, GoogleBatchJobConfig, @@ -10,3 +12,34 @@ GoogleStorageBucketForBatchJob, ) from .importer import generate_predictions_from_batch + +from robotoff.utils import get_logger + + +LOGGER = get_logger(__name__) + + +def launch_batch_job(job_type: BatchJobType) -> None: + """_summary_ + + :param job_type: _description_ + :type job_type: BatchJobType + """ + with tempfile.TemporaryDirectory() as tmp_dir: + BatchExtraction.extract_from_dataset( + job_type=job_type, + output_dir=tmp_dir, + ) + if not BatchExtraction.extracted_file_path: + raise ValueError("The extracted file was not found.") + LOGGER.debug(f"Batch data succesfully extracted and saved at {BatchExtraction.extracted_file_path}") + + # Upload the extracted file to the bucket + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) + LOGGER.debug(f"File uploaded to the bucket {bucket_handler.bucket}") + + # Launch batch job + batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) + batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) + LOGGER.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}") diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index bc75be9d95..c0054c3b8e 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -5,8 +5,11 @@ from robotoff import settings from robotoff.batch import BatchJobType +from robotoff.utils import get_logger +LOGGER = get_logger(__name__) + BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = { BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql", } @@ -63,6 +66,7 @@ def _load_query(query_file_path: Path, dataset_path: str) -> str: "The SQL query should contain the string 'DATASET_PATH' to replace it with the dataset path." 
) query = query.replace("DATASET_PATH", dataset_path) + LOGGER.debug(f"Query used to extract batch from dataset: {query}") return query @classmethod diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index d8467c6e0c..cdd17b8b6f 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -157,7 +157,7 @@ def launch_job( :rtype: batch_v1.Job Returns: - A job object representing the job created. + Batch job information. """ client = batch_v1.BatchServiceClient() From 729d4e16ce91c76c3f500046f71f476f640f8ef9 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 27 Aug 2024 17:46:44 +0200 Subject: [PATCH 07/22] feat(batch - spellcheck): :zap: Integrate batch data from job into Robotoff sql tables --- .gitignore | 2 + batch/spellcheck/main.py | 20 ++++++++ poetry.lock | 53 ++++++++++++++++++++- pyproject.toml | 1 + robotoff/app/api.py | 23 ++++----- robotoff/batch/__init__.py | 7 ++- robotoff/batch/buckets.py | 5 +- robotoff/batch/configs/sql/spellcheck.sql | 7 ++- robotoff/batch/importer.py | 58 +++++++++++++++-------- robotoff/insights/importer.py | 11 +++-- robotoff/utils/buckets.py | 10 ++-- 11 files changed, 144 insertions(+), 53 deletions(-) diff --git a/.gitignore b/.gitignore index 3a4dd3e70a..0443dcd510 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,5 @@ site/ gh_pages/ doc/README.md doc/references/cli.md + +credentials \ No newline at end of file diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 6853ea288b..0960765fef 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -2,6 +2,7 @@ import tempfile import logging import sys +import requests from typing import List import pandas as pd @@ -77,6 +78,10 @@ def main(): bucket_name=args.data_bucket, suffix=args.post_data_suffix ) + + LOGGER.info("Request Robotoff API batch import endpoint.") + run_robotoff_endpoint_batch_import() + LOGGER.info("Batch processing job completed.") @@ -148,5 +153,20 @@ def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: blob = bucket.blob(suffix) blob.upload_from_filename(filename=file_path) + +def run_robotoff_endpoint_batch_import(): + """Run Robotoff api endpoint to import batch data into tables. 
+ """ + url = "https://robotoff.openfoodfacts.org/api/v1/batch/import" + data = {"job_type": "ingredients_spellcheck"} + + try: + response = requests.post(url, data=data) + except requests.exceptions.RequestException as e: + raise SystemExit(e) + + LOGGER.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") + + if __name__ == "__main__": main() diff --git a/poetry.lock b/poetry.lock index 11fcef4863..c0680af5a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2622,6 +2622,57 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "pyarrow" +version = "17.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, + {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, + {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", 
hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, + {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, + {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, + {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, + {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pyasn1" version = "0.6.0" @@ -4369,4 +4420,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "30d2c1f1df11a9fdbecd73ec5c64732a361053cf7b350350d433def955691df8" +content-hash = "07551d5c2b36e7b3321ac361d384ef02a74de7686051a450ecd692a0b832929b" diff --git a/pyproject.toml b/pyproject.toml index 7926145882..133f61c850 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,7 @@ google-cloud-batch = "^0.17.26" duckdb = "1.0.0" google-cloud-storage = "<2.18.2" pandas = "^2.2.2" +pyarrow = "^17.0.0" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 531cde9e4c..1e761b10f8 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -89,9 +89,7 @@ from robotoff.batch 
import ( BatchJobType, launch_batch_job, - GoogleStorageBucketForBatchJob, - generate_predictions_from_batch, - + import_batch_predictions, ) logger = get_logger() @@ -1779,28 +1777,26 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): job_type=job_type, job_kwargs={"timeout": "10m"}, ) - logger.info("Batch job %s queued", job_type) + logger.info("Batch job launch %s has been queued.", job_type) class BatchJobImportResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) - from robotoff.insights.importer import import_insights try: job_type = BatchJobType[job_type_str] except KeyError: raise falcon.HTTPBadRequest( description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}" ) - - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - predictions = generate_predictions_from_batch( - bucket_handler.download_file(), - job_type + enqueue_job( + import_batch_predictions, + job_type=job_type, + queue=low_queue, + job_kwargs={"timeout": "10m"}, ) - with db: - import_insights(predictions=predictions, server_type="off") + logger.info("Batch import %s has been queued.", job_type) def custom_handle_uncaught_exception( @@ -1870,4 +1866,5 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) api.add_route("/robots.txt", RobotsTxtResource()) -api.add_route("/api/v1/batch/launch", BatchJobLaunchResource()) \ No newline at end of file +api.add_route("/api/v1/batch/launch", BatchJobLaunchResource()) +api.add_route("/api/v1/batch/import", BatchJobImportResource()) diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index c7e681bf5d..1d2e20c521 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -11,8 +11,7 @@ from .buckets import ( GoogleStorageBucketForBatchJob, ) -from .importer import generate_predictions_from_batch - +from .importer import import_batch_predictions from robotoff.utils import get_logger @@ -37,9 +36,9 @@ def launch_batch_job(job_type: BatchJobType) -> None: # Upload the extracted file to the bucket bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) - LOGGER.debug(f"File uploaded to the bucket {bucket_handler.bucket}") + LOGGER.debug(f"File uploaded to the bucket {bucket_handler.bucket}/{bucket_handler.suffix_preprocess}") # Launch batch job batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) - LOGGER.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}") + LOGGER.info(f"Batch job succesfully launched. 
Batch job name: {batch_job.name}.") diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index d27b1c2d73..655c8f0af9 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -1,5 +1,4 @@ -import io - +import pandas as pd from robotoff.utils.buckets import GoogleStorageBucket from robotoff.batch import BatchJobType @@ -62,7 +61,7 @@ def upload_file(self, file_path: str): suffix=self.suffix_preprocess, ) - def download_file(self) -> io.BufferedReader: + def download_file(self) -> pd.DataFrame: """Download file from bucket """ return self.download_gcs( diff --git a/robotoff/batch/configs/sql/spellcheck.sql b/robotoff/batch/configs/sql/spellcheck.sql index 5e53938a49..0cfebcb09a 100644 --- a/robotoff/batch/configs/sql/spellcheck.sql +++ b/robotoff/batch/configs/sql/spellcheck.sql @@ -1,4 +1,9 @@ -SELECT code, ingredients_text AS text, product_name, (CAST(unknown_ingredients_n AS FLOAT) / CAST(ingredients_n AS FLOAT)) AS fraction +SELECT +code, +ingredients_text AS text, +product_name, +lang, +(CAST(unknown_ingredients_n AS FLOAT) / CAST(ingredients_n AS FLOAT)) AS fraction FROM read_ndjson('DATASET_PATH', ignore_errors=True) WHERE ingredients_text NOT LIKE '' AND fraction > 0 AND fraction <= 0.4 diff --git a/robotoff/batch/importer.py b/robotoff/batch/importer.py index d8df8d48ec..5ebbc9dff0 100644 --- a/robotoff/batch/importer.py +++ b/robotoff/batch/importer.py @@ -1,12 +1,17 @@ -import io -from typing import Iterator +from typing import List import pandas as pd -from robotoff.batch import BatchJobType +from robotoff.insights.importer import import_insights +from robotoff.batch import BatchJobType, GoogleStorageBucketForBatchJob from robotoff.types import Prediction, PredictionType +from robotoff.models import db +from robotoff.utils import get_logger +from robotoff.types import ServerType +LOGGER = get_logger(__name__) + BATCH_JOB_TYPE_TO_FEATURES = { BatchJobType.ingredients_spellcheck: { "barcode": "code", @@ -19,13 +24,28 @@ BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, } -PREDICTOR_VERSION = "1" +PREDICTOR_VERSION = "2" + + +def import_batch_predictions(job_type: BatchJobType) -> None: + """Import predictions from remote storage. + """ + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + LOGGER.debug(f"Batch data downloaded from bucket {bucket_handler.bucket}/{bucket_handler.suffix_postprocess}") + df = bucket_handler.download_file() + predictions = _generate_predictions_from_batch(df, job_type) + with db: + import_results = import_insights( + predictions=predictions, + server_type=ServerType.off + ) + LOGGER.info(f"Batch import results: {repr(import_results)}.") -def generate_predictions_from_batch( - f: io.BufferedReader, +def _generate_predictions_from_batch( + df: pd.DataFrame, job_type: BatchJobType -) -> Iterator[Prediction]: +) -> List[Prediction]: """From a file imported from google storage, generate predictions depending on the job type. :param f: Readable object. Should be a parquet file. @@ -36,20 +56,18 @@ def generate_predictions_from_batch( :yield: Predictions. 
     :rtype: Iterator[Prediction]
     """
+    predictions = []
     features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type]
     prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type]
-
-    try:
-        df = pd.read_parquet(f)
-    except Exception as e:
-        raise ValueError(f"Failed to read parquet file: {e}")
-
     for _, row in df.iterrows():
-        yield Prediction(
-            type=prediction_type,
-            value=row[features_dict["value"]],
-            value_tag=[features_dict["value_tag"]],
-            barcode=row[features_dict["barcode"]],
-            predictor_version=PREDICTOR_VERSION,
-            predictor="llm",
+        predictions.append(
+            Prediction(
+                type=prediction_type,
+                value=row[features_dict["value"]],
+                value_tag=row[features_dict["value_tag"]],
+                barcode=row[features_dict["barcode"]],
+                predictor_version=PREDICTOR_VERSION,
+                predictor="llm",
+            )
         )
+    return predictions
diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py
index 275393b9eb..029f9aefcd 100644
--- a/robotoff/insights/importer.py
+++ b/robotoff/insights/importer.py
@@ -1475,7 +1475,7 @@ def compute_crop_bounding_box(
     return results
 
 
-class IngredientsSpellcheckImporter(InsightImporter):
+class IngredientSpellcheckImporter(InsightImporter):
 
     @staticmethod
     def get_type() -> InsightType:
@@ -1492,9 +1492,9 @@ def generate_candidates(
         predictions: list[Prediction],
         product_id: ProductIdentifier,
     ) -> Iterator[ProductInsight]:
-        # No reason to have different candidates for now
-        candidate = predictions[0]
-        yield ProductInsight(**candidate.to_dict())
+        # Only one prediction
+        for candidate in predictions:
+            yield ProductInsight(**candidate.to_dict())
 
     @classmethod
     def is_conflicting_insight(
@@ -1502,7 +1502,7 @@ def is_conflicting_insight(
         candidate: ProductInsight,
         reference: ProductInsight
     ) -> bool:
-        candidate.value == reference.value
+        return candidate.value_tag == reference.value_tag
 
 
 class PackagingElementTaxonomyException(Exception):
     pass
@@ -1839,6 +1839,7 @@ def import_product_predictions(
     PackagingImporter,
     UPCImageImporter,
     NutritionImageImporter,
+    IngredientSpellcheckImporter,
 ]
 
 
diff --git a/robotoff/utils/buckets.py b/robotoff/utils/buckets.py
index 73c9295afa..cc92cadfda 100644
--- a/robotoff/utils/buckets.py
+++ b/robotoff/utils/buckets.py
@@ -1,14 +1,12 @@
-import io
-from typing import Any
-
+import pandas as pd
 from google.cloud import storage
 
 
 class GoogleStorageBucket:
 
     @staticmethod
-    def download_gcs(bucket_name: str, suffix: str) -> io.BufferedReader:
-        """Download file from Google Storage Bucket.
+    def download_gcs(bucket_name: str, suffix: str) -> pd.DataFrame:
+        """Download parquet file from Google Storage Bucket.
:param bucket_name: Bucket name :type bucket_name: str @@ -21,7 +19,7 @@ def download_gcs(bucket_name: str, suffix: str) -> io.BufferedReader: bucket = client.get_bucket(bucket_name) blob = bucket.blob(suffix) with blob.open("rb") as f: - return f + return pd.read_parquet(f) @staticmethod From 34ce80e42e76143e37a1eada1c031ab78b047e8c Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 27 Aug 2024 18:39:04 +0200 Subject: [PATCH 08/22] feat: :sparkles: Restructure code --- robotoff/batch/__init__.py | 75 ++++++++++++++++++++++++++++++++---- robotoff/batch/buckets.py | 11 +----- robotoff/batch/extraction.py | 5 +-- robotoff/batch/importer.py | 73 ----------------------------------- robotoff/batch/launch.py | 16 +------- robotoff/batch/types.py | 35 +++++++++++++++++ robotoff/types.py | 39 ++----------------- 7 files changed, 112 insertions(+), 142 deletions(-) delete mode 100644 robotoff/batch/importer.py create mode 100644 robotoff/batch/types.py diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 1d2e20c521..87e45fe69a 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,9 +1,20 @@ import tempfile +from typing import List + +import pandas as pd + +from robotoff.utils import get_logger +from robotoff.types import ( + BatchJobType, + Prediction, + ServerType +) +from robotoff.models import db +from robotoff.insights.importer import import_insights from .launch import ( GoogleBatchJob, GoogleBatchJobConfig, - BatchJobType, ) from .extraction import ( BatchExtraction, @@ -11,18 +22,21 @@ from .buckets import ( GoogleStorageBucketForBatchJob, ) -from .importer import import_batch_predictions -from robotoff.utils import get_logger +from .types import ( + BATCH_JOB_TYPE_TO_FEATURES, + BATCH_JOB_TYPE_TO_PREDICTION_TYPE, +) LOGGER = get_logger(__name__) +PREDICTOR_VERSION = "1" #TODO: shard HF model version? instead of manual change? -def launch_batch_job(job_type: BatchJobType) -> None: - """_summary_ +PREDICTOR = "llm" - :param job_type: _description_ - :type job_type: BatchJobType + +def launch_batch_job(job_type: BatchJobType) -> None: + """Launch a batch job. """ with tempfile.TemporaryDirectory() as tmp_dir: BatchExtraction.extract_from_dataset( @@ -42,3 +56,50 @@ def launch_batch_job(job_type: BatchJobType) -> None: batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) LOGGER.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") + + +def import_batch_predictions(job_type: BatchJobType) -> None: + """Import predictions from remote storage. + """ + bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) + LOGGER.debug(f"Batch data downloaded from bucket {bucket_handler.bucket}/{bucket_handler.suffix_postprocess}") + df = bucket_handler.download_file() + predictions = _generate_predictions_from_batch(df, job_type) + with db: + import_results = import_insights( + predictions=predictions, + server_type=ServerType.off + ) + LOGGER.info(f"Batch import results: {repr(import_results)}.") + + +def _generate_predictions_from_batch( + df: pd.DataFrame, + job_type: BatchJobType +) -> List[Prediction]: + """From a file imported from google storage, generate predictions depending on the job type. + + :param f: Readable object. Should be a parquet file. + :type f: io.BufferedReader + :param job_type: Batch job type. + :type job_type: BatchJobType + :rtype: Iterable[Prediction] + :yield: Predictions. 
+ :rtype: Iterator[Prediction] + """ + predictions = [] + features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type] + prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type] + for _, row in df.iterrows(): + predictions.append( + Prediction( + type=prediction_type, + value=row[features_dict["value"]], + value_tag=row[features_dict["value_tag"]], + barcode=row[features_dict["barcode"]], + predictor_version=PREDICTOR_VERSION, + predictor=PREDICTOR, + ) + ) + return predictions + diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index 655c8f0af9..278b62bf18 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -1,15 +1,8 @@ import pandas as pd + from robotoff.utils.buckets import GoogleStorageBucket from robotoff.batch import BatchJobType - - -BATCH_JOB_TYPE_TO_BUCKET = { - BatchJobType.ingredients_spellcheck: { - "bucket": "robotoff-spellcheck", - "suffix_preprocess": "data/preprocessed_data.parquet", - "suffix_postprocess": "data/postprocessed_data.parquet", - }, -} +from robotoff.batch.types import BATCH_JOB_TYPE_TO_BUCKET class GoogleStorageBucketForBatchJob(GoogleStorageBucket): diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index c0054c3b8e..5ca4d2f0e1 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -5,15 +5,12 @@ from robotoff import settings from robotoff.batch import BatchJobType +from robotoff.batch.types import BATCH_JOB_TYPE_TO_QUERY_FILE_PATH from robotoff.utils import get_logger LOGGER = get_logger(__name__) -BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = { - BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql", -} - class BatchExtraction: """Handle batch extraction from the dataset. diff --git a/robotoff/batch/importer.py b/robotoff/batch/importer.py deleted file mode 100644 index 5ebbc9dff0..0000000000 --- a/robotoff/batch/importer.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import List - -import pandas as pd - -from robotoff.insights.importer import import_insights -from robotoff.batch import BatchJobType, GoogleStorageBucketForBatchJob -from robotoff.types import Prediction, PredictionType -from robotoff.models import db -from robotoff.utils import get_logger -from robotoff.types import ServerType - - -LOGGER = get_logger(__name__) - -BATCH_JOB_TYPE_TO_FEATURES = { - BatchJobType.ingredients_spellcheck: { - "barcode": "code", - "value": "correction", - "value_tag": "lang", - }, -} - -BATCH_JOB_TYPE_TO_PREDICTION_TYPE = { - BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, -} - -PREDICTOR_VERSION = "2" - - -def import_batch_predictions(job_type: BatchJobType) -> None: - """Import predictions from remote storage. - """ - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - LOGGER.debug(f"Batch data downloaded from bucket {bucket_handler.bucket}/{bucket_handler.suffix_postprocess}") - df = bucket_handler.download_file() - predictions = _generate_predictions_from_batch(df, job_type) - with db: - import_results = import_insights( - predictions=predictions, - server_type=ServerType.off - ) - LOGGER.info(f"Batch import results: {repr(import_results)}.") - - -def _generate_predictions_from_batch( - df: pd.DataFrame, - job_type: BatchJobType -) -> List[Prediction]: - """From a file imported from google storage, generate predictions depending on the job type. - - :param f: Readable object. Should be a parquet file. - :type f: io.BufferedReader - :param job_type: Batch job type. 
- :type job_type: BatchJobType - :rtype: Iterable[Prediction] - :yield: Predictions. - :rtype: Iterator[Prediction] - """ - predictions = [] - features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type] - prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type] - for _, row in df.iterrows(): - predictions.append( - Prediction( - type=prediction_type, - value=row[features_dict["value"]], - value_tag=row[features_dict["value_tag"]], - barcode=row[features_dict["barcode"]], - predictor_version=PREDICTOR_VERSION, - predictor="llm", - ) - ) - return predictions diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index cdd17b8b6f..a5ca69132c 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,6 +1,5 @@ import abc from typing import List, Optional -import enum import yaml import datetime import re @@ -9,19 +8,8 @@ from pydantic import BaseModel, Field, ConfigDict from robotoff import settings - - -@enum.unique -class BatchJobType(enum.Enum): - """Each job type correspond to a task that will be executed in the batch job.""" - - ingredients_spellcheck = "ingredients-spellcheck" - - -# Paths batch job config files -BATCH_JOB_TYPE_TO_CONFIG_PATH = { - BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml", -} +from robotoff.types import BatchJobType +from robotoff.batch.types import BATCH_JOB_TYPE_TO_CONFIG_PATH class GoogleBatchJobConfig(BaseModel): diff --git a/robotoff/batch/types.py b/robotoff/batch/types.py new file mode 100644 index 0000000000..c0c452cefd --- /dev/null +++ b/robotoff/batch/types.py @@ -0,0 +1,35 @@ +from robotoff.types import BatchJobType, PredictionType +from robotoff import settings + + +# Bucket structure to enable the batch job to load and upload data +BATCH_JOB_TYPE_TO_BUCKET = { + BatchJobType.ingredients_spellcheck: { + "bucket": "robotoff-spellcheck", + "suffix_preprocess": "data/preprocessed_data.parquet", + "suffix_postprocess": "data/postprocessed_data.parquet", + }, +} + +# Paths batch job config files +BATCH_JOB_TYPE_TO_CONFIG_PATH = { + BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml", +} + +BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = { + BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql", +} + +# Mapping between batch job type and prediction type +BATCH_JOB_TYPE_TO_PREDICTION_TYPE = { + BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, +} + +# Column names in the processed batch of data +BATCH_JOB_TYPE_TO_FEATURES = { + BatchJobType.ingredients_spellcheck: { + "barcode": "code", + "value": "correction", + "value_tag": "lang", + }, +} diff --git a/robotoff/types.py b/robotoff/types.py index 8105d2030a..52704e0ec5 100644 --- a/robotoff/types.py +++ b/robotoff/types.py @@ -359,39 +359,8 @@ class PackagingElementProperty(enum.Enum): InsightAnnotation = Literal[-1, 0, 1, 2] - - - - @enum.unique -class Lang(str, enum.Enum): - english = "en" - french = "fr" - german = "de" - spanish = "es" - italian = "it" - portuguese = "pt" - dutch = "nl" - polish = "pl" - russian = "ru" - japanese = "ja" - chinese = "zh" - arabic = "ar" - turkish = "tr" - vietnamese = "vi" - thai = "th" - korean = "ko" - ukrainian = "uk" - indonesian = "id" - hungarian = "hu" - greek = "el" - romanian = "ro" - danish = "da" - swedish = "sv" - norwegian = "no" - finnish = "fi" - bulgarian = "bg" - czech = "cs" - slovak = "sk" - croatian = "hr" - \ No newline at end of file +class BatchJobType(enum.Enum): + """Each job 
type correspond to a task that will be executed in the batch job. + """ + ingredients_spellcheck = "ingredients-spellcheck" \ No newline at end of file From 92cb5f34a0e3e1b0024fa958fb7cf1608cfbe53f Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 28 Aug 2024 14:19:13 +0200 Subject: [PATCH 09/22] feat: :sparkles: Change batch job launch from api endpoint to CLI There's no reason to configure the launch from endpoint. So we put in CLI instead of manual launch --- robotoff/app/api.py | 21 --------------------- robotoff/batch/__init__.py | 4 ++-- robotoff/cli/main.py | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 1e761b10f8..0c60637090 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -1760,26 +1760,6 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.status = falcon.HTTP_200 -class BatchJobLaunchResource: - def on_post(self, req: falcon.Request, resp: falcon.Response): - job_type_str: str = req.get_param("job_type", required=True) - - try: - job_type = BatchJobType[job_type_str] - except KeyError: - raise falcon.HTTPBadRequest( - description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}" - ) - # Batch extraction can take some time, so we queue it - enqueue_job( - launch_batch_job, - queue=low_queue, - job_type=job_type, - job_kwargs={"timeout": "10m"}, - ) - logger.info("Batch job launch %s has been queued.", job_type) - - class BatchJobImportResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) @@ -1866,5 +1846,4 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) api.add_route("/robots.txt", RobotsTxtResource()) -api.add_route("/api/v1/batch/launch", BatchJobLaunchResource()) api.add_route("/api/v1/batch/import", BatchJobImportResource()) diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 87e45fe69a..855a4cebe1 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -79,8 +79,8 @@ def _generate_predictions_from_batch( ) -> List[Prediction]: """From a file imported from google storage, generate predictions depending on the job type. - :param f: Readable object. Should be a parquet file. - :type f: io.BufferedReader + :param df: Post-processed dataset + :type df: pd.DataFrame :param job_type: Batch job type. :type job_type: BatchJobType :rtype: Iterable[Prediction] diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index 4d69a00cc0..481b5bb57c 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -998,5 +998,22 @@ def create_migration( router.create(name, auto=auto) +@app.command() +def launch_batch_job( + job_type: str = typer.Argument(..., help="Type of job to launch. Ex: 'ingredients_spellcheck'"), +) -> None: + """Launch a batch job.""" + from robotoff.batch import launch_batch_job + from robotoff.utils import get_logger + from robotoff.types import BatchJobType + + if job_type not in BatchJobType.__members__: + raise ValueError(f"Invalid job type: {job_type}. 
Must be one of those: {[job.name for job in BatchJobType]}") + + get_logger() + job_type = BatchJobType[job_type] + launch_batch_job(job_type) + + def main() -> None: app() From 54f1734306b9b1ee0383b1b138b52958bde4d556 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 28 Aug 2024 16:26:12 +0200 Subject: [PATCH 10/22] feat: :lock: Secure Batch Data Import endpoint with a token key --- docker-compose.yml | 1 + robotoff/app/api.py | 45 +++++++++++++++++++++++++++++++++++++------- robotoff/app/auth.py | 20 ++++++++++++++++++++ 3 files changed, 59 insertions(+), 7 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index e1de341f05..1fe85dacd2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,6 +57,7 @@ x-robotoff-base-env: NUM_RQ_WORKERS: 4 # Update worker service command accordingly if you change this settings GOOGLE_APPLICATION_CREDENTIALS: /opt/credentials/google/application_default_credentials.json GOOGLE_CLOUD_PROJECT: "robotoff" + BATCH_JOB_KEY: # Secure Batch job import with a token key x-robotoff-worker-base: &robotoff-worker diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 0c60637090..ca09dddbc3 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -25,7 +25,11 @@ from robotoff import settings from robotoff.app import schema -from robotoff.app.auth import BasicAuthDecodeError, basic_decode +from robotoff.app.auth import ( + BasicAuthDecodeError, + basic_decode, + validate_token, +) from robotoff.app.core import ( SkipVotedOn, SkipVotedType, @@ -302,6 +306,29 @@ def parse_auth(req: falcon.Request) -> Optional[OFFAuthentication]: ) +def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: + """Parse and validate authentification token from request. + + :param req: Request. + :type req: falcon.Request + :param ref_token_name: Secret environment variable name. + :type ref_token_name: str + :return: Token valid or not. + :rtype: bool + """ + auth_header = req.get_header("Authorization", required=True) + + scheme, token = auth_header.split() + if scheme.lower() != 'bearer': + raise falcon.HTTPUnauthorized('Invalid authentication scheme.') + + is_token_valid = validate_token(token, ref_token_name) + if not is_token_valid: + raise falcon.HTTPUnauthorized('Invalid token.') + else: + return True + + def device_id_from_request(req: falcon.Request) -> str: """Returns the 'device_id' from the request parameters, or a hash of the access route (which should be the IPs of the proxies and the client).""" @@ -1770,12 +1797,16 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): raise falcon.HTTPBadRequest( description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}" ) - enqueue_job( - import_batch_predictions, - job_type=job_type, - queue=low_queue, - job_kwargs={"timeout": "10m"}, - ) + # We secure the endpoint + if parse_valid_token(req, "batch_job_key"): + enqueue_job( + import_batch_predictions, + job_type=job_type, + queue=low_queue, + job_kwargs={"timeout": "10m"}, + ) + else: + raise falcon.HTTPForbidden(description="Invalid batch_job_key. 
Be sure to indicate the authentification key in the request.") logger.info("Batch import %s has been queued.", job_type) diff --git a/robotoff/app/auth.py b/robotoff/app/auth.py index 8fbf84c0d6..899b62470d 100644 --- a/robotoff/app/auth.py +++ b/robotoff/app/auth.py @@ -1,3 +1,4 @@ +import os from base64 import b64decode from urllib.parse import unquote @@ -6,6 +7,10 @@ class BasicAuthDecodeError(Exception): pass +class APITokenError(Exception): + pass + + def basic_decode(encoded_str: str) -> tuple[str, str]: """Decode an encrypted HTTP basic authentication string. Returns a tuple of the form (username, password), and raises a BasicAuthDecodeError exception @@ -39,3 +44,18 @@ def basic_decode(encoded_str: str) -> tuple[str, str]: raise BasicAuthDecodeError() return unquote(username), unquote(password) + + +def validate_token(token: str, ref_token_name: str) -> bool: + """Validate token. + + :param token: Authentification token + :type token: str + :param api_token_name: Validation token, stored in environment variables. + :type api_token_name: str + :rtype: bool + """ + api_token = os.getenv(ref_token_name.upper()) + if not api_token: + raise APITokenError("API token not set in environment variables.") + return token == api_token From 4aabf4ba0780fc6eed2c7cdaf27ec22fc1a02d1c Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 28 Aug 2024 16:33:29 +0200 Subject: [PATCH 11/22] feat: :art: Add key during request by the batch job --- batch/spellcheck/main.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 0960765fef..3f4972628e 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -1,3 +1,4 @@ +import os import argparse import tempfile import logging @@ -78,7 +79,7 @@ def main(): bucket_name=args.data_bucket, suffix=args.post_data_suffix ) - + LOGGER.info("Request Robotoff API batch import endpoint.") run_robotoff_endpoint_batch_import() @@ -159,14 +160,20 @@ def run_robotoff_endpoint_batch_import(): """ url = "https://robotoff.openfoodfacts.org/api/v1/batch/import" data = {"job_type": "ingredients_spellcheck"} - + headers = { + "Authorization": f"Bearer {os.getenv("BATCH_JOB_KEY")}", + "Content-Type": "application/json" + } try: - response = requests.post(url, data=data) + response = requests.post( + url, + data=data, + headers=headers, + ) + LOGGER.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") except requests.exceptions.RequestException as e: raise SystemExit(e) - - LOGGER.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") - + if __name__ == "__main__": main() From 01d884a02f2bfff2e41d6e955b716212b22b9fab Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Mon, 2 Sep 2024 17:28:31 +0200 Subject: [PATCH 12/22] feat: :sparkles: Implemenation reviews Simplify abstractions - Change data in insights instead of value - Other small changes --- batch/spellcheck/main.py | 18 ++-- poetry.lock | 2 +- pyproject.toml | 12 +-- robotoff/app/api.py | 40 +++++---- robotoff/app/auth.py | 1 - robotoff/batch/__init__.py | 142 +++++++++++++++-------------- robotoff/batch/buckets.py | 84 +++++++---------- robotoff/batch/extraction.py | 112 ++++++++++------------- robotoff/batch/launch.py | 170 +++++++++++++++-------------------- robotoff/batch/types.py | 35 -------- robotoff/cli/main.py | 4 +- robotoff/utils/buckets.py | 39 -------- tests/unit/test_batch.py | 46 +++++----- 13 files changed, 283 insertions(+), 422 deletions(-) 
delete mode 100644 robotoff/batch/types.py delete mode 100644 robotoff/utils/buckets.py diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 3f4972628e..6c73648c7e 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -11,7 +11,7 @@ from google.cloud import storage -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -45,12 +45,12 @@ def main(): We use vLLM to process the batch optimaly. The model is loaded from the Open Food Facts Hugging Face model repository. """ - LOGGER.info("Starting batch processing job.") + logger.info("Starting batch processing job.") args = parse() - LOGGER.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}") + logger.info(f"Loading data from GCS: {args.data_bucket}/{args.pre_data_suffix}") data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix) - LOGGER.info(f"Feature in uploaded data: {data.columns}") + logger.info(f"Feature in uploaded data: {data.columns}") if not all(feature in data.columns for feature in FEATURES_VALIDATION): raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. Current features: {data.columns}") @@ -66,10 +66,10 @@ def main(): max_tokens=args.max_tokens ) - LOGGER.info(f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}") + logger.info(f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}") data["correction"] = batch_inference(instructions, llm=llm, sampling_params=sampling_params) - LOGGER.info(f"Uploading data to GCS: {args.data_bucket}/{args.post_data_suffix}") + logger.info(f"Uploading data to GCS: {args.data_bucket}/{args.post_data_suffix}") # Save DataFrame as Parquet to a temporary file with tempfile.NamedTemporaryFile(delete=True, suffix='.parquet') as temp_file: data.to_parquet(temp_file.name) @@ -80,10 +80,10 @@ def main(): suffix=args.post_data_suffix ) - LOGGER.info("Request Robotoff API batch import endpoint.") + logger.info("Request Robotoff API batch import endpoint.") run_robotoff_endpoint_batch_import() - LOGGER.info("Batch processing job completed.") + logger.info("Batch processing job completed.") def prepare_instruction(text: str) -> str: @@ -170,7 +170,7 @@ def run_robotoff_endpoint_batch_import(): data=data, headers=headers, ) - LOGGER.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") + logger.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}") except requests.exceptions.RequestException as e: raise SystemExit(e) diff --git a/poetry.lock b/poetry.lock index c0680af5a8..43da024f22 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4420,4 +4420,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "07551d5c2b36e7b3321ac361d384ef02a74de7686051a450ecd692a0b832929b" +content-hash = "9a02871efced91ed473a7af971e0f46acd4209165cb7597ca5d9b417267992a6" diff --git a/pyproject.toml b/pyproject.toml index 133f61c850..43669c7848 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ jsonschema = "~4.4.0" orjson = ">=3.8.2,<3.10.0" Pillow = ">=9.3,<10.4" numpy = "~1.26.4" -protobuf = "^3.19.0" +protobuf = "~3.20.3" Pint = "0.22" APScheduler = "~3.10.1" more-itertools = "~8.9.0" @@ -78,11 +78,11 @@ openfoodfacts = "1.1.1" imagehash = "~4.3.1" peewee-migrate = "~1.12.2" diskcache = "~5.6.3" -google-cloud-batch = "^0.17.26" 
-duckdb = "1.0.0" -google-cloud-storage = "<2.18.2" -pandas = "^2.2.2" -pyarrow = "^17.0.0" +google-cloud-batch = "~0.17.26" +duckdb = "~1.0.0" +google-cloud-storage = "~2.14.0" +pandas = "~2.2.2" +pyarrow = "~17.0.0" [tool.poetry.dependencies.sentry-sdk] version = "~1.14.0" diff --git a/robotoff/app/api.py b/robotoff/app/api.py index ca09dddbc3..e9003929ee 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -26,7 +26,8 @@ from robotoff import settings from robotoff.app import schema from robotoff.app.auth import ( - BasicAuthDecodeError, + BasicAuthDecodeError, + APITokenError, basic_decode, validate_token, ) @@ -92,7 +93,6 @@ from robotoff.workers.tasks import download_product_dataset_job from robotoff.batch import ( BatchJobType, - launch_batch_job, import_batch_predictions, ) @@ -314,14 +314,15 @@ def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: :param ref_token_name: Secret environment variable name. :type ref_token_name: str :return: Token valid or not. - :rtype: bool """ auth_header = req.get_header("Authorization", required=True) - - scheme, token = auth_header.split() - if scheme.lower() != 'bearer': - raise falcon.HTTPUnauthorized('Invalid authentication scheme.') + try: + scheme, token = auth_header.split() + except APITokenError: + raise falcon.HTTPUnauthorized("Invalid authentication scheme.") + if scheme.lower() != 'bearer': + raise falcon.HTTPUnauthorized("Invalid authentication scheme: 'Bearer Token' expected.") is_token_valid = validate_token(token, ref_token_name) if not is_token_valid: raise falcon.HTTPUnauthorized('Invalid token.') @@ -1778,14 +1779,6 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.media = response -class RobotsTxtResource: - def on_get(self, req: falcon.Request, resp: falcon.Response): - # Disallow completely indexation: otherwise web crawlers send millions - # of requests to Robotoff (420k requests/day by Google alone) - resp.body = "User-agent: *\nDisallow: /\n" - resp.content_type = falcon.MEDIA_TEXT - resp.status = falcon.HTTP_200 - class BatchJobImportResource: def on_post(self, req: falcon.Request, resp: falcon.Response): @@ -1803,12 +1796,23 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): import_batch_predictions, job_type=job_type, queue=low_queue, - job_kwargs={"timeout": "10m"}, + job_kwargs={"timeout": "30m"}, ) else: - raise falcon.HTTPForbidden(description="Invalid batch_job_key. Be sure to indicate the authentification key in the request.") + raise falcon.HTTPForbidden( + description="Invalid batch_job_key. Be sure to indicate the authentification key in the request." 
+ ) logger.info("Batch import %s has been queued.", job_type) + +class RobotsTxtResource: + def on_get(self, req: falcon.Request, resp: falcon.Response): + # Disallow completely indexation: otherwise web crawlers send millions + # of requests to Robotoff (420k requests/day by Google alone) + resp.body = "User-agent: *\nDisallow: /\n" + resp.content_type = falcon.MEDIA_TEXT + resp.status = falcon.HTTP_200 + def custom_handle_uncaught_exception( req: falcon.Request, resp: falcon.Response, ex: Exception, params @@ -1876,5 +1880,5 @@ def custom_handle_uncaught_exception( api.add_route("/api/v1/users/statistics/{username}", UserStatisticsResource()) api.add_route("/api/v1/predictions", PredictionCollection()) api.add_route("/api/v1/annotation/collection", LogoAnnotationCollection()) -api.add_route("/robots.txt", RobotsTxtResource()) api.add_route("/api/v1/batch/import", BatchJobImportResource()) +api.add_route("/robots.txt", RobotsTxtResource()) diff --git a/robotoff/app/auth.py b/robotoff/app/auth.py index 899b62470d..5eef036497 100644 --- a/robotoff/app/auth.py +++ b/robotoff/app/auth.py @@ -53,7 +53,6 @@ def validate_token(token: str, ref_token_name: str) -> bool: :type token: str :param api_token_name: Validation token, stored in environment variables. :type api_token_name: str - :rtype: bool """ api_token = os.getenv(ref_token_name.upper()) if not api_token: diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 855a4cebe1..27e4b5ae80 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,105 +1,103 @@ +import os import tempfile -from typing import List - -import pandas as pd from robotoff.utils import get_logger from robotoff.types import ( BatchJobType, Prediction, - ServerType + ServerType, ) from robotoff.models import db from robotoff.insights.importer import import_insights +from robotoff import settings +from robotoff.types import PredictionType -from .launch import ( - GoogleBatchJob, - GoogleBatchJobConfig, -) -from .extraction import ( - BatchExtraction, -) -from .buckets import ( - GoogleStorageBucketForBatchJob, -) -from .types import ( - BATCH_JOB_TYPE_TO_FEATURES, - BATCH_JOB_TYPE_TO_PREDICTION_TYPE, -) - - -LOGGER = get_logger(__name__) +from .launch import launch_job, GoogleBatchJobConfig +from .extraction import extract_from_dataset +from .buckets import upload_file_to_gcs, fetch_dataframe_from_gcs -PREDICTOR_VERSION = "1" #TODO: shard HF model version? instead of manual change? -PREDICTOR = "llm" +logger = get_logger(__name__) def launch_batch_job(job_type: BatchJobType) -> None: """Launch a batch job. + Need to be updated if different batch jobs are added. + """ + if job_type is BatchJobType.ingredients_spellcheck: + launch_spellcheck_batch_job() + else: + raise NotImplementedError(f"Batch job type {job_type} not implemented.") + + +def import_batch_predictions(job_type: BatchJobType) -> None: + """Import batch predictions once the job finished. + Need to be updated if different batch jobs are added. + """ + if job_type is BatchJobType.ingredients_spellcheck: + import_spellcheck_batch_predictions() + else: + raise NotImplementedError(f"Batch job type {job_type} not implemented.") + + +def launch_spellcheck_batch_job() -> None: + """Launch spellcheck batch job. 
""" + # Init + JOB_NAME = "ingredients-spellcheck" + QUERY_FILE_PATH = settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql" + BATCH_JOB_CONFIG_PATH = settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" + BUCKET_NAME = "robotoff-spellcheck" + SUFFIX_PREPROCESS = "data/preprocessed_data.parquet" + + # Extract data from dataset with tempfile.TemporaryDirectory() as tmp_dir: - BatchExtraction.extract_from_dataset( - job_type=job_type, - output_dir=tmp_dir, - ) - if not BatchExtraction.extracted_file_path: - raise ValueError("The extracted file was not found.") - LOGGER.debug(f"Batch data succesfully extracted and saved at {BatchExtraction.extracted_file_path}") + file_path = os.path.join(tmp_dir, "batch_data.parquet") + extract_from_dataset(QUERY_FILE_PATH, file_path) # Upload the extracted file to the bucket - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - bucket_handler.upload_file(file_path=BatchExtraction.extracted_file_path) - LOGGER.debug(f"File uploaded to the bucket {bucket_handler.bucket}/{bucket_handler.suffix_preprocess}") + upload_file_to_gcs(file_path=file_path, bucket_name=BUCKET_NAME, suffix=SUFFIX_PREPROCESS) + logger.debug(f"File uploaded to the bucket {BUCKET_NAME}/{SUFFIX_PREPROCESS}") # Launch batch job - batch_job_config = GoogleBatchJobConfig.init(job_type=job_type) - batch_job = GoogleBatchJob.launch_job(batch_job_config=batch_job_config) - LOGGER.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") + batch_job_config = GoogleBatchJobConfig.init(job_name=JOB_NAME, config_path=BATCH_JOB_CONFIG_PATH) + batch_job = launch_job(batch_job_config=batch_job_config) + logger.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") -def import_batch_predictions(job_type: BatchJobType) -> None: - """Import predictions from remote storage. - """ - bucket_handler = GoogleStorageBucketForBatchJob.from_job_type(job_type) - LOGGER.debug(f"Batch data downloaded from bucket {bucket_handler.bucket}/{bucket_handler.suffix_postprocess}") - df = bucket_handler.download_file() - predictions = _generate_predictions_from_batch(df, job_type) - with db: - import_results = import_insights( - predictions=predictions, - server_type=ServerType.off - ) - LOGGER.info(f"Batch import results: {repr(import_results)}.") - - -def _generate_predictions_from_batch( - df: pd.DataFrame, - job_type: BatchJobType -) -> List[Prediction]: - """From a file imported from google storage, generate predictions depending on the job type. - - :param df: Post-processed dataset - :type df: pd.DataFrame - :param job_type: Batch job type. - :type job_type: BatchJobType - :rtype: Iterable[Prediction] - :yield: Predictions. - :rtype: Iterator[Prediction] +def import_spellcheck_batch_predictions() -> None: + """Import spellcheck predictions from remote storage. """ + # Init + BUCKET_NAME = "robotoff-spellcheck" + SUFFIX_POSTPROCESS = "data/postprocessed_data.parquet" + PREDICTION_TYPE = PredictionType.ingredient_spellcheck + PREDICTOR_VERSION = "1" #TODO: shard HF model version instead of manual change? 
+ PREDICTOR = "fine-tuned-mistral-7b" + SERVER_TYPE = ServerType.off + + df = fetch_dataframe_from_gcs(bucket_name=BUCKET_NAME, suffix_postprocess=SUFFIX_POSTPROCESS) + logger.debug(f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}") + + + # Generate predictions predictions = [] - features_dict = BATCH_JOB_TYPE_TO_FEATURES[job_type] - prediction_type = BATCH_JOB_TYPE_TO_PREDICTION_TYPE[job_type] for _, row in df.iterrows(): predictions.append( Prediction( - type=prediction_type, - value=row[features_dict["value"]], - value_tag=row[features_dict["value_tag"]], - barcode=row[features_dict["barcode"]], + type=PREDICTION_TYPE, + data={"original": row["text"], "correction": row["correction"]}, + value_tag=row["lang"], + barcode=row["code"], predictor_version=PREDICTOR_VERSION, predictor=PREDICTOR, + automatic_processing=False, ) ) - return predictions - + # Store predictions and insights + with db: + import_results = import_insights( + predictions=predictions, + server_type=SERVER_TYPE + ) + logger.info("Batch import results: %s", import_results) diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index 278b62bf18..77ae4f4ba0 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -1,63 +1,39 @@ import pandas as pd +from google.cloud import storage -from robotoff.utils.buckets import GoogleStorageBucket -from robotoff.batch import BatchJobType -from robotoff.batch.types import BATCH_JOB_TYPE_TO_BUCKET +def upload_file_to_gcs(file_path: str, bucket_name: str, suffix: str) -> None: + """Upload file to Google Storage Bucket. -class GoogleStorageBucketForBatchJob(GoogleStorageBucket): - """Class to handle the Google Storage bucket for depending on the batch job. - - :param bucket: Bucket name - :type bucket: str - :param suffix_preprocess: Path inside the bucket before batch processing. - :type suffix_preprocess: str - :param suffix_postprocess: Path inside the bucket after batch processing. - :type suffix_postprocess: str + :param file_path: File where the data is stored + :type file_path: str + :param bucket_name: Bucket name in GCP storage + :type bucket_name: str + :param suffix: Path inside the bucket + :type suffix: str """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + blob.upload_from_filename(filename=file_path) - def __init__( - self, - bucket: str, - suffix_preprocess: str, - suffix_postprocess: str, - ) -> None: - self.bucket = bucket - self.suffix_preprocess = suffix_preprocess - self.suffix_postprocess = suffix_postprocess - - @classmethod - def from_job_type(cls, job_type: BatchJobType) -> "GoogleStorageBucketForBatchJob": - """Initialize the class with the bucket and suffix names corresponding to the batch job type. - Used to adapt bucket upload and download during the batch job process. - - :param job_type: Batch job type. - :type job_type: BatchJobType - :return: Instantiated class. - :rtype: GoogleStorageBucketForBatchJob - """ - try: - bucket_dict = BATCH_JOB_TYPE_TO_BUCKET[job_type] - except KeyError: - raise ValueError(f"Batch job type {job_type} not found in the configuration. Expected {BATCH_JOB_TYPE_TO_BUCKET}.") - return cls(**bucket_dict) - def upload_file(self, file_path: str): - """Upload file to the bucket. +def fetch_dataframe_from_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: + """Download parquet file from Google Storage Bucket. - :param file_path: File path to upload. 
- :type file_path: str - """ - self.upload_gcs( - file_path=file_path, - bucket_name=self.bucket, - suffix=self.suffix_preprocess, - ) - def download_file(self) -> pd.DataFrame: - """Download file from bucket - """ - return self.download_gcs( - bucket_name=self.bucket, - suffix=self.suffix_postprocess, - ) + :param bucket_name: Bucket name in GCP storage + :type bucket_name: str + :param suffix: Path inside the bucket. Should lead to a parquet file. + :type suffix: str + :return: Dataframe + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(suffix) + with blob.open("rb") as f: + try: + df = pd.read_parquet(f) + except Exception as e: + raise ValueError(f"Could not read parquet file from {bucket_name}/{suffix}. Error: {e}") + return df diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index 5ca4d2f0e1..2013e3fa26 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -1,82 +1,64 @@ -import os from pathlib import Path import duckdb from robotoff import settings -from robotoff.batch import BatchJobType -from robotoff.batch.types import BATCH_JOB_TYPE_TO_QUERY_FILE_PATH from robotoff.utils import get_logger -LOGGER = get_logger(__name__) +logger = get_logger(__name__) -class BatchExtraction: - """Handle batch extraction from the dataset. - Extraction varies regarding the batch job. - """ +def extract_from_dataset( + query_file_path: Path, + output_file_path: str, + dataset_path: Path = settings.JSONL_DATASET_PATH, +) -> None: + """Using SQL queries, extract data from the dataset and save it as a parquet file. - file_name: str = "batch.parquet" - extracted_file_path: str = None + :param query_file_path: Path to the SQL file relative to the job. + :type query_file_path: Path + :param output_file_path: Path to save the extracted data. + :type output_file_path: str + :param dataset_path: Compressed jsonl database, defaults to settings.JSONL_DATASET_PATH + :type dataset_path: Path, optional + """ + if not dataset_path.exists(): + raise FileNotFoundError(f"Dataset path {str(dataset_path)} not found.") + query = _load_query(query_file_path=query_file_path, dataset_path=dataset_path) + _extract_and_save_batch_data(query=query, output_file_path=output_file_path) + logger.debug(f"Batch data succesfully extracted and saved at {output_file_path}") - @classmethod - def extract_from_dataset( - cls, - job_type: BatchJobType, - output_dir: str, - dataset_path: str = str(settings.JSONL_DATASET_PATH), - ) -> None: - """Using SQL queries, extract data from the dataset and save it as a parquet file. - :param job_type: Batch job type. - :type job_type: BatchJobType - :param output_dir: Directory to save the extracted data as a parquet file. - :type output_dir: str - :param dataset_path: Path to the jsonl.gz dataset. - :type dataset_path: Path, optional. Default to settings.JSONL_DATASET_PATH. Mainly used for testing. - """ - if not isinstance(dataset_path, str): - raise ValueError(f"The dataset path should be a string. 
Current type {type(dataset_path)}") - - query_file_path = BATCH_JOB_TYPE_TO_QUERY_FILE_PATH[job_type] - query = cls._load_query(query_file_path=query_file_path, dataset_path=dataset_path) - cls._extract_and_save_batch_data(query=query, output_dir=output_dir) - # We save the file path for later usage in the pipeline - cls.extracted_file_path = os.path.join(output_dir, cls.file_name) - @staticmethod - def _load_query(query_file_path: Path, dataset_path: str) -> str: - """Load the SQL query from a corresponding file. +def _load_query(query_file_path: Path, dataset_path: Path) -> str: + """Load the SQL query from a corresponding file. - :param query_file_path: File path containing the SQL query. - :type query_file_path: Path - :param dataset_path: Path to the jsonl.gz dataset. - :type dataset_path: Path - :raises ValueError: In case the Dataset path is not found in the SQL query. - :return: the SQL/DuckDB query. - :rtype: str - """ - query = query_file_path.read_text() - if "DATASET_PATH" not in query: - raise ValueError( - "The SQL query should contain the string 'DATASET_PATH' to replace it with the dataset path." - ) - query = query.replace("DATASET_PATH", dataset_path) - LOGGER.debug(f"Query used to extract batch from dataset: {query}") - return query + :param query_file_path: Path to the SQL file relative to the job. + :type query_file_path: Path + :param dataset_path: Path to the dataset. + :type dataset_path: Path + :return: SQL query. + """ + query = query_file_path.read_text() + if "DATASET_PATH" not in query: + raise ValueError( + "The SQL query should contain the string 'DATASET_PATH' to replace it with the dataset path." + ) + query = query.replace("DATASET_PATH", str(dataset_path)) + logger.debug(f"Query used to extract batch from dataset: {query}") + return query - @classmethod - def _extract_and_save_batch_data(cls, query: str, output_dir: str) -> None: - """Query and save the data. +def _extract_and_save_batch_data(query: str, output_file_path: str) -> None: + """Query and save the data. - :param query: DuckDB/SQL query. - :type query: str - :param output_dir: Extracted data directory - :type output_dir: str - """ - ( - duckdb - .sql(query) - .write_parquet(os.path.join(output_dir, cls.file_name)) - ) + :param query: SQL query. + :type query: str + :param output_file_path: Path to save the extracted data. + :type output_file_path: str + """ + ( + duckdb + .sql(query) + .write_parquet(output_file_path) + ) diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index a5ca69132c..247f7a6c05 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,15 +1,13 @@ -import abc from typing import List, Optional import yaml import datetime import re +from pathlib import Path from google.cloud import batch_v1 from pydantic import BaseModel, Field, ConfigDict from robotoff import settings -from robotoff.types import BatchJobType -from robotoff.batch.types import BATCH_JOB_TYPE_TO_CONFIG_PATH class GoogleBatchJobConfig(BaseModel): @@ -88,7 +86,7 @@ class GoogleBatchJobConfig(BaseModel): ) @classmethod - def init(cls, job_type: BatchJobType): + def init(cls, job_name: str, config_path: Path) -> "GoogleBatchJobConfig": """Initialize the class with the configuration file corresponding to the job type. :param job_type: Batch job type. 
@@ -96,110 +94,88 @@ def init(cls, job_type: BatchJobType): """ # Batch job name should respect a specific pattern, or returns an error pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$" - if not re.match(pattern, job_type.value): - raise ValueError(f"Job name should respect the pattern: {pattern}. Current job name: {job_type.value}") + if not re.match(pattern, job_name): + raise ValueError(f"Job name should respect the pattern: {pattern}. Current job name: {job_name}") # Generate unique id for the job unique_job_name = ( - job_type.value + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") ) # Load config file from job_type - config_path = BATCH_JOB_TYPE_TO_CONFIG_PATH[job_type] with open(config_path, "r") as f: config = yaml.safe_load(f) return cls(job_name=unique_job_name, **config) -class BatchJob(abc.ABC): - """Abstract class to launch and manage batch jobs: Google, AWS, Azure, Triton...""" +def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: + """This method creates a Batch Job on GCP. - @staticmethod - @abc.abstractmethod - def launch_job() -> str: - """Launch batch job.""" - pass + Sources: + * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create + * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types + + :param google_batch_launch_config: Config to run a job on Google Batch. + :type google_batch_launch_config: GoogleBatchLaunchConfig + :param batch_job_config: Config to run a specific job on Google Batch. + :type batch_job_config: BatchJobConfig + :return: Batch job information. - -class GoogleBatchJob(BatchJob): - """GCP Batch class. It uses the Google Cloud Batch API to launch and manage jobs. - - More information on: - https://cloud.google.com/batch/docs/get-started + Returns: + Batch job information. """ - @staticmethod - def launch_job( - batch_job_config: GoogleBatchJobConfig, - ) -> batch_v1.Job: - """This method creates a Batch Job on GCP. - - Sources: - * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create - * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types - - :param google_batch_launch_config: Config to run a job on Google Batch. - :type google_batch_launch_config: GoogleBatchLaunchConfig - :param batch_job_config: Config to run a specific job on Google Batch. - :type batch_job_config: BatchJobConfig - :return: Batch job information. - :rtype: batch_v1.Job - - Returns: - Batch job information. - """ - - client = batch_v1.BatchServiceClient() - - # Define what will be done as part of the job. - runnable = batch_v1.Runnable() - runnable.container = batch_v1.Runnable.Container() - runnable.container.image_uri = batch_job_config.container_image_uri - runnable.container.entrypoint = batch_job_config.entrypoint - runnable.container.commands = batch_job_config.commands - - # Jobs can be divided into tasks. In this case, we have only one task. - task = batch_v1.TaskSpec() - task.runnables = [runnable] - - # We can specify what resources are requested by each task. 
- resources = batch_v1.ComputeResource() - resources.cpu_milli = batch_job_config.cpu_milli - resources.memory_mib = batch_job_config.memory_mib - resources.boot_disk_mib = batch_job_config.boot_disk_mib - task.compute_resource = resources - - task.max_retry_count = batch_job_config.max_retry_count - task.max_run_duration = batch_job_config.max_run_duration - - # Tasks are grouped inside a job using TaskGroups. - group = batch_v1.TaskGroup() - group.task_count = batch_job_config.task_count - group.task_spec = task - - # Policies are used to define on what kind of virtual machines the tasks will run on. - policy = batch_v1.AllocationPolicy.InstancePolicy() - policy.machine_type = batch_job_config.machine_type - instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate() - instances.install_gpu_drivers = batch_job_config.install_gpu_drivers - instances.policy = policy - allocation_policy = batch_v1.AllocationPolicy() - allocation_policy.instances = [instances] - - accelerator = batch_v1.AllocationPolicy.Accelerator() - accelerator.type_ = batch_job_config.accelerators_type - accelerator.count = batch_job_config.accelerators_count - - job = batch_v1.Job() - job.task_groups = [group] - job.allocation_policy = allocation_policy - # We use Cloud Logging as it's an out of the box available option - job.logs_policy = batch_v1.LogsPolicy() - job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING - - create_request = batch_v1.CreateJobRequest() - create_request.job = job - create_request.job_id = batch_job_config.job_name - # The job's parent is the region in which the job will run - create_request.parent = f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" - - return client.create_job(create_request) + client = batch_v1.BatchServiceClient() + + # Define what will be done as part of the job. + runnable = batch_v1.Runnable() + runnable.container = batch_v1.Runnable.Container() + runnable.container.image_uri = batch_job_config.container_image_uri + runnable.container.entrypoint = batch_job_config.entrypoint + runnable.container.commands = batch_job_config.commands + + # Jobs can be divided into tasks. In this case, we have only one task. + task = batch_v1.TaskSpec() + task.runnables = [runnable] + + # We can specify what resources are requested by each task. + resources = batch_v1.ComputeResource() + resources.cpu_milli = batch_job_config.cpu_milli + resources.memory_mib = batch_job_config.memory_mib + resources.boot_disk_mib = batch_job_config.boot_disk_mib + task.compute_resource = resources + + task.max_retry_count = batch_job_config.max_retry_count + task.max_run_duration = batch_job_config.max_run_duration + + # Tasks are grouped inside a job using TaskGroups. + group = batch_v1.TaskGroup() + group.task_count = batch_job_config.task_count + group.task_spec = task + + # Policies are used to define on what kind of virtual machines the tasks will run on. 
+ policy = batch_v1.AllocationPolicy.InstancePolicy() + policy.machine_type = batch_job_config.machine_type + instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate() + instances.install_gpu_drivers = batch_job_config.install_gpu_drivers + instances.policy = policy + allocation_policy = batch_v1.AllocationPolicy() + allocation_policy.instances = [instances] + + accelerator = batch_v1.AllocationPolicy.Accelerator() + accelerator.type_ = batch_job_config.accelerators_type + accelerator.count = batch_job_config.accelerators_count + + job = batch_v1.Job() + job.task_groups = [group] + job.allocation_policy = allocation_policy + # We use Cloud Logging as it's an out of the box available option + job.logs_policy = batch_v1.LogsPolicy() + job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING + + create_request = batch_v1.CreateJobRequest() + create_request.job = job + create_request.job_id = batch_job_config.job_name + # The job's parent is the region in which the job will run + create_request.parent = f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" + + return client.create_job(create_request) diff --git a/robotoff/batch/types.py b/robotoff/batch/types.py deleted file mode 100644 index c0c452cefd..0000000000 --- a/robotoff/batch/types.py +++ /dev/null @@ -1,35 +0,0 @@ -from robotoff.types import BatchJobType, PredictionType -from robotoff import settings - - -# Bucket structure to enable the batch job to load and upload data -BATCH_JOB_TYPE_TO_BUCKET = { - BatchJobType.ingredients_spellcheck: { - "bucket": "robotoff-spellcheck", - "suffix_preprocess": "data/preprocessed_data.parquet", - "suffix_postprocess": "data/postprocessed_data.parquet", - }, -} - -# Paths batch job config files -BATCH_JOB_TYPE_TO_CONFIG_PATH = { - BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml", -} - -BATCH_JOB_TYPE_TO_QUERY_FILE_PATH = { - BatchJobType.ingredients_spellcheck: settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql", -} - -# Mapping between batch job type and prediction type -BATCH_JOB_TYPE_TO_PREDICTION_TYPE = { - BatchJobType.ingredients_spellcheck: PredictionType.ingredient_spellcheck, -} - -# Column names in the processed batch of data -BATCH_JOB_TYPE_TO_FEATURES = { - BatchJobType.ingredients_spellcheck: { - "barcode": "code", - "value": "correction", - "value_tag": "lang", - }, -} diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index 481b5bb57c..47fe2a0c24 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -1003,7 +1003,7 @@ def launch_batch_job( job_type: str = typer.Argument(..., help="Type of job to launch. Ex: 'ingredients_spellcheck'"), ) -> None: """Launch a batch job.""" - from robotoff.batch import launch_batch_job + from robotoff.batch import launch_batch_job as _launch_batch_job from robotoff.utils import get_logger from robotoff.types import BatchJobType @@ -1012,7 +1012,7 @@ def launch_batch_job( get_logger() job_type = BatchJobType[job_type] - launch_batch_job(job_type) + _launch_batch_job(job_type) def main() -> None: diff --git a/robotoff/utils/buckets.py b/robotoff/utils/buckets.py deleted file mode 100644 index cc92cadfda..0000000000 --- a/robotoff/utils/buckets.py +++ /dev/null @@ -1,39 +0,0 @@ -import pandas as pd -from google.cloud import storage - - -class GoogleStorageBucket: - - @staticmethod - def download_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: - """Download parquet file from Google Storage Bucket. 
- - :param bucket_name: Bucket name - :type bucket_name: str - :param suffix: Path inside the bucket - :type suffix: str - :return: - :rtype: Any - """ - client = storage.Client() - bucket = client.get_bucket(bucket_name) - blob = bucket.blob(suffix) - with blob.open("rb") as f: - return pd.read_parquet(f) - - - @staticmethod - def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: - """Upload file to Google Storage Bucket. - - :param file_path: File path. - :type file_path: str - :param bucket_name: Bucket name. - :type bucket_name: str - :param suffix: Path inside the bucket. - :type suffix: str - """ - client = storage.Client() - bucket = client.get_bucket(bucket_name) - blob = bucket.blob(suffix) - blob.upload_from_filename(filename=file_path) diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index 25f8c94e66..d42297e51f 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -1,43 +1,43 @@ +import os import pytest import tempfile from pathlib import Path -from robotoff.batch import ( - GoogleBatchJobConfig, - BatchJobType, - BatchExtraction, -) +from robotoff.batch import GoogleBatchJobConfig +from robotoff.batch.extraction import extract_from_dataset +from robotoff import settings DIR = Path(__file__).parent -JOB_TYPES = [ - "ingredients_spellcheck", -] +SPELLCHECK_QUERY_FILE_PATH = settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql" +SPELLCHECK_BATCH_JOB_CONFIG_PATH = settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" -# Add future job types here for testing. @pytest.mark.parametrize( - "job_type_str", - JOB_TYPES, + "inputs", + [ + ("ingredients-spellcheck", SPELLCHECK_BATCH_JOB_CONFIG_PATH), + ], ) -def test_batch_job_config_file(job_type_str): +def test_batch_job_config_file(inputs): "Test indirectly the batch job config file by validating with the Pydantic class model." - job_type = BatchJobType[job_type_str] - GoogleBatchJobConfig.init(job_type) + job_name, config_path = inputs + GoogleBatchJobConfig.init(job_name, config_path) -# Add future job types here for testing. @pytest.mark.parametrize( - "job_type_str", - JOB_TYPES, + "query_file_path", + [ + SPELLCHECK_QUERY_FILE_PATH, + ] ) -def test_batch_extraction(job_type_str): +def test_batch_extraction(query_file_path): """Test extraction of a batch of data from the dataset depending on the job type. 
""" - job_type_str = BatchJobType[job_type_str] with tempfile.TemporaryDirectory() as tmp_dir: - BatchExtraction.extract_from_dataset( - job_type=job_type_str, - output_dir=tmp_dir, - dataset_path=str(DIR / "data/dataset_sample.jsonl.gz"), + file_path = os.path.join(tmp_dir, "data.parquet") + extract_from_dataset( + output_file_path=file_path, + query_file_path=SPELLCHECK_QUERY_FILE_PATH, + dataset_path=DIR / "data/dataset_sample.jsonl.gz", ) From fda7b5d50ba95e05ac3034dd712df7b710e75060 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Mon, 2 Sep 2024 17:33:42 +0200 Subject: [PATCH 13/22] style: :sparkles: make lint --- robotoff/app/api.py | 24 ++++++++-------- robotoff/batch/__init__.py | 52 +++++++++++++++++------------------ robotoff/batch/buckets.py | 6 ++-- robotoff/batch/extraction.py | 9 ++---- robotoff/batch/launch.py | 19 ++++++++----- robotoff/cli/main.py | 12 +++++--- robotoff/insights/importer.py | 9 +++--- robotoff/settings.py | 2 +- robotoff/types.py | 7 +++-- tests/unit/test_batch.py | 15 +++++----- 10 files changed, 80 insertions(+), 75 deletions(-) diff --git a/robotoff/app/api.py b/robotoff/app/api.py index e9003929ee..dd71a32f6e 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -26,9 +26,9 @@ from robotoff import settings from robotoff.app import schema from robotoff.app.auth import ( + APITokenError, BasicAuthDecodeError, - APITokenError, - basic_decode, + basic_decode, validate_token, ) from robotoff.app.core import ( @@ -45,6 +45,7 @@ validate_params, ) from robotoff.app.middleware import DBConnectionMiddleware +from robotoff.batch import BatchJobType, import_batch_predictions from robotoff.elasticsearch import get_es_client from robotoff.insights.extraction import ( DEFAULT_OCR_PREDICTION_TYPES, @@ -91,10 +92,6 @@ from robotoff.utils.text import get_tag from robotoff.workers.queues import enqueue_job, get_high_queue, low_queue from robotoff.workers.tasks import download_product_dataset_job -from robotoff.batch import ( - BatchJobType, - import_batch_predictions, -) logger = get_logger() @@ -311,7 +308,7 @@ def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: :param req: Request. :type req: falcon.Request - :param ref_token_name: Secret environment variable name. + :param ref_token_name: Secret environment variable name. :type ref_token_name: str :return: Token valid or not. """ @@ -321,11 +318,13 @@ def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: scheme, token = auth_header.split() except APITokenError: raise falcon.HTTPUnauthorized("Invalid authentication scheme.") - if scheme.lower() != 'bearer': - raise falcon.HTTPUnauthorized("Invalid authentication scheme: 'Bearer Token' expected.") + if scheme.lower() != "bearer": + raise falcon.HTTPUnauthorized( + "Invalid authentication scheme: 'Bearer Token' expected." + ) is_token_valid = validate_token(token, ref_token_name) if not is_token_valid: - raise falcon.HTTPUnauthorized('Invalid token.') + raise falcon.HTTPUnauthorized("Invalid token.") else: return True @@ -1779,14 +1778,13 @@ def on_get(self, req: falcon.Request, resp: falcon.Response): resp.media = response - class BatchJobImportResource: def on_post(self, req: falcon.Request, resp: falcon.Response): job_type_str: str = req.get_param("job_type", required=True) try: job_type = BatchJobType[job_type_str] - except KeyError: + except KeyError: raise falcon.HTTPBadRequest( description=f"invalid job_type: {job_type_str}. 
Valid job_types are: {[elt.value for elt in BatchJobType]}" ) @@ -1804,7 +1802,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): ) logger.info("Batch import %s has been queued.", job_type) - + class RobotsTxtResource: def on_get(self, req: falcon.Request, resp: falcon.Response): # Disallow completely indexation: otherwise web crawlers send millions diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 27e4b5ae80..c78b839315 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,21 +1,15 @@ import os import tempfile -from robotoff.utils import get_logger -from robotoff.types import ( - BatchJobType, - Prediction, - ServerType, -) -from robotoff.models import db -from robotoff.insights.importer import import_insights from robotoff import settings -from robotoff.types import PredictionType +from robotoff.insights.importer import import_insights +from robotoff.models import db +from robotoff.types import BatchJobType, Prediction, PredictionType, ServerType +from robotoff.utils import get_logger -from .launch import launch_job, GoogleBatchJobConfig +from .buckets import fetch_dataframe_from_gcs, upload_file_to_gcs from .extraction import extract_from_dataset -from .buckets import upload_file_to_gcs, fetch_dataframe_from_gcs - +from .launch import GoogleBatchJobConfig, launch_job logger = get_logger(__name__) @@ -28,7 +22,7 @@ def launch_batch_job(job_type: BatchJobType) -> None: launch_spellcheck_batch_job() else: raise NotImplementedError(f"Batch job type {job_type} not implemented.") - + def import_batch_predictions(job_type: BatchJobType) -> None: """Import batch predictions once the job finished. @@ -41,12 +35,13 @@ def import_batch_predictions(job_type: BatchJobType) -> None: def launch_spellcheck_batch_job() -> None: - """Launch spellcheck batch job. - """ + """Launch spellcheck batch job.""" # Init JOB_NAME = "ingredients-spellcheck" QUERY_FILE_PATH = settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql" - BATCH_JOB_CONFIG_PATH = settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" + BATCH_JOB_CONFIG_PATH = ( + settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" + ) BUCKET_NAME = "robotoff-spellcheck" SUFFIX_PREPROCESS = "data/preprocessed_data.parquet" @@ -56,29 +51,35 @@ def launch_spellcheck_batch_job() -> None: extract_from_dataset(QUERY_FILE_PATH, file_path) # Upload the extracted file to the bucket - upload_file_to_gcs(file_path=file_path, bucket_name=BUCKET_NAME, suffix=SUFFIX_PREPROCESS) + upload_file_to_gcs( + file_path=file_path, bucket_name=BUCKET_NAME, suffix=SUFFIX_PREPROCESS + ) logger.debug(f"File uploaded to the bucket {BUCKET_NAME}/{SUFFIX_PREPROCESS}") # Launch batch job - batch_job_config = GoogleBatchJobConfig.init(job_name=JOB_NAME, config_path=BATCH_JOB_CONFIG_PATH) + batch_job_config = GoogleBatchJobConfig.init( + job_name=JOB_NAME, config_path=BATCH_JOB_CONFIG_PATH + ) batch_job = launch_job(batch_job_config=batch_job_config) logger.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") def import_spellcheck_batch_predictions() -> None: - """Import spellcheck predictions from remote storage. - """ + """Import spellcheck predictions from remote storage.""" # Init BUCKET_NAME = "robotoff-spellcheck" SUFFIX_POSTPROCESS = "data/postprocessed_data.parquet" PREDICTION_TYPE = PredictionType.ingredient_spellcheck - PREDICTOR_VERSION = "1" #TODO: shard HF model version instead of manual change? 
+ PREDICTOR_VERSION = "1" # TODO: shard HF model version instead of manual change? PREDICTOR = "fine-tuned-mistral-7b" SERVER_TYPE = ServerType.off - - df = fetch_dataframe_from_gcs(bucket_name=BUCKET_NAME, suffix_postprocess=SUFFIX_POSTPROCESS) - logger.debug(f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}") + df = fetch_dataframe_from_gcs( + bucket_name=BUCKET_NAME, suffix_postprocess=SUFFIX_POSTPROCESS + ) + logger.debug( + f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}" + ) # Generate predictions predictions = [] @@ -97,7 +98,6 @@ def import_spellcheck_batch_predictions() -> None: # Store predictions and insights with db: import_results = import_insights( - predictions=predictions, - server_type=SERVER_TYPE + predictions=predictions, server_type=SERVER_TYPE ) logger.info("Batch import results: %s", import_results) diff --git a/robotoff/batch/buckets.py b/robotoff/batch/buckets.py index 77ae4f4ba0..8a90d657c8 100644 --- a/robotoff/batch/buckets.py +++ b/robotoff/batch/buckets.py @@ -32,8 +32,10 @@ def fetch_dataframe_from_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: bucket = client.get_bucket(bucket_name) blob = bucket.blob(suffix) with blob.open("rb") as f: - try: + try: df = pd.read_parquet(f) except Exception as e: - raise ValueError(f"Could not read parquet file from {bucket_name}/{suffix}. Error: {e}") + raise ValueError( + f"Could not read parquet file from {bucket_name}/{suffix}. Error: {e}" + ) return df diff --git a/robotoff/batch/extraction.py b/robotoff/batch/extraction.py index 2013e3fa26..3f3637e9a8 100644 --- a/robotoff/batch/extraction.py +++ b/robotoff/batch/extraction.py @@ -5,7 +5,6 @@ from robotoff import settings from robotoff.utils import get_logger - logger = get_logger(__name__) @@ -30,7 +29,6 @@ def extract_from_dataset( logger.debug(f"Batch data succesfully extracted and saved at {output_file_path}") - def _load_query(query_file_path: Path, dataset_path: Path) -> str: """Load the SQL query from a corresponding file. @@ -49,6 +47,7 @@ def _load_query(query_file_path: Path, dataset_path: Path) -> str: logger.debug(f"Query used to extract batch from dataset: {query}") return query + def _extract_and_save_batch_data(query: str, output_file_path: str) -> None: """Query and save the data. @@ -57,8 +56,4 @@ def _extract_and_save_batch_data(query: str, output_file_path: str) -> None: :param output_file_path: Path to save the extracted data. :type output_file_path: str """ - ( - duckdb - .sql(query) - .write_parquet(output_file_path) - ) + (duckdb.sql(query).write_parquet(output_file_path)) diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index 247f7a6c05..22428991c6 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,17 +1,18 @@ -from typing import List, Optional -import yaml import datetime import re from pathlib import Path +from typing import List, Optional +import yaml from google.cloud import batch_v1 -from pydantic import BaseModel, Field, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from robotoff import settings class GoogleBatchJobConfig(BaseModel): """Batch job configuration class.""" + # By default, extra fields are just ignored. We raise an error in case of extra fields. 
model_config: ConfigDict = {"extra": "forbid"} @@ -95,8 +96,10 @@ def init(cls, job_name: str, config_path: Path) -> "GoogleBatchJobConfig": # Batch job name should respect a specific pattern, or returns an error pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$" if not re.match(pattern, job_name): - raise ValueError(f"Job name should respect the pattern: {pattern}. Current job name: {job_name}") - + raise ValueError( + f"Job name should respect the pattern: {pattern}. Current job name: {job_name}" + ) + # Generate unique id for the job unique_job_name = ( job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") @@ -113,7 +116,7 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: Sources: * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create * https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types - + :param google_batch_launch_config: Config to run a job on Google Batch. :type google_batch_launch_config: GoogleBatchLaunchConfig :param batch_job_config: Config to run a specific job on Google Batch. @@ -176,6 +179,8 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: create_request.job = job create_request.job_id = batch_job_config.job_name # The job's parent is the region in which the job will run - create_request.parent = f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" + create_request.parent = ( + f"projects/{settings.GOOGLE_PROJECT_NAME}/locations/{batch_job_config.location}" + ) return client.create_job(create_request) diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index 47fe2a0c24..ef742bb9dc 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -1000,16 +1000,20 @@ def create_migration( @app.command() def launch_batch_job( - job_type: str = typer.Argument(..., help="Type of job to launch. Ex: 'ingredients_spellcheck'"), + job_type: str = typer.Argument( + ..., help="Type of job to launch. Ex: 'ingredients_spellcheck'" + ), ) -> None: """Launch a batch job.""" from robotoff.batch import launch_batch_job as _launch_batch_job - from robotoff.utils import get_logger from robotoff.types import BatchJobType + from robotoff.utils import get_logger if job_type not in BatchJobType.__members__: - raise ValueError(f"Invalid job type: {job_type}. Must be one of those: {[job.name for job in BatchJobType]}") - + raise ValueError( + f"Invalid job type: {job_type}. 
Must be one of those: {[job.name for job in BatchJobType]}" + ) + get_logger() job_type = BatchJobType[job_type] _launch_batch_job(job_type) diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index 029f9aefcd..5c7b142524 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1480,7 +1480,7 @@ class IngredientSpellcheckImporter(InsightImporter): @staticmethod def get_type() -> InsightType: return InsightType.ingredient_spellcheck - + @classmethod def get_required_prediction_types(cls) -> set[PredictionType]: return {PredictionType.ingredient_spellcheck} @@ -1495,15 +1495,14 @@ def generate_candidates( # Only one prediction for candidate in predictions: yield ProductInsight(**candidate.to_dict()) - + @classmethod def is_conflicting_insight( - cls, - candidate: ProductInsight, - reference: ProductInsight + cls, candidate: ProductInsight, reference: ProductInsight ) -> bool: candidate.value_tag == reference.value_tag + class PackagingElementTaxonomyException(Exception): pass diff --git a/robotoff/settings.py b/robotoff/settings.py index 4db6f20126..be5669f406 100644 --- a/robotoff/settings.py +++ b/robotoff/settings.py @@ -359,4 +359,4 @@ def get_package_version() -> str: CROP_ALLOWED_DOMAINS = os.environ.get("CROP_ALLOWED_DOMAINS", "").split(",") # Batch jobs -GOOGLE_PROJECT_NAME= "robotoff" \ No newline at end of file +GOOGLE_PROJECT_NAME = "robotoff" diff --git a/robotoff/types.py b/robotoff/types.py index 52704e0ec5..9f15b1a1fd 100644 --- a/robotoff/types.py +++ b/robotoff/types.py @@ -359,8 +359,9 @@ class PackagingElementProperty(enum.Enum): InsightAnnotation = Literal[-1, 0, 1, 2] + @enum.unique class BatchJobType(enum.Enum): - """Each job type correspond to a task that will be executed in the batch job. - """ - ingredients_spellcheck = "ingredients-spellcheck" \ No newline at end of file + """Each job type correspond to a task that will be executed in the batch job.""" + + ingredients_spellcheck = "ingredients-spellcheck" diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index d42297e51f..0698835c6c 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -1,16 +1,18 @@ import os -import pytest import tempfile from pathlib import Path +import pytest + +from robotoff import settings from robotoff.batch import GoogleBatchJobConfig from robotoff.batch.extraction import extract_from_dataset -from robotoff import settings - DIR = Path(__file__).parent SPELLCHECK_QUERY_FILE_PATH = settings.BATCH_JOB_CONFIG_DIR / "sql/spellcheck.sql" -SPELLCHECK_BATCH_JOB_CONFIG_PATH = settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" +SPELLCHECK_BATCH_JOB_CONFIG_PATH = ( + settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" +) @pytest.mark.parametrize( @@ -29,11 +31,10 @@ def test_batch_job_config_file(inputs): "query_file_path", [ SPELLCHECK_QUERY_FILE_PATH, - ] + ], ) def test_batch_extraction(query_file_path): - """Test extraction of a batch of data from the dataset depending on the job type. 
- """ + """Test extraction of a batch of data from the dataset depending on the job type.""" with tempfile.TemporaryDirectory() as tmp_dir: file_path = os.path.join(tmp_dir, "data.parquet") extract_from_dataset( From f8ed76aa222b6d3e7d57e05b20e1764b22ea0ebd Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 3 Sep 2024 10:16:05 +0200 Subject: [PATCH 14/22] fix: :bug: Fixed bug & Better error handling with Falcon --- robotoff/app/api.py | 35 +++++++++++++++++------------------ robotoff/app/auth.py | 6 +----- robotoff/batch/__init__.py | 2 +- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/robotoff/app/api.py b/robotoff/app/api.py index dd71a32f6e..ab27b0483e 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -26,7 +26,6 @@ from robotoff import settings from robotoff.app import schema from robotoff.app.auth import ( - APITokenError, BasicAuthDecodeError, basic_decode, validate_token, @@ -45,7 +44,7 @@ validate_params, ) from robotoff.app.middleware import DBConnectionMiddleware -from robotoff.batch import BatchJobType, import_batch_predictions +from robotoff.batch import import_batch_predictions from robotoff.elasticsearch import get_es_client from robotoff.insights.extraction import ( DEFAULT_OCR_PREDICTION_TYPES, @@ -86,6 +85,7 @@ PredictionType, ProductIdentifier, ServerType, + BatchJobType, ) from robotoff.utils import get_image_from_url, get_logger, http_session from robotoff.utils.i18n import TranslationStore @@ -315,18 +315,18 @@ def parse_valid_token(req: falcon.Request, ref_token_name: str) -> bool: auth_header = req.get_header("Authorization", required=True) try: - scheme, token = auth_header.split() - except APITokenError: + scheme, token = auth_header.strip().split() + if scheme.lower() != "bearer": + raise falcon.HTTPUnauthorized( + "Invalid authentication scheme: 'Bearer Token' expected." + ) + is_token_valid = validate_token(token, ref_token_name) + if not is_token_valid: + raise falcon.HTTPUnauthorized("Invalid token.") + else: + return True + except ValueError: raise falcon.HTTPUnauthorized("Invalid authentication scheme.") - if scheme.lower() != "bearer": - raise falcon.HTTPUnauthorized( - "Invalid authentication scheme: 'Bearer Token' expected." - ) - is_token_valid = validate_token(token, ref_token_name) - if not is_token_valid: - raise falcon.HTTPUnauthorized("Invalid token.") - else: - return True def device_id_from_request(req: falcon.Request) -> str: @@ -1788,7 +1788,7 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): raise falcon.HTTPBadRequest( description=f"invalid job_type: {job_type_str}. Valid job_types are: {[elt.value for elt in BatchJobType]}" ) - # We secure the endpoint + # We secure the endpoint. if parse_valid_token(req, "batch_job_key"): enqueue_job( import_batch_predictions, @@ -1796,12 +1796,11 @@ def on_post(self, req: falcon.Request, resp: falcon.Response): queue=low_queue, job_kwargs={"timeout": "30m"}, ) - else: - raise falcon.HTTPForbidden( - description="Invalid batch_job_key. Be sure to indicate the authentification key in the request." - ) logger.info("Batch import %s has been queued.", job_type) + resp.media = {"status": "Request successful. 
Importing processed data."} + resp.status = falcon.HTTP_200 + class RobotsTxtResource: def on_get(self, req: falcon.Request, resp: falcon.Response): diff --git a/robotoff/app/auth.py b/robotoff/app/auth.py index 5eef036497..c95e2365b1 100644 --- a/robotoff/app/auth.py +++ b/robotoff/app/auth.py @@ -7,10 +7,6 @@ class BasicAuthDecodeError(Exception): pass -class APITokenError(Exception): - pass - - def basic_decode(encoded_str: str) -> tuple[str, str]: """Decode an encrypted HTTP basic authentication string. Returns a tuple of the form (username, password), and raises a BasicAuthDecodeError exception @@ -56,5 +52,5 @@ def validate_token(token: str, ref_token_name: str) -> bool: """ api_token = os.getenv(ref_token_name.upper()) if not api_token: - raise APITokenError("API token not set in environment variables.") + raise ValueError("API token not set in environment variables.") return token == api_token diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index c78b839315..2e6d130d58 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -75,7 +75,7 @@ def import_spellcheck_batch_predictions() -> None: SERVER_TYPE = ServerType.off df = fetch_dataframe_from_gcs( - bucket_name=BUCKET_NAME, suffix_postprocess=SUFFIX_POSTPROCESS + bucket_name=BUCKET_NAME, suffix=SUFFIX_POSTPROCESS ) logger.debug( f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}" From 85b7bfb672cef349be306af255db021543393a4f Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 3 Sep 2024 15:55:12 +0200 Subject: [PATCH 15/22] feat: :ambulance: Changes Enhance batch extraction with popularity_key - Add env variables to batch job - Add make deploy Spellcheck job to Artifact registry --- Makefile | 21 ++++++++++- batch/spellcheck/main.py | 2 +- robotoff/app/api.py | 8 ++--- robotoff/batch/__init__.py | 13 ++++--- .../batch/configs/job_configs/spellcheck.yaml | 2 +- robotoff/batch/configs/sql/spellcheck.sql | 13 +++---- robotoff/batch/launch.py | 36 ++++++++++++++++--- tests/unit/test_batch.py | 24 +++++++------ 8 files changed, 84 insertions(+), 35 deletions(-) diff --git a/Makefile b/Makefile index 4c67f91389..9b16983839 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,12 @@ DOCKER_COMPOSE=docker compose --env-file=${ENV_FILE} DOCKER_COMPOSE_TEST=COMPOSE_PROJECT_NAME=robotoff_test COMMON_NET_NAME=po_test docker compose --env-file=${ENV_FILE} ML_OBJECT_DETECTION_MODELS := tf-universal-logo-detector tf-nutrition-table tf-nutriscore +# Spellcheck +IMAGE_NAME = spellcheck-batch-vllm +TAG = latest +GCLOUD_LOCATION = europe-west9-docker.pkg.dev +REGISTRY = ${GCLOUD_LOCATION}/robotoff/gcf-artifacts + .DEFAULT_GOAL := dev # avoid target corresponding to file names, to depends on them .PHONY: * @@ -290,4 +296,17 @@ create-migration: guard-args # create network if not exists create-po-default-network: - docker network create po_default || true + docker network create po_default || true + +# Spellcheck +build-spellcheck: + docker build -f batch/spellcheck/Dockerfile -t $(IMAGE_NAME):$(TAG) batch/spellcheck + +# Push the image to the registry +push-spellcheck: + docker tag $(IMAGE_NAME):$(TAG) $(REGISTRY)/$(IMAGE_NAME):$(TAG) + docker push $(REGISTRY)/$(IMAGE_NAME):$(TAG) + +# Build and push in one command +deploy-spellcheck: + build-spellcheck push-spellcheck diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 6c73648c7e..34c7b98c1d 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -161,7 +161,7 @@ def run_robotoff_endpoint_batch_import(): 
url = "https://robotoff.openfoodfacts.org/api/v1/batch/import" data = {"job_type": "ingredients_spellcheck"} headers = { - "Authorization": f"Bearer {os.getenv("BATCH_JOB_KEY")}", + "Authorization": f"Bearer {os.getenv('BATCH_JOB_KEY')}", "Content-Type": "application/json" } try: diff --git a/robotoff/app/api.py b/robotoff/app/api.py index ab27b0483e..8066bf6034 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -25,11 +25,7 @@ from robotoff import settings from robotoff.app import schema -from robotoff.app.auth import ( - BasicAuthDecodeError, - basic_decode, - validate_token, -) +from robotoff.app.auth import BasicAuthDecodeError, basic_decode, validate_token from robotoff.app.core import ( SkipVotedOn, SkipVotedType, @@ -79,13 +75,13 @@ from robotoff.products import get_image_id, get_product, get_product_dataset_etag from robotoff.taxonomy import is_prefixed_value, match_taxonomized_value from robotoff.types import ( + BatchJobType, InsightType, JSONType, NeuralCategoryClassifierModel, PredictionType, ProductIdentifier, ServerType, - BatchJobType, ) from robotoff.utils import get_image_from_url, get_logger, http_session from robotoff.utils.i18n import TranslationStore diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 2e6d130d58..4a05303193 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -44,7 +44,9 @@ def launch_spellcheck_batch_job() -> None: ) BUCKET_NAME = "robotoff-spellcheck" SUFFIX_PREPROCESS = "data/preprocessed_data.parquet" + ENV_NAMES = ["BATCH_JOB_KEY"] + logger.info("Extract batch from dataset.") # Extract data from dataset with tempfile.TemporaryDirectory() as tmp_dir: file_path = os.path.join(tmp_dir, "batch_data.parquet") @@ -58,10 +60,13 @@ def launch_spellcheck_batch_job() -> None: # Launch batch job batch_job_config = GoogleBatchJobConfig.init( - job_name=JOB_NAME, config_path=BATCH_JOB_CONFIG_PATH + job_name=JOB_NAME, + config_path=BATCH_JOB_CONFIG_PATH, + env_names=ENV_NAMES, ) + logger.info("Batch job config: %s", batch_job_config) batch_job = launch_job(batch_job_config=batch_job_config) - logger.info(f"Batch job succesfully launched. Batch job name: {batch_job.name}.") + logger.info("Batch job succesfully launched. 
Batch job %s", batch_job) def import_spellcheck_batch_predictions() -> None: @@ -74,9 +79,7 @@ def import_spellcheck_batch_predictions() -> None: PREDICTOR = "fine-tuned-mistral-7b" SERVER_TYPE = ServerType.off - df = fetch_dataframe_from_gcs( - bucket_name=BUCKET_NAME, suffix=SUFFIX_POSTPROCESS - ) + df = fetch_dataframe_from_gcs(bucket_name=BUCKET_NAME, suffix=SUFFIX_POSTPROCESS) logger.debug( f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}" ) diff --git a/robotoff/batch/configs/job_configs/spellcheck.yaml b/robotoff/batch/configs/job_configs/spellcheck.yaml index 18562f6f09..0fc435fdaf 100644 --- a/robotoff/batch/configs/job_configs/spellcheck.yaml +++ b/robotoff/batch/configs/job_configs/spellcheck.yaml @@ -3,7 +3,7 @@ cpu_milli: 1000 memory_mib: 32000 boot_disk_mib: 100000 max_retry_count: 1 -max_run_duration: "3600s" +max_run_duration: "54000s" # 15 hours task_count: "1" parallelism: "1" machine_type: "g2-standard-8" diff --git a/robotoff/batch/configs/sql/spellcheck.sql b/robotoff/batch/configs/sql/spellcheck.sql index 0cfebcb09a..f3ca49970c 100644 --- a/robotoff/batch/configs/sql/spellcheck.sql +++ b/robotoff/batch/configs/sql/spellcheck.sql @@ -1,12 +1,13 @@ -SELECT -code, -ingredients_text AS text, -product_name, +SELECT +code, +ingredients_text AS text, +product_name, lang, +popularity_key, (CAST(unknown_ingredients_n AS FLOAT) / CAST(ingredients_n AS FLOAT)) AS fraction FROM read_ndjson('DATASET_PATH', ignore_errors=True) WHERE ingredients_text NOT LIKE '' AND fraction > 0 AND fraction <= 0.4 -ORDER BY random() -LIMIT 100 +ORDER BY popularity_key DESC +LIMIT 10000 ; \ No newline at end of file diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index 22428991c6..b61d8a430f 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,7 +1,8 @@ import datetime +import os import re from pathlib import Path -from typing import List, Optional +from typing import Dict, Iterable, List, Optional import yaml from google.cloud import batch_v1 @@ -85,13 +86,26 @@ class GoogleBatchJobConfig(BaseModel): default=True, description="Required if GPUs.", ) + env_variables: Dict[str, str] = Field( + description="Environment variables to pass during the batch job.", + default_factory=dict, + ) @classmethod - def init(cls, job_name: str, config_path: Path) -> "GoogleBatchJobConfig": + def init( + cls, + job_name: str, + config_path: Path, + env_names: Optional[Iterable[str]] = None, + ) -> "GoogleBatchJobConfig": """Initialize the class with the configuration file corresponding to the job type. - :param job_type: Batch job type. - :type job_type: BatchJobType + :param job_name: Name of the job. + :type job_name: str + :param config_path: Path to the configuration file. + :type config_path: Path + :param env_variables: List of environment variables to add to the job, defaults to None. 
+ :type env_variables: Optional[Iterable[str]], optional """ # Batch job name should respect a specific pattern, or returns an error pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$" @@ -104,10 +118,17 @@ def init(cls, job_name: str, config_path: Path) -> "GoogleBatchJobConfig": unique_job_name = ( job_name + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") ) + + # Environment variables + if not env_names: + env_variables = {} + else: + env_variables = {var_name: os.getenv(var_name) for var_name in env_names} + # Load config file from job_type with open(config_path, "r") as f: config = yaml.safe_load(f) - return cls(job_name=unique_job_name, **config) + return cls(job_name=unique_job_name, env_variables=env_variables, **config) def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: @@ -140,6 +161,11 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: task = batch_v1.TaskSpec() task.runnables = [runnable] + # Environment variables. + envable = batch_v1.Environment() + envable.variables = batch_job_config.env_variables + task.environment = envable + # We can specify what resources are requested by each task. resources = batch_v1.ComputeResource() resources.cpu_milli = batch_job_config.cpu_milli diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index 0698835c6c..7f4adbdaa3 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -14,26 +14,30 @@ settings.BATCH_JOB_CONFIG_DIR / "job_configs/spellcheck.yaml" ) +os.environ["KEY"] = "value" + @pytest.mark.parametrize( "inputs", [ - ("ingredients-spellcheck", SPELLCHECK_BATCH_JOB_CONFIG_PATH), + ( + "ingredients-spellcheck", + SPELLCHECK_BATCH_JOB_CONFIG_PATH, + ["KEY"] + ), ], ) def test_batch_job_config_file(inputs): "Test indirectly the batch job config file by validating with the Pydantic class model." - job_name, config_path = inputs - GoogleBatchJobConfig.init(job_name, config_path) + job_name, config_path, env_names = inputs + GoogleBatchJobConfig.init( + job_name=job_name, + config_path=config_path, + env_names=env_names, + ) -@pytest.mark.parametrize( - "query_file_path", - [ - SPELLCHECK_QUERY_FILE_PATH, - ], -) -def test_batch_extraction(query_file_path): +def test_batch_extraction(): """Test extraction of a batch of data from the dataset depending on the job type.""" with tempfile.TemporaryDirectory() as tmp_dir: file_path = os.path.join(tmp_dir, "data.parquet") From 31ce875cf78da563fa7d55a43f8bbea696ae490f Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 3 Sep 2024 20:10:57 +0200 Subject: [PATCH 16/22] feat: :ambulance: Credential + Importer --- robotoff/batch/launch.py | 3 ++- robotoff/insights/importer.py | 42 +++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index b61d8a430f..1b4a597f4a 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -147,7 +147,8 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: Returns: Batch job information. """ - + # https://cloud.google.com/iam/docs/service-account-overview + # batch_v1.BatchServiceClient.from_service_account_info(info=json.loads(os.getenv("GOOGLE_CREDENTIALS"))) client = batch_v1.BatchServiceClient() # Define what will be done as part of the job. 
diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index 5c7b142524..668d0b5599 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1484,7 +1484,7 @@ def get_type() -> InsightType: @classmethod def get_required_prediction_types(cls) -> set[PredictionType]: return {PredictionType.ingredient_spellcheck} - + @classmethod def generate_candidates( cls, @@ -1492,16 +1492,50 @@ def generate_candidates( predictions: list[Prediction], product_id: ProductIdentifier, ) -> Iterator[ProductInsight]: - # Only one prediction - for candidate in predictions: - yield ProductInsight(**candidate.to_dict()) + yield from ( + ProductInsight(**prediction.to_dict()) + for prediction in predictions + if cls._keep_prediction(prediction, product_id) + ) @classmethod def is_conflicting_insight( cls, candidate: ProductInsight, reference: ProductInsight ) -> bool: + # Same language candidate.value_tag == reference.value_tag + @classmethod + def _keep_prediction( + cls, + prediction: Prediction, + product_id: ProductIdentifier + ) -> bool: + conditions = [ + prediction.data["original"] != prediction.data["correction"], + cls._has_changed(prediction, product_id), + ] + return all(conditions) + + @staticmethod + def _has_changed( + prediction: Prediction, + product_id: ProductIdentifier + ) -> bool: + """Check if the lists of ingredients has changed since the last insight.""" + if not ProductInsight.select().where( + ProductInsight.barcode == product_id.barcode, + ProductInsight.server_type == product_id.server_type.name, + ).exists(): + return True + else: + return ProductInsight.select().where( + ProductInsight.barcode == product_id.barcode, + ProductInsight.server_type == product_id.server_type.name, + ProductInsight.type == InsightType.ingredient_spellcheck, + ProductInsight.data["original"] != prediction.data["original"], + ).exists() + class PackagingElementTaxonomyException(Exception): pass From 7c92836e61954b5d8339377c8c69b8ea5f270e95 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 4 Sep 2024 12:19:24 +0200 Subject: [PATCH 17/22] feat: :ambulance: Credentials + Importer + Test --- .gitignore | 2 -- credentials/.gitkeep | 0 docker-compose.yml | 4 ++-- robotoff/batch/__init__.py | 12 ++++++++++-- robotoff/batch/launch.py | 22 ++++++++++++++++++++-- robotoff/insights/importer.py | 33 +++++++-------------------------- robotoff/products.py | 4 ++++ tests/unit/test_batch.py | 6 +----- 8 files changed, 44 insertions(+), 39 deletions(-) delete mode 100644 credentials/.gitkeep diff --git a/.gitignore b/.gitignore index 0443dcd510..3a4dd3e70a 100644 --- a/.gitignore +++ b/.gitignore @@ -43,5 +43,3 @@ site/ gh_pages/ doc/README.md doc/references/cli.md - -credentials \ No newline at end of file diff --git a/credentials/.gitkeep b/credentials/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docker-compose.yml b/docker-compose.yml index 1fe85dacd2..1b2edebd5b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,7 +4,6 @@ x-robotoff-base-volumes: - ./cache:/opt/robotoff/cache - ./datasets:/opt/robotoff/datasets - ./models:/opt/robotoff/models - - ./credentials:/opt/credentials - robotoff_tmp:/tmp x-robotoff-base: @@ -55,8 +54,9 @@ x-robotoff-base-env: IMAGE_MODERATION_SERVICE_URL: CROP_ALLOWED_DOMAINS: NUM_RQ_WORKERS: 4 # Update worker service command accordingly if you change this settings - GOOGLE_APPLICATION_CREDENTIALS: /opt/credentials/google/application_default_credentials.json + GOOGLE_APPLICATION_CREDENTIALS: 
/opt/robotoff/credentials/google/credentials.json GOOGLE_CLOUD_PROJECT: "robotoff" + GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable BATCH_JOB_KEY: # Secure Batch job import with a token key x-robotoff-worker-base: diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py index 4a05303193..937e538d1b 100644 --- a/robotoff/batch/__init__.py +++ b/robotoff/batch/__init__.py @@ -1,3 +1,4 @@ +import datetime import os import tempfile @@ -9,7 +10,7 @@ from .buckets import fetch_dataframe_from_gcs, upload_file_to_gcs from .extraction import extract_from_dataset -from .launch import GoogleBatchJobConfig, launch_job +from .launch import GoogleBatchJobConfig, check_google_credentials, launch_job logger = get_logger(__name__) @@ -46,6 +47,8 @@ def launch_spellcheck_batch_job() -> None: SUFFIX_PREPROCESS = "data/preprocessed_data.parquet" ENV_NAMES = ["BATCH_JOB_KEY"] + check_google_credentials() + logger.info("Extract batch from dataset.") # Extract data from dataset with tempfile.TemporaryDirectory() as tmp_dir: @@ -75,10 +78,15 @@ def import_spellcheck_batch_predictions() -> None: BUCKET_NAME = "robotoff-spellcheck" SUFFIX_POSTPROCESS = "data/postprocessed_data.parquet" PREDICTION_TYPE = PredictionType.ingredient_spellcheck - PREDICTOR_VERSION = "1" # TODO: shard HF model version instead of manual change? + # We increment to allow import_insights to create a new version + PREDICTOR_VERSION = ( + "batch-job" + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + ) PREDICTOR = "fine-tuned-mistral-7b" SERVER_TYPE = ServerType.off + check_google_credentials() + df = fetch_dataframe_from_gcs(bucket_name=BUCKET_NAME, suffix=SUFFIX_POSTPROCESS) logger.debug( f"Batch data downloaded from bucket {BUCKET_NAME}/{SUFFIX_POSTPROCESS}" diff --git a/robotoff/batch/launch.py b/robotoff/batch/launch.py index 1b4a597f4a..33fff6154c 100644 --- a/robotoff/batch/launch.py +++ b/robotoff/batch/launch.py @@ -1,4 +1,5 @@ import datetime +import json import os import re from pathlib import Path @@ -9,6 +10,25 @@ from pydantic import BaseModel, ConfigDict, Field from robotoff import settings +from robotoff.utils import get_logger + +logger = get_logger(__name__) + + +def check_google_credentials() -> None: + """Create google credentials from variable if doesn't exist""" + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if not credentials_path: + raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is not set") + if not os.path.exists(credentials_path): + logger.info( + "No google credentials found at %s. Creating credentials from GOOGLE_CREDENTIALS.", + credentials_path, + ) + os.makedirs(os.path.dirname(credentials_path), exist_ok=True) + credentials = json.loads(os.getenv("GOOGLE_CREDENTIALS")) + with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS"), "w") as f: + json.dump(credentials, f, indent=4) class GoogleBatchJobConfig(BaseModel): @@ -147,8 +167,6 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job: Returns: Batch job information. """ - # https://cloud.google.com/iam/docs/service-account-overview - # batch_v1.BatchServiceClient.from_service_account_info(info=json.loads(os.getenv("GOOGLE_CREDENTIALS"))) client = batch_v1.BatchServiceClient() # Define what will be done as part of the job. 
diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index 668d0b5599..6ca442e304 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1484,7 +1484,7 @@ def get_type() -> InsightType: @classmethod def get_required_prediction_types(cls) -> set[PredictionType]: return {PredictionType.ingredient_spellcheck} - + @classmethod def generate_candidates( cls, @@ -1495,7 +1495,7 @@ def generate_candidates( yield from ( ProductInsight(**prediction.to_dict()) for prediction in predictions - if cls._keep_prediction(prediction, product_id) + if cls._keep_prediction(prediction=prediction, product=product) ) @classmethod @@ -1507,35 +1507,16 @@ def is_conflicting_insight( @classmethod def _keep_prediction( - cls, - prediction: Prediction, - product_id: ProductIdentifier + cls, prediction: Prediction, product: Optional[Product] ) -> bool: conditions = [ + # Spellcheck didn't correct prediction.data["original"] != prediction.data["correction"], - cls._has_changed(prediction, product_id), + # Modification of the original ingredients between two dataset dumps (24-hour period) + product is None or prediction.data["original"] != product.ingredients_text, ] return all(conditions) - - @staticmethod - def _has_changed( - prediction: Prediction, - product_id: ProductIdentifier - ) -> bool: - """Check if the lists of ingredients has changed since the last insight.""" - if not ProductInsight.select().where( - ProductInsight.barcode == product_id.barcode, - ProductInsight.server_type == product_id.server_type.name, - ).exists(): - return True - else: - return ProductInsight.select().where( - ProductInsight.barcode == product_id.barcode, - ProductInsight.server_type == product_id.server_type.name, - ProductInsight.type == InsightType.ingredient_spellcheck, - ProductInsight.data["original"] != prediction.data["original"], - ).exists() - + class PackagingElementTaxonomyException(Exception): pass diff --git a/robotoff/products.py b/robotoff/products.py index efaec716f9..b7c1dae89c 100644 --- a/robotoff/products.py +++ b/robotoff/products.py @@ -417,6 +417,7 @@ class Product: "image_ids", "packagings", "lang", + "ingredients_text", ) def __init__(self, product: JSONType): @@ -439,6 +440,7 @@ def __init__(self, product: JSONType): else list(key for key in self.images.keys() if key.isdigit()) ) self.lang: Optional[str] = product.get("lang") + self.ingredients_text: Optional[str] = product.get("ingredients_text") @staticmethod def get_fields(): @@ -454,6 +456,8 @@ def get_fields(): "stores_tags", "unique_scans_n", "images", + "lang", + "ingredients_text", } diff --git a/tests/unit/test_batch.py b/tests/unit/test_batch.py index 7f4adbdaa3..89a440fff0 100644 --- a/tests/unit/test_batch.py +++ b/tests/unit/test_batch.py @@ -20,11 +20,7 @@ @pytest.mark.parametrize( "inputs", [ - ( - "ingredients-spellcheck", - SPELLCHECK_BATCH_JOB_CONFIG_PATH, - ["KEY"] - ), + ("ingredients-spellcheck", SPELLCHECK_BATCH_JOB_CONFIG_PATH, ["KEY"]), ], ) def test_batch_job_config_file(inputs): From be475bdb3f8eea8be1ff4d888f720b100e2437ee Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 4 Sep 2024 12:41:01 +0200 Subject: [PATCH 18/22] feat: :bug: Forgot a return --- robotoff/insights/importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robotoff/insights/importer.py b/robotoff/insights/importer.py index 6ca442e304..533f5a5e0b 100644 --- a/robotoff/insights/importer.py +++ b/robotoff/insights/importer.py @@ -1503,7 +1503,7 @@ def is_conflicting_insight( cls, candidate: 
ProductInsight, reference: ProductInsight ) -> bool: # Same language - candidate.value_tag == reference.value_tag + return candidate.value_tag == reference.value_tag @classmethod def _keep_prediction( From 762722f9fb3168abeea9173748d041fbafc6ac21 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 4 Sep 2024 12:44:19 +0200 Subject: [PATCH 19/22] style: :sparkles: Black on spellcheck script --- batch/spellcheck/main.py | 112 ++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 37 deletions(-) diff --git a/batch/spellcheck/main.py b/batch/spellcheck/main.py index 34c7b98c1d..0ae9adeff9 100644 --- a/batch/spellcheck/main.py +++ b/batch/spellcheck/main.py @@ -22,18 +22,53 @@ def parse() -> argparse.Namespace: - """Parse command line arguments. - """ + """Parse command line arguments.""" parser = argparse.ArgumentParser(description="Spellcheck module.") - parser.add_argument("--data_bucket", type=str, default="robotoff-spellcheck", help="Bucket name.") - parser.add_argument("--pre_data_suffix", type=str, default="data/preprocessed_data.parquet", help="Dataset suffix containing the data to be processed.") - parser.add_argument("--post_data_suffix", type=str, default="data/postprocessed_data.parquet", help="Dataset suffix containing the processed data.") - parser.add_argument("--model_path", default="openfoodfacts/spellcheck-mistral-7b", type=str, help="HF model path.") - parser.add_argument("--max_model_len", default=1024, type=int, help="Maximum model context length. A lower max context length reduces the memory footprint and accelerate the inference.") - parser.add_argument("--temperature", default=0, type=float, help="Sampling temperature.") - parser.add_argument("--max_tokens", default=1024, type=int, help="Maximum number of tokens to generate.") - parser.add_argument("--quantization", default="fp8", type=str, help="Quantization type.") - parser.add_argument("--dtype", default="auto", type=str, help="Model weights precision. Default corresponds to the modle config (float16 here)") + parser.add_argument( + "--data_bucket", type=str, default="robotoff-spellcheck", help="Bucket name." + ) + parser.add_argument( + "--pre_data_suffix", + type=str, + default="data/preprocessed_data.parquet", + help="Dataset suffix containing the data to be processed.", + ) + parser.add_argument( + "--post_data_suffix", + type=str, + default="data/postprocessed_data.parquet", + help="Dataset suffix containing the processed data.", + ) + parser.add_argument( + "--model_path", + default="openfoodfacts/spellcheck-mistral-7b", + type=str, + help="HF model path.", + ) + parser.add_argument( + "--max_model_len", + default=1024, + type=int, + help="Maximum model context length. A lower max context length reduces the memory footprint and accelerate the inference.", + ) + parser.add_argument( + "--temperature", default=0, type=float, help="Sampling temperature." + ) + parser.add_argument( + "--max_tokens", + default=1024, + type=int, + help="Maximum number of tokens to generate.", + ) + parser.add_argument( + "--quantization", default="fp8", type=str, help="Quantization type." + ) + parser.add_argument( + "--dtype", + default="auto", + type=str, + help="Model weights precision. Default corresponds to the modle config (float16 here)", + ) return parser.parse_args() @@ -43,7 +78,7 @@ def main(): Original lists of ingredients are stored in a gs bucket before being loaded then processed by the model. The corrected lists of ingredients are then stored back in gs. 
- We use vLLM to process the batch optimaly. The model is loaded from the Open Food Facts Hugging Face model repository. + We use vLLM to process the batch optimaly. The model is loaded from the Open Food Facts Hugging Face model repository. """ logger.info("Starting batch processing job.") args = parse() @@ -52,32 +87,35 @@ def main(): data = load_gcs(bucket_name=args.data_bucket, suffix=args.pre_data_suffix) logger.info(f"Feature in uploaded data: {data.columns}") if not all(feature in data.columns for feature in FEATURES_VALIDATION): - raise ValueError(f"Data should contain the following features: {FEATURES_VALIDATION}. Current features: {data.columns}") + raise ValueError( + f"Data should contain the following features: {FEATURES_VALIDATION}. Current features: {data.columns}" + ) instructions = [prepare_instruction(text) for text in data["text"]] llm = LLM( - model=args.model_path, - max_model_len=args.max_model_len, + model=args.model_path, + max_model_len=args.max_model_len, dtype=args.dtype, quantization=args.quantization, ) sampling_params = SamplingParams( - temperature=args.temperature, - max_tokens=args.max_tokens + temperature=args.temperature, max_tokens=args.max_tokens ) - logger.info(f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}") - data["correction"] = batch_inference(instructions, llm=llm, sampling_params=sampling_params) + logger.info( + f"Starting batch inference:\n {llm}.\n\nSampling parameters: {sampling_params}" + ) + data["correction"] = batch_inference( + instructions, llm=llm, sampling_params=sampling_params + ) logger.info(f"Uploading data to GCS: {args.data_bucket}/{args.post_data_suffix}") # Save DataFrame as Parquet to a temporary file - with tempfile.NamedTemporaryFile(delete=True, suffix='.parquet') as temp_file: + with tempfile.NamedTemporaryFile(delete=True, suffix=".parquet") as temp_file: data.to_parquet(temp_file.name) temp_file_name = temp_file.name upload_gcs( - temp_file_name, - bucket_name=args.data_bucket, - suffix=args.post_data_suffix + temp_file_name, bucket_name=args.data_bucket, suffix=args.post_data_suffix ) logger.info("Request Robotoff API batch import endpoint.") @@ -96,18 +134,14 @@ def prepare_instruction(text: str) -> str: str: Instruction. """ instruction = ( - "###Correct the list of ingredients:\n" - + text - + "\n\n###Correction:\n" + "###Correct the list of ingredients:\n" + text + "\n\n###Correction:\n" ) return instruction def batch_inference( - texts: List[str], - llm: LLM, - sampling_params: SamplingParams - ) -> List[str]: + texts: List[str], llm: LLM, sampling_params: SamplingParams +) -> List[str]: """Process batch of texts with vLLM. Args: @@ -118,7 +152,10 @@ def batch_inference( Returns: List[str]: Processed batch of texts """ - outputs = llm.generate(texts, sampling_params,) + outputs = llm.generate( + texts, + sampling_params, + ) corrections = [output.outputs[0].text for output in outputs] return corrections @@ -127,7 +164,7 @@ def load_gcs(bucket_name: str, suffix: str) -> pd.DataFrame: """Load data from Google Cloud Storage bucket. Args: - bucket_name (str): + bucket_name (str): suffix (str): Path inside the bucket Returns: @@ -156,13 +193,12 @@ def upload_gcs(file_path: str, bucket_name: str, suffix: str) -> None: def run_robotoff_endpoint_batch_import(): - """Run Robotoff api endpoint to import batch data into tables. 
-    """
+    """Run Robotoff API endpoint to import batch data into tables."""
     url = "https://robotoff.openfoodfacts.org/api/v1/batch/import"
     data = {"job_type": "ingredients_spellcheck"}
     headers = {
         "Authorization": f"Bearer {os.getenv('BATCH_JOB_KEY')}",
-        "Content-Type": "application/json"
+        "Content-Type": "application/json",
     }
     try:
         response = requests.post(
@@ -170,10 +206,12 @@ def run_robotoff_endpoint_batch_import():
             data=data,
             headers=headers,
         )
-        logger.info(f"Import batch Robotoff API endpoint succesfully requested: {response.text}")
+        logger.info(
+            f"Import batch Robotoff API endpoint successfully requested: {response.text}"
+        )
     except requests.exceptions.RequestException as e:
         raise SystemExit(e)
-    
+
 
 if __name__ == "__main__":
     main()

From 10791e7fa59db8f46d18d3394aac790b1f0791d9 Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Wed, 4 Sep 2024 18:35:58 +0200
Subject: [PATCH 20/22] docs: :memo: Add batch/import api endpoint to doc

---
 doc/references/api.yml | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/doc/references/api.yml b/doc/references/api.yml
index c01664cc2a..74a7f5c920 100644
--- a/doc/references/api.yml
+++ b/doc/references/api.yml
@@ -1105,6 +1105,28 @@ paths:
         "400":
           description: "An HTTP 400 is returned if the provided parameters are invalid"
 
+  /batch/import:
+    post:
+      tags:
+        - Batch Job
+      summary: Import batch-processed data into the Robotoff database.
+      security:
+        - batch_job_key: []
+      description:
+        Trigger import of the batch-processed data into the Robotoff database. A `BATCH_JOB_KEY` is expected in the authorization header.
+        This endpoint is mainly used by the batch job once the job is finished.
+      parameters:
+        - $ref: "#/components/parameters/job_type"
+      responses:
+        "200":
+          description: Data successfully imported.
+          content:
+            application/json:
+              status:
+                type: string
+                description: Request successful. Importing processed data.
+        "400":
+          description: "An HTTP 400 is returned if the authentication key is invalid or if the job_type is not supported."
 
 components:
   schemas:
@@ -1391,6 +1413,21 @@ components:
       schema:
         type: integer
        example: 5410041040807
+    job_type:
+      name: job_type
+      in: query
+      required: true
+      description: The type of batch job launched.
+      schema:
+        type: string
+        enum:
+          - ingredients_spellcheck
+
+  securitySchemes:
+    batch_job_key:
+      type: http
+      scheme: bearer
+
 tags:
   - name: Questions
   - name: Insights
@@ -1398,4 +1435,4 @@ tags:
       An insight is a fact about a product that has been either extracted or inferred from the product pictures, characteristics,...
       If the insight is correct, the Openfoodfacts DB can be updated accordingly.
 
-      Current insight types and their description can be found in [robotoff/insights/dataclass.py](https://github.com/openfoodfacts/robotoff/blob/main/robotoff/insights/dataclass.py).
+      Current insight types and their description can be found in [robotoff/insights/dataclass.py](https://github.com/openfoodfacts/robotoff/blob/main/robotoff/insights/dataclass.py). 
\ No newline at end of file

From 400818b9de7cca87b353a960a1c6b19996ddc73e Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Wed, 4 Sep 2024 18:41:18 +0200
Subject: [PATCH 21/22] docs: :memo: Because perfection

---
 doc/references/api.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/references/api.yml b/doc/references/api.yml
index 74a7f5c920..d339bd5311 100644
--- a/doc/references/api.yml
+++ b/doc/references/api.yml
@@ -1435,4 +1435,4 @@ tags:
       An insight is a fact about a product that has been either extracted or inferred from the product pictures, characteristics,...
       If the insight is correct, the Openfoodfacts DB can be updated accordingly.
 
-      Current insight types and their description can be found in [robotoff/insights/dataclass.py](https://github.com/openfoodfacts/robotoff/blob/main/robotoff/insights/dataclass.py). 
\ No newline at end of file
+      Current insight types and their description can be found in [robotoff/insights/dataclass.py](https://github.com/openfoodfacts/robotoff/blob/main/robotoff/insights/dataclass.py).

From 4ebfd87b5f5bac9299f4f241de295ea3c69aec8b Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Wed, 4 Sep 2024 18:48:31 +0200
Subject: [PATCH 22/22] fix: :art: Change predictor version to also track...
 the predictor version

We concluded that PREDICTOR_VERSION will be used to track batch jobs and allow
new data predictions to be imported. In the future, we'll detect already
processed data differently, for example during the extraction stage, before
the batch job runs.

---
 robotoff/batch/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robotoff/batch/__init__.py b/robotoff/batch/__init__.py
index 937e538d1b..738a994605 100644
--- a/robotoff/batch/__init__.py
+++ b/robotoff/batch/__init__.py
@@ -80,7 +80,7 @@ def import_spellcheck_batch_predictions() -> None:
     PREDICTION_TYPE = PredictionType.ingredient_spellcheck
     # We increment to allow import_insights to create a new version
     PREDICTOR_VERSION = (
-        "batch-job" + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+        "llm-v1" + "-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    )
     PREDICTOR = "fine-tuned-mistral-7b"
     SERVER_TYPE = ServerType.off
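For context on the last commit: the new `PREDICTOR_VERSION` combines a model identifier with a per-run timestamp, so each batch run is imported as a new predictor version. A minimal sketch of that scheme follows; `build_predictor_version` is a hypothetical helper for illustration only, not a function in the Robotoff codebase.

```python
import datetime


def build_predictor_version(prefix: str = "llm-v1") -> str:
    """Mirror the PREDICTOR_VERSION expression added in robotoff/batch/__init__.py.

    Each batch run embeds its own timestamp, so import_insights sees a new
    predictor version and imports the freshly generated spellcheck predictions.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    return f"{prefix}-{timestamp}"


if __name__ == "__main__":
    print(build_predictor_version())  # e.g. llm-v1-20240904184831
```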