Skip to content

Commit

Permalink
fix: fix mypy and flake8 issues
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Sep 10, 2024
1 parent 9800591 commit ac7907d
Show file tree
Hide file tree
Showing 13 changed files with 996 additions and 1,028 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ max-line-length = 88
exclude = .git,__pycache__,build,dist,*_pb2.py,.venv
per-file-ignores =
robotoff/cli/main.py:B008
max-doc-length = 79
max-doc-length = 88
13 changes: 6 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@ DOCKER_COMPOSE_TEST=COMPOSE_PROJECT_NAME=robotoff_test COMMON_NET_NAME=po_test d
ML_OBJECT_DETECTION_MODELS := tf-universal-logo-detector tf-nutrition-table tf-nutriscore

# Spellcheck
IMAGE_NAME = spellcheck-batch-vllm
TAG = latest
GCLOUD_LOCATION = europe-west9-docker.pkg.dev
REGISTRY = ${GCLOUD_LOCATION}/robotoff/gcf-artifacts
SPELLCHECK_IMAGE_NAME = spellcheck-batch-vllm
SPELLCHECK_TAG = latest
SPELLCHECK_REGISTRY = europe-west9-docker.pkg.dev/robotoff/gcf-artifacts

.DEFAULT_GOAL := dev
# avoid target corresponding to file names, to depends on them
Expand Down Expand Up @@ -300,12 +299,12 @@ create-po-default-network:

# Spellcheck
build-spellcheck:
docker build -f batch/spellcheck/Dockerfile -t $(IMAGE_NAME):$(TAG) batch/spellcheck
docker build -f batch/spellcheck/Dockerfile -t $(SPELLCHECK_IMAGE_NAME):$(SPELLCHECK_TAG) batch/spellcheck

# Push the image to the registry
push-spellcheck:
docker tag $(IMAGE_NAME):$(TAG) $(REGISTRY)/$(IMAGE_NAME):$(TAG)
docker push $(REGISTRY)/$(IMAGE_NAME):$(TAG)
docker tag $(SPELLCHECK_IMAGE_NAME):$(SPELLCHECK_TAG) $(SPELLCHECK_REGISTRY)/$(SPELLCHECK_IMAGE_NAME):$(SPELLCHECK_TAG)
docker push $(SPELLCHECK_REGISTRY)/$(SPELLCHECK_IMAGE_NAME):$(SPELLCHECK_TAG)

# Build and push in one command
deploy-spellcheck:
Expand Down
3 changes: 0 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ x-robotoff-base-volumes:
- ./datasets:/opt/robotoff/datasets
- ./models:/opt/robotoff/models
- robotoff_tmp:/tmp
- ./credentials:/opt/credentials

x-robotoff-base:
&robotoff-base
Expand Down Expand Up @@ -55,8 +54,6 @@ x-robotoff-base-env:
IMAGE_MODERATION_SERVICE_URL:
CROP_ALLOWED_DOMAINS:
NUM_RQ_WORKERS: 4 # Update worker service command accordingly if you change this settings
GOOGLE_APPLICATION_CREDENTIALS: /opt/robotoff/credentials/google/credentials.json
GOOGLE_CLOUD_PROJECT: "robotoff"
GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
BATCH_JOB_KEY: # Secure Batch job import with a token key

Expand Down
1 change: 1 addition & 0 deletions docker/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ x-robotoff-dev: &robotoff-dev
- ./pyproject.toml:/opt/robotoff/pyproject.toml
- ./poetry.toml:/opt/robotoff/poetry.toml
- ./poetry.lock:/opt/robotoff/poetry.lock
- ./.flake8:/opt/robotoff/.flake8
# make tests available
- ./tests:/opt/robotoff/tests
- ./.cov:/opt/robotoff/.cov
Expand Down
1,922 changes: 951 additions & 971 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ rq-dashboard = "~0.6.1"

[tool.poetry.group.dev.dependencies]
types-pytz = "^2024.1.0.20240417"
types-pyyaml = "^6.0.12.20240808"

[tool.poetry.scripts]
robotoff-cli = 'robotoff.cli.main:main'
2 changes: 0 additions & 2 deletions robotoff/app/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ def validate_token(token: str, ref_token_name: str) -> bool:
"""Validate token.
:param token: Authentication token
:type token: str
:param api_token_name: Validation token, stored in environment variables.
:type api_token_name: str
"""
api_token = os.getenv(ref_token_name.upper())
if not api_token:
Expand Down
7 changes: 1 addition & 6 deletions robotoff/batch/buckets.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
import pandas as pd
from google.cloud import storage
from google.cloud import storage # type: ignore


def upload_file_to_gcs(file_path: str, bucket_name: str, suffix: str) -> None:
"""Upload file to Google Storage Bucket.
:param file_path: File where the data is stored
:type file_path: str
:param bucket_name: Bucket name in GCP storage
:type bucket_name: str
:param suffix: Path inside the bucket
:type suffix: str
"""
client = storage.Client()
bucket = client.get_bucket(bucket_name)
Expand All @@ -23,9 +20,7 @@ def fetch_dataframe_from_gcs(bucket_name: str, suffix: str) -> pd.DataFrame:
:param bucket_name: Bucket name in GCP storage
:type bucket_name: str
:param suffix: Path inside the bucket. Should lead to a parquet file.
:type suffix: str
:return: Dataframe
"""
client = storage.Client()
Expand Down
3 changes: 2 additions & 1 deletion robotoff/batch/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def extract_from_dataset(
:type query_file_path: Path
:param output_file_path: Path to save the extracted data.
:type output_file_path: str
:param dataset_path: Compressed jsonl database, defaults to settings.JSONL_DATASET_PATH
:param dataset_path: Compressed jsonl database, defaults to
settings.JSONL_DATASET_PATH
:type dataset_path: Path, optional
"""
if not dataset_path.exists():
Expand Down
49 changes: 21 additions & 28 deletions robotoff/batch/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import yaml
from google.cloud import batch_v1
from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, Field

from robotoff import settings
from robotoff.utils import get_logger
Expand All @@ -16,27 +16,22 @@


def check_google_credentials() -> None:
"""Create google credentials from variable if doesn't exist"""
credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not credentials_path:
raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is not set")
if not os.path.exists(credentials_path):
"""Create google credentials from variable if doesn't exist."""
credentials_path = settings.PROJECT_DIR / "credentials/google/credentials.json"
if not credentials_path.is_file():
logger.info(
"No google credentials found at %s. Creating credentials from GOOGLE_CREDENTIALS.",
"No google credentials found at %s. Creating credentials from GOOGLE_CREDENTIALS.",
credentials_path,
)
os.makedirs(os.path.dirname(credentials_path), exist_ok=True)
credentials = json.loads(os.getenv("GOOGLE_CREDENTIALS"))
with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS"), "w") as f:
credentials_path.parent.mkdir(parents=True, exist_ok=True)
credentials = json.loads(os.environ["GOOGLE_CREDENTIALS"])
with open(credentials_path, "w") as f:
json.dump(credentials, f, indent=4)


class GoogleBatchJobConfig(BaseModel):
"""Batch job configuration class."""

# By default, extra fields are just ignored. We raise an error in case of extra fields.
model_config: ConfigDict = {"extra": "forbid"}

job_name: str = Field(
description="The name of the job. It needs to be unique amongst existing batch job names.",
)
Expand Down Expand Up @@ -118,14 +113,13 @@ def init(
config_path: Path,
env_names: Optional[Iterable[str]] = None,
) -> "GoogleBatchJobConfig":
"""Initialize the class with the configuration file corresponding to the job type.
"""Initialize the class with the configuration file corresponding to the job
type.
:param job_name: Name of the job.
:type job_name: str
:param config_path: Path to the configuration file.
:type config_path: Path
:param env_variables: List of environment variables to add to the job, defaults to None.
:type env_variables: Optional[Iterable[str]], optional
:param env_variables: List of environment variables to add to the job, defaults
to None.
"""
# Batch job name should respect a specific pattern, or returns an error
pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$"
Expand All @@ -143,7 +137,7 @@ def init(
if not env_names:
env_variables = {}
else:
env_variables = {var_name: os.getenv(var_name) for var_name in env_names}
env_variables = {var_name: os.environ[var_name] for var_name in env_names}

# Load config file from job_type
with open(config_path, "r") as f:
Expand All @@ -156,12 +150,10 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job:
Sources:
* https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create
* https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types
* https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types # noqa
:param google_batch_launch_config: Config to run a job on Google Batch.
:type google_batch_launch_config: GoogleBatchLaunchConfig
:param batch_job_config: Config to run a specific job on Google Batch.
:type batch_job_config: BatchJobConfig
:return: Batch job information.
Returns:
Expand All @@ -173,7 +165,7 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job:
runnable = batch_v1.Runnable()
runnable.container = batch_v1.Runnable.Container()
runnable.container.image_uri = batch_job_config.container_image_uri
runnable.container.entrypoint = batch_job_config.entrypoint
runnable.container.entrypoint = batch_job_config.entrypoint # type: ignore
runnable.container.commands = batch_job_config.commands

# Jobs can be divided into tasks. In this case, we have only one task.
Expand All @@ -189,18 +181,19 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job:
resources = batch_v1.ComputeResource()
resources.cpu_milli = batch_job_config.cpu_milli
resources.memory_mib = batch_job_config.memory_mib
resources.boot_disk_mib = batch_job_config.boot_disk_mib
resources.boot_disk_mib = batch_job_config.boot_disk_mib # type: ignore
task.compute_resource = resources

task.max_retry_count = batch_job_config.max_retry_count
task.max_run_duration = batch_job_config.max_run_duration
task.max_run_duration = batch_job_config.max_run_duration # type: ignore

# Tasks are grouped inside a job using TaskGroups.
group = batch_v1.TaskGroup()
group.task_count = batch_job_config.task_count
group.task_count = batch_job_config.task_count # type: ignore
group.task_spec = task

# Policies are used to define on what kind of virtual machines the tasks will run on.
# Policies are used to define on what kind of virtual machines the tasks will run
# on.
policy = batch_v1.AllocationPolicy.InstancePolicy()
policy.machine_type = batch_job_config.machine_type
instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate()
Expand All @@ -218,7 +211,7 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job:
job.allocation_policy = allocation_policy
# We use Cloud Logging as it's an out of the box available option
job.logs_policy = batch_v1.LogsPolicy()
job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING
job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING # type: ignore

create_request = batch_v1.CreateJobRequest()
create_request.job = job
Expand Down
3 changes: 1 addition & 2 deletions robotoff/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,8 +1015,7 @@ def launch_batch_job(
)

get_logger()
job_type = BatchJobType[job_type]
_launch_batch_job(job_type)
_launch_batch_job(BatchJobType[job_type])


def main() -> None:
Expand Down
15 changes: 9 additions & 6 deletions robotoff/insights/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1509,13 +1509,16 @@ def is_conflicting_insight(
def _keep_prediction(
cls, prediction: Prediction, product: Optional[Product]
) -> bool:
conditions = [
return (
# Spellcheck didn't correct
prediction.data["original"] != prediction.data["correction"],
# Modification of the original ingredients between two dataset dumps (24-hour period)
product is None or prediction.data["original"] != product.ingredients_text,
]
return all(conditions)
prediction.data["original"] != prediction.data["correction"]
# Modification of the original ingredients between two dataset dumps
# (24-hour period)
and (
product is None
or prediction.data["original"] != product.ingredients_text
)
)


class PackagingElementTaxonomyException(Exception):
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
],
)
def test_batch_job_config_file(inputs):
"Test indirectly the batch job config file by validating with the Pydantic class model."
"""Test indirectly the batch job config file by validating with the Pydantic class
model."""
job_name, config_path, env_names = inputs
GoogleBatchJobConfig.init(
job_name=job_name,
Expand Down

0 comments on commit ac7907d

Please sign in to comment.