Skip to content

Commit

Permalink
fix: fix mypy and flake8 issues
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Sep 10, 2024
1 parent 9800591 commit ac7907d
Show file tree
Hide file tree
Showing 13 changed files with 996 additions and 1,028 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ max-line-length = 88
exclude = .git,__pycache__,build,dist,*_pb2.py,.venv
per-file-ignores =
robotoff/cli/main.py:B008
max-doc-length = 79
max-doc-length = 88
13 changes: 6 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@ DOCKER_COMPOSE_TEST=COMPOSE_PROJECT_NAME=robotoff_test COMMON_NET_NAME=po_test d
ML_OBJECT_DETECTION_MODELS := tf-universal-logo-detector tf-nutrition-table tf-nutriscore

# Spellcheck
IMAGE_NAME = spellcheck-batch-vllm
TAG = latest
GCLOUD_LOCATION = europe-west9-docker.pkg.dev
REGISTRY = ${GCLOUD_LOCATION}/robotoff/gcf-artifacts
SPELLCHECK_IMAGE_NAME = spellcheck-batch-vllm
SPELLCHECK_TAG = latest
SPELLCHECK_REGISTRY = europe-west9-docker.pkg.dev/robotoff/gcf-artifacts

.DEFAULT_GOAL := dev
# avoid target corresponding to file names, to depends on them
Expand Down Expand Up @@ -300,12 +299,12 @@ create-po-default-network:

# Spellcheck
build-spellcheck:
docker build -f batch/spellcheck/Dockerfile -t $(IMAGE_NAME):$(TAG) batch/spellcheck
docker build -f batch/spellcheck/Dockerfile -t $(SPELLCHECK_IMAGE_NAME):$(SPELLCHECK_TAG) batch/spellcheck

# Push the image to the registry
push-spellcheck:
docker tag $(IMAGE_NAME):$(TAG) $(REGISTRY)/$(IMAGE_NAME):$(TAG)
docker push $(REGISTRY)/$(IMAGE_NAME):$(TAG)
docker tag $(SPELLCHECK_IMAGE_NAME):$(SPELLCHECK_TAG) $(SPELLCHECK_REGISTRY)/$(SPELLCHECK_IMAGE_NAME):$(SPELLCHECK_TAG)
docker push $(SPELLCHECK_REGISTRY)/$(SPELLCHECK_IMAGE_NAME):$(SPELLCHECK_TAG)

# Build and push in one command
deploy-spellcheck:
Expand Down
3 changes: 0 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ x-robotoff-base-volumes:
- ./datasets:/opt/robotoff/datasets
- ./models:/opt/robotoff/models
- robotoff_tmp:/tmp
- ./credentials:/opt/credentials

x-robotoff-base:
&robotoff-base
Expand Down Expand Up @@ -55,8 +54,6 @@ x-robotoff-base-env:
IMAGE_MODERATION_SERVICE_URL:
CROP_ALLOWED_DOMAINS:
NUM_RQ_WORKERS: 4 # Update worker service command accordingly if you change this settings
GOOGLE_APPLICATION_CREDENTIALS: /opt/robotoff/credentials/google/credentials.json
GOOGLE_CLOUD_PROJECT: "robotoff"
GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
BATCH_JOB_KEY: # Secure Batch job import with a token key

Expand Down
1 change: 1 addition & 0 deletions docker/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ x-robotoff-dev: &robotoff-dev
- ./pyproject.toml:/opt/robotoff/pyproject.toml
- ./poetry.toml:/opt/robotoff/poetry.toml
- ./poetry.lock:/opt/robotoff/poetry.lock
- ./.flake8:/opt/robotoff/.flake8
# make tests available
- ./tests:/opt/robotoff/tests
- ./.cov:/opt/robotoff/.cov
Expand Down
1,922 changes: 951 additions & 971 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ rq-dashboard = "~0.6.1"

[tool.poetry.group.dev.dependencies]
types-pytz = "^2024.1.0.20240417"
types-pyyaml = "^6.0.12.20240808"

[tool.poetry.scripts]
robotoff-cli = 'robotoff.cli.main:main'
2 changes: 0 additions & 2 deletions robotoff/app/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ def validate_token(token: str, ref_token_name: str) -> bool:
"""Validate token.
:param token: Authentication token
:type token: str
:param api_token_name: Validation token, stored in environment variables.
:type api_token_name: str
"""
api_token = os.getenv(ref_token_name.upper())
if not api_token:
Expand Down
7 changes: 1 addition & 6 deletions robotoff/batch/buckets.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
import pandas as pd
from google.cloud import storage
from google.cloud import storage # type: ignore


def upload_file_to_gcs(file_path: str, bucket_name: str, suffix: str) -> None:
"""Upload file to Google Storage Bucket.
:param file_path: File where the data is stored
:type file_path: str
:param bucket_name: Bucket name in GCP storage
:type bucket_name: str
:param suffix: Path inside the bucket
:type suffix: str
"""
client = storage.Client()
bucket = client.get_bucket(bucket_name)
Expand All @@ -23,9 +20,7 @@ def fetch_dataframe_from_gcs(bucket_name: str, suffix: str) -> pd.DataFrame:
:param bucket_name: Bucket name in GCP storage
:type bucket_name: str
:param suffix: Path inside the bucket. Should lead to a parquet file.
:type suffix: str
:return: Dataframe
"""
client = storage.Client()
Expand Down
3 changes: 2 additions & 1 deletion robotoff/batch/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def extract_from_dataset(
:type query_file_path: Path
:param output_file_path: Path to save the extracted data.
:type output_file_path: str
:param dataset_path: Compressed jsonl database, defaults to settings.JSONL_DATASET_PATH
:param dataset_path: Compressed jsonl database, defaults to
settings.JSONL_DATASET_PATH
:type dataset_path: Path, optional
"""
if not dataset_path.exists():
Expand Down
49 changes: 21 additions & 28 deletions robotoff/batch/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import yaml
from google.cloud import batch_v1
from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, Field

from robotoff import settings
from robotoff.utils import get_logger
Expand All @@ -16,27 +16,22 @@


def check_google_credentials() -> None:
"""Create google credentials from variable if doesn't exist"""
credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not credentials_path:
raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is not set")
if not os.path.exists(credentials_path):
"""Create google credentials from variable if doesn't exist."""
credentials_path = settings.PROJECT_DIR / "credentials/google/credentials.json"
if not credentials_path.is_file():
logger.info(
"No google credentials found at %s. Creating credentials from GOOGLE_CREDENTIALS.",
"No google credentials found at %s. Creating credentials from GOOGLE_CREDENTIALS.",
credentials_path,
)
os.makedirs(os.path.dirname(credentials_path), exist_ok=True)
credentials = json.loads(os.getenv("GOOGLE_CREDENTIALS"))
with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS"), "w") as f:
credentials_path.parent.mkdir(parents=True, exist_ok=True)
credentials = json.loads(os.environ["GOOGLE_CREDENTIALS"])
with open(credentials_path, "w") as f:
json.dump(credentials, f, indent=4)


class GoogleBatchJobConfig(BaseModel):
"""Batch job configuration class."""

# By default, extra fields are just ignored. We raise an error in case of extra fields.
model_config: ConfigDict = {"extra": "forbid"}

job_name: str = Field(
description="The name of the job. It needs to be unique amongst existing batch job names.",
)
Expand Down Expand Up @@ -118,14 +113,13 @@ def init(
config_path: Path,
env_names: Optional[Iterable[str]] = None,
) -> "GoogleBatchJobConfig":
"""Initialize the class with the configuration file corresponding to the job type.
"""Initialize the class with the configuration file corresponding to the job
type.
:param job_name: Name of the job.
:type job_name: str
:param config_path: Path to the configuration file.
:type config_path: Path
:param env_variables: List of environment variables to add to the job, defaults to None.
:type env_variables: Optional[Iterable[str]], optional
:param env_variables: List of environment variables to add to the job, defaults
to None.
"""
# Batch job name should respect a specific pattern, or returns an error
pattern = "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$"
Expand All @@ -143,7 +137,7 @@ def init(
if not env_names:
env_variables = {}
else:
env_variables = {var_name: os.getenv(var_name) for var_name in env_names}
env_variables = {var_name: os.environ[var_name] for var_name in env_names}

# Load config file from job_type
with open(config_path, "r") as f:
Expand All @@ -156,12 +150,10 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job:
Sources:
* https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/batch/create
* https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types
* https://cloud.google.com/python/docs/reference/batch/latest/google.cloud.batch_v1.types # noqa
:param google_batch_launch_config: Config to run a job on Google Batch.
:type google_batch_launch_config: GoogleBatchLaunchConfig
:param batch_job_config: Config to run a specific job on Google Batch.
:type batch_job_config: BatchJobConfig
:return: Batch job information.
Returns:
Expand All @@ -173,7 +165,7 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job:
runnable = batch_v1.Runnable()
runnable.container = batch_v1.Runnable.Container()
runnable.container.image_uri = batch_job_config.container_image_uri
runnable.container.entrypoint = batch_job_config.entrypoint
runnable.container.entrypoint = batch_job_config.entrypoint # type: ignore
runnable.container.commands = batch_job_config.commands

# Jobs can be divided into tasks. In this case, we have only one task.
Expand All @@ -189,18 +181,19 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job:
resources = batch_v1.ComputeResource()
resources.cpu_milli = batch_job_config.cpu_milli
resources.memory_mib = batch_job_config.memory_mib
resources.boot_disk_mib = batch_job_config.boot_disk_mib
resources.boot_disk_mib = batch_job_config.boot_disk_mib # type: ignore
task.compute_resource = resources

task.max_retry_count = batch_job_config.max_retry_count
task.max_run_duration = batch_job_config.max_run_duration
task.max_run_duration = batch_job_config.max_run_duration # type: ignore

# Tasks are grouped inside a job using TaskGroups.
group = batch_v1.TaskGroup()
group.task_count = batch_job_config.task_count
group.task_count = batch_job_config.task_count # type: ignore
group.task_spec = task

# Policies are used to define on what kind of virtual machines the tasks will run on.
# Policies are used to define on what kind of virtual machines the tasks will run
# on.
policy = batch_v1.AllocationPolicy.InstancePolicy()
policy.machine_type = batch_job_config.machine_type
instances = batch_v1.AllocationPolicy.InstancePolicyOrTemplate()
Expand All @@ -218,7 +211,7 @@ def launch_job(batch_job_config: GoogleBatchJobConfig) -> batch_v1.Job:
job.allocation_policy = allocation_policy
# We use Cloud Logging as it's an out of the box available option
job.logs_policy = batch_v1.LogsPolicy()
job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING
job.logs_policy.destination = batch_v1.LogsPolicy.Destination.CLOUD_LOGGING # type: ignore

create_request = batch_v1.CreateJobRequest()
create_request.job = job
Expand Down
3 changes: 1 addition & 2 deletions robotoff/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,8 +1015,7 @@ def launch_batch_job(
)

get_logger()
job_type = BatchJobType[job_type]
_launch_batch_job(job_type)
_launch_batch_job(BatchJobType[job_type])


def main() -> None:
Expand Down
15 changes: 9 additions & 6 deletions robotoff/insights/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1509,13 +1509,16 @@ def is_conflicting_insight(
def _keep_prediction(
cls, prediction: Prediction, product: Optional[Product]
) -> bool:
conditions = [
return (
# Spellcheck didn't correct
prediction.data["original"] != prediction.data["correction"],
# Modification of the original ingredients between two dataset dumps (24-hour period)
product is None or prediction.data["original"] != product.ingredients_text,
]
return all(conditions)
prediction.data["original"] != prediction.data["correction"]
# Modification of the original ingredients between two dataset dumps
# (24-hour period)
and (
product is None
or prediction.data["original"] != product.ingredients_text
)
)


class PackagingElementTaxonomyException(Exception):
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
],
)
def test_batch_job_config_file(inputs):
"Test indirectly the batch job config file by validating with the Pydantic class model."
"""Test indirectly the batch job config file by validating with the Pydantic class
model."""
job_name, config_path, env_names = inputs
GoogleBatchJobConfig.init(
job_name=job_name,
Expand Down

0 comments on commit ac7907d

Please sign in to comment.