Skip to content

Commit

Permalink
Stronger validation of responses from MyTardis API (#483)
Browse files Browse the repository at this point in the history
* Add a type for storing/validating an MD5 checksum, and associated tests

* Introduce a base dataclass to be used for all the validation dataclasses representing MyTardis resources

* Move hexadecimal validator to a new module specifically for validators

* Extract two validation functions to the validators module

* Rename validators module to avoid possible name clash with validators package

* Delete unused regex

* Use the ISO datetime parser from dateutil, instead of a hand-rolled regex

* Extract MD5 checksum validation to a standalone function

* Convert MD5Sum to an Annotated str to reduce verbosity

* Move the definitions of some helper Annotated types to mytardis_client so they can be used in defining dataclasses there without introducing cyclic dependencies

* Remove seemingly unnecessary boilerplate

* Fix broken imports

* Add dataclasses for validating/storing response data from the MyTardis API

* Store the output data type associated with GET requests to each endpoint, and use this in the rest client GET calls

* Use the more strongly typed GET interface in the overseer

* Use the more strongly-typed GET interface in the Overseer

* Introduce a MyTardisResource protocol to achieve structural subtyping in the return types from the MyTardis client

* Simplify some type-checking logic

* Delete commented code

* Update the poetry lock file

* Add package with typing stubs for the dateutil package

* Fix broken tests

* Avoid overriding default pagination from MyTardis if no pagination specified

* Add missing f-string prefix

* Fix broken Overseer tests

* Rename modules to align terminology (validation instead of validate/validators)
  • Loading branch information
andrew-uoa authored Aug 15, 2024
1 parent c75c502 commit 20b7c74
Show file tree
Hide file tree
Showing 30 changed files with 609 additions and 292 deletions.
33 changes: 31 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ typer = "^0.12.0"
rocrate = "^0.10.0"
typing-extensions = "^4.12.2"
types-pyyaml = "^6.0.12.20240311"
typeguard = "^4.3.0"

[tool.poetry.group.dev.dependencies]
wily = "^1.25.0"
Expand Down Expand Up @@ -53,6 +54,7 @@ pyfakefs = "^5.3.1"
black = "^24.4.0"
isort = "^5.13.2"
deptry = "^0.17.0"
types-python-dateutil = "^2.9.0.20240316"

[tool.poetry.scripts]
ids = 'src.ids:app'
Expand Down
1 change: 0 additions & 1 deletion src/blueprints/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# pylint: disable=missing-module-docstring, R0801

from src.blueprints.common_models import GroupACL, Parameter, ParameterSet, UserACL
from src.blueprints.custom_data_types import ISODateTime, MTUrl, Username
from src.blueprints.datafile import (
BaseDatafile,
Datafile,
Expand Down
48 changes: 0 additions & 48 deletions src/blueprints/custom_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,11 @@
from typing import Annotated, Any

from pydantic import AfterValidator, PlainSerializer, WithJsonSchema
from validators import url

# Pattern for a valid username: 2-4 lowercase letters followed by 3 digits
# (e.g. "abc123"), anchored to the full string.
user_regex = re.compile(
    r"^[a-z]{2,4}[0-9]{3}$"  # Define as a constant in case of future change
)

# Full ISO 8601 date/time: YYYY-MM-DDTHH:MM:SS with optional fractional
# seconds and optional timezone ("Z" or +/-HH:MM). Anchored to the whole string.
iso_time_regex = re.compile(
    r"^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]+)?(Z|[+-](?:2[0-3]|[01][0-9]):[0-5][0-9])?$"  # pylint: disable=line-too-long
)
# ISO 8601 date-only prefix (YYYY-MM-DD); note: anchored at the start only,
# so it matches strings that merely begin with a valid date.
iso_date_regex = re.compile(
    r"^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])"
)


def validate_username(value: Any) -> str:
"""Defines a validated username, in other words, ensure that the username meets a standardised
Expand All @@ -42,43 +34,3 @@ def validate_username(value: Any) -> str:
PlainSerializer(lambda x: str(x), return_type=str),
WithJsonSchema({"type": "string"}, mode="serialization"),
]


def validate_isodatetime(value: Any) -> str:
    """Validate that ``value`` is an ISO 8601 formatted date/time string.

    Args:
        value: Candidate value; must be a ``str`` fully matching
            ``iso_time_regex``.

    Returns:
        The validated date/time string, unchanged.

    Raises:
        TypeError: If ``value`` is not a string.
        ValueError: If the string is not ISO 8601 formatted.
    """
    if not isinstance(value, str):
        raise TypeError(f'Unexpected type for ISO date/time stamp: "{type(value)}"')
    if iso_time_regex.fullmatch(value):
        # fullmatch guarantees the entire string is a valid timestamp, so the
        # input can be returned as-is (group(0) of a fullmatch == value).
        return value
    raise ValueError(
        f'Passed string value "{value}" is not an ISO 8601 formatted '
        "date/time string. Format should follow "
        "YYYY-MM-DDTHH:MM:SS.SSSSSS+HH:MM convention"
    )


# Pydantic-validated string type for ISO 8601 date/time stamps: validated by
# validate_isodatetime after standard str validation, serialized as a plain
# string, and exposed as "string" in the generated JSON schema.
ISODateTime = Annotated[
    str,
    AfterValidator(validate_isodatetime),
    PlainSerializer(lambda x: str(x), return_type=str),
    WithJsonSchema({"type": "string"}, mode="serialization"),
]


def validate_url(value: Any) -> str:
    """Validate that ``value`` is a well-formed URL string.

    Custom validator for URLs since the default pydantic ones are not
    compatible with urllib.

    Args:
        value: Candidate value; must be a ``str`` accepted by ``validators.url``.

    Returns:
        The validated URL string, unchanged.

    Raises:
        TypeError: If ``value`` is not a string.
        ValueError: If the string is not a valid URL.
    """
    if not isinstance(value, str):
        raise TypeError(f'Unexpected type for URL: "{type(value)}"')
    if url(value):
        return value
    # Fix: the original message was missing a space before the opening quote
    # ('value"{value}"').
    raise ValueError(f'Passed string value "{value}" is not a valid URL')


# Pydantic-validated string type for MyTardis URLs: validated by validate_url
# after standard str validation, serialized as a plain string, and exposed as
# "string" in the generated JSON schema.
MTUrl = Annotated[
    str,
    AfterValidator(validate_url),
    PlainSerializer(lambda x: str(x), return_type=str),
    WithJsonSchema({"type": "string"}, mode="serialization"),
]
3 changes: 1 addition & 2 deletions src/blueprints/datafile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
from pydantic import BaseModel, Field, field_serializer

from src.blueprints.common_models import GroupACL, ParameterSet, UserACL
from src.blueprints.custom_data_types import MTUrl
from src.mytardis_client.common_types import DataStatus
from src.mytardis_client.common_types import DataStatus, MTUrl
from src.mytardis_client.endpoints import URI


Expand Down
8 changes: 6 additions & 2 deletions src/blueprints/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@
from pydantic import BaseModel, Field

from src.blueprints.common_models import GroupACL, ParameterSet, UserACL
from src.blueprints.custom_data_types import ISODateTime, MTUrl
from src.mytardis_client.common_types import DataClassification, DataStatus
from src.mytardis_client.common_types import (
DataClassification,
DataStatus,
ISODateTime,
MTUrl,
)
from src.mytardis_client.endpoints import URI


Expand Down
8 changes: 6 additions & 2 deletions src/blueprints/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@
from pydantic import BaseModel, Field

from src.blueprints.common_models import GroupACL, ParameterSet, UserACL
from src.blueprints.custom_data_types import ISODateTime, MTUrl
from src.mytardis_client.common_types import DataClassification, DataStatus
from src.mytardis_client.common_types import (
DataClassification,
DataStatus,
ISODateTime,
MTUrl,
)
from src.mytardis_client.endpoints import URI


Expand Down
9 changes: 7 additions & 2 deletions src/blueprints/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,13 @@
from pydantic import BaseModel, Field

from src.blueprints.common_models import GroupACL, ParameterSet, UserACL
from src.blueprints.custom_data_types import ISODateTime, MTUrl, Username
from src.mytardis_client.common_types import DataClassification, DataStatus
from src.blueprints.custom_data_types import Username
from src.mytardis_client.common_types import (
DataClassification,
DataStatus,
ISODateTime,
MTUrl,
)
from src.mytardis_client.endpoints import URI


Expand Down
15 changes: 8 additions & 7 deletions src/cli/cmd_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
from datetime import datetime
from pathlib import Path
from typing import Annotated, Any, Optional
from typing import Annotated, Optional

import typer

Expand All @@ -19,27 +19,28 @@
)
from src.config.config import ConfigFromEnv
from src.inspector.inspector import Inspector
from src.mytardis_client.response_data import IngestedDatafile, Replica
from src.profiles.profile_register import load_profile
from src.utils import log_utils
from src.utils.timing import Timer

logger = logging.getLogger(__name__)


def _get_verified_replica(queried_df: dict[str, Any]) -> Optional[dict[str, Any]]:
def _get_verified_replica(queried_df: IngestedDatafile) -> Optional[Replica]:
"""Returns the first Replica that is verified, or None if there isn't any."""
replicas: list[dict[str, Any]] = queried_df["replicas"]

# Iterate through all replicas. If one replica is verified, then
# return it.
for replica in replicas:
if replica["verified"]:
for replica in queried_df.replicas:
if replica.verified:
return replica
return None


def _is_completed_df(
df: RawDatafile,
query_result: Optional[list[dict[str, Any]]],
query_result: Optional[list[IngestedDatafile]],
min_file_age: Optional[int],
) -> bool:
"""Checks if a datafile has been ingested, verified, and its age is higher
Expand All @@ -58,7 +59,7 @@ def _is_completed_df(
return False
if min_file_age:
# Check file age for the replica.
vtime = datetime.fromisoformat(replica["last_verified_time"])
vtime = datetime.fromisoformat(replica.last_verified_time)
days_from_vtime = (datetime.now() - vtime).days
logger.info("%s was last verified %i days ago.", pth, days_from_vtime)
if days_from_vtime < min_file_age:
Expand Down
2 changes: 1 addition & 1 deletion src/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
from requests import PreparedRequest
from requests.auth import AuthBase

from src.blueprints.custom_data_types import MTUrl
from src.blueprints.storage_boxes import StorageTypesEnum
from src.mytardis_client.common_types import MTUrl

logger = logging.getLogger(__name__)

Expand Down
8 changes: 4 additions & 4 deletions src/ingestion_factory/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def __init__(
self.config = config

mt_rest = mt_rest or MyTardisRESTFactory(config.auth, config.connection)
self._overseer = overseer or Overseer(mt_rest)
self._overseer: Overseer = overseer or Overseer(mt_rest)

self.forge = forge or Forge(mt_rest)
self.smelter = smelter or Smelter(
Expand Down Expand Up @@ -105,7 +105,7 @@ def ingest_projects(
MyTardisObject.PROJECT, project.model_dump()
)
if len(matching_projects) > 0:
project_uri = URI(matching_projects[0]["resource_uri"])
project_uri = matching_projects[0].resource_uri
# Would we ever get multiple matches? If so, what should we do?
logging.info(
'Already ingested project "%s" as "%s". Skipping project ingestion.',
Expand Down Expand Up @@ -145,7 +145,7 @@ def ingest_experiments(
MyTardisObject.EXPERIMENT, experiment.model_dump()
)
if len(matching_experiments) > 0:
experiment_uri = URI(matching_experiments[0]["resource_uri"])
experiment_uri = matching_experiments[0].resource_uri
logging.info(
'Already ingested experiment "%s" as "%s". Skipping experiment ingestion.',
experiment.title,
Expand Down Expand Up @@ -184,7 +184,7 @@ def ingest_datasets(
MyTardisObject.DATASET, dataset.model_dump()
)
if len(matching_datasets) > 0:
dataset_uri = URI(matching_datasets[0]["resource_uri"])
dataset_uri = matching_datasets[0].resource_uri
logging.info(
'Already ingested dataset "%s" as "%s". Skipping dataset ingestion.',
dataset.description,
Expand Down
13 changes: 9 additions & 4 deletions src/inspector/inspector.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@
"""

import logging
from typing import Any, Optional
from typing import Optional

from typeguard import check_type

from src.blueprints.datafile import RawDatafile
from src.config.config import ConfigFromEnv
from src.crucible.crucible import Crucible
from src.mytardis_client.mt_rest import MyTardisRESTFactory
from src.mytardis_client.objects import MyTardisObject
from src.mytardis_client.response_data import IngestedDatafile
from src.overseers.overseer import Overseer
from src.smelters.smelter import Smelter

Expand All @@ -28,11 +31,11 @@ def __init__(self, config: ConfigFromEnv) -> None:
default_schema=config.default_schema,
)
crucible = Crucible(overseer)
self._overseer = overseer
self._overseer: Overseer = overseer
self._smelter = smelter
self._crucible = crucible

def query_datafile(self, raw_df: RawDatafile) -> Optional[list[dict[str, Any]]]:
def query_datafile(self, raw_df: RawDatafile) -> Optional[list[IngestedDatafile]]:
"""Partially ingests raw datafile and queries MyTardis for matching instances.
Args:
Expand All @@ -50,6 +53,8 @@ def query_datafile(self, raw_df: RawDatafile) -> Optional[list[dict[str, Any]]]:
return None

# Look up the datafile in MyTardis to check if it's ingested.
return self._overseer.get_matching_objects(
matches = self._overseer.get_matching_objects(
MyTardisObject.DATAFILE, df.model_dump()
)

return check_type(matches, list[IngestedDatafile])
22 changes: 21 additions & 1 deletion src/mytardis_client/common_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
objects.py for that."""

from enum import Enum
from typing import Literal
from typing import Annotated, Literal

from pydantic import AfterValidator

from src.utils.validation import validate_isodatetime, validate_md5sum, validate_url

# The HTTP methods supported by MyTardis. Can be used to constrain the request interfaces
# to ensure that only methods that are supported by MyTardis are used.
Expand Down Expand Up @@ -35,3 +39,19 @@ class DataStatus(Enum):
READY_FOR_INGESTION = 1
INGESTED = 5
FAILED = 10


# String type validated as an MD5 checksum (see validate_md5sum).
MD5Sum = Annotated[
    str,
    AfterValidator(validate_md5sum),
]

# String type validated as an ISO 8601 date/time stamp (see validate_isodatetime).
ISODateTime = Annotated[
    str,
    AfterValidator(validate_isodatetime),
]

# String type validated as a well-formed URL (see validate_url).
MTUrl = Annotated[
    str,
    AfterValidator(validate_url),
]
Loading

0 comments on commit 20b7c74

Please sign in to comment.