From c54de496458e1ea8886959223f7790cfc37cc837 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 5 Mar 2024 10:03:38 +0000 Subject: [PATCH 1/5] Use Optional type hint for fields with None default --- lang_qc/models/pacbio/well.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index 54adff5..9e42a25 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -57,16 +57,20 @@ class PacBioWell(BaseModel, extra="forbid"): title="Run name", description="PacBio run name as registered in LIMS" ) # Run and well tracking information from SMRT Link - run_start_time: datetime = Field(default=None, title="Run start time") - run_complete_time: datetime = Field(default=None, title="Run complete time") - well_start_time: datetime = Field(default=None, title="Well start time") - well_complete_time: datetime = Field(default=None, title="Well complete time") - run_status: str = Field(default=None, title="Current PacBio run status") - well_status: str = Field(default=None, title="Current PacBio well status") - instrument_name: str = Field(default=None, title="Instrument name") - instrument_type: str = Field(default=None, title="Instrument type") - - qc_state: QcState = Field( + run_start_time: Optional[datetime] = Field(default=None, title="Run start time") + run_complete_time: Optional[datetime] = Field( + default=None, title="Run complete time" + ) + well_start_time: Optional[datetime] = Field(default=None, title="Well start time") + well_complete_time: Optional[datetime] = Field( + default=None, title="Well complete time" + ) + run_status: Optional[str] = Field(default=None, title="Current PacBio run status") + well_status: Optional[str] = Field(default=None, title="Current PacBio well status") + instrument_name: Optional[str] = Field(default=None, title="Instrument name") + instrument_type: Optional[str] = Field(default=None, title="Instrument type") + 
+ qc_state: Optional[QcState] = Field( default=None, title="Current QC state of this well", description=""" @@ -117,7 +121,7 @@ class PacBioWellFull(PacBioWell): metrics: QCDataWell = Field( title="Currently available QC data for well", ) - experiment_tracking: PacBioExperiment = Field( + experiment_tracking: Optional[PacBioExperiment] = Field( default=None, title="Experiment tracking information", description=""" From 9d1c7ceffb3064b9e066c6bbdb96736b3218f147 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 5 Mar 2024 12:05:46 +0000 Subject: [PATCH 2/5] pydantic BaseModel is replaced by pydantic dataclass ... for some models in order to simplify instantiation of the objects. --- lang_qc/db/helper/wells.py | 21 ++----- lang_qc/endpoints/pacbio_well.py | 5 +- lang_qc/models/pacbio/well.py | 98 ++++++++++++++++---------------- tests/test_pac_well_full.py | 12 +++- 4 files changed, 67 insertions(+), 69 deletions(-) diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 63ab3b7..91c1d9b 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -290,14 +290,7 @@ def _get_wells_for_status( id_product = qc_state_model.id_product mlwh_well = self.get_mlwh_well_by_product_id(id_product=id_product) if mlwh_well is not None: - pbw = PacBioWell( - id_product=id_product, - run_name=mlwh_well.pac_bio_run_name, - plate_number=mlwh_well.plate_number, - label=mlwh_well.well_label, - qc_state=qc_state_model, - ) - pbw.copy_run_tracking_info(mlwh_well) + pbw = PacBioWell(db_well=mlwh_well, qc_state=qc_state_model) wells.append(pbw) else: """ @@ -398,16 +391,10 @@ def _well_models( pb_wells = [] for db_well in db_wells_list: id_product = db_well.id_pac_bio_product - attrs = { - "id_product": id_product, - "run_name": db_well.pac_bio_run_name, - "plate_number": db_well.plate_number, - "label": db_well.well_label, - } + qc_state = None if id_product in qced_products: - attrs["qc_state"] = qced_products[id_product][0] - pb_well = 
PacBioWell.model_validate(attrs) - pb_well.copy_run_tracking_info(db_well) + qc_state = qced_products[id_product][0] + pb_well = PacBioWell(db_well=db_well, qc_state=qc_state) pb_wells.append(pb_well) return pb_wells diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index dca152e..f9d4957 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -29,6 +29,7 @@ from lang_qc.db.helper.qc import ( assign_qc_state_to_product, claim_qc_for_product, + get_qc_state_for_product, product_has_qc_state, ) from lang_qc.db.helper.well import well_seq_product_find_or_create @@ -179,7 +180,9 @@ def get_seq_metrics( mlwh_well = _find_well_product_or_error(id_product, mlwhdb_session) - return PacBioWellFull.from_orm(mlwh_well, qcdb_session) + qc_state_db = get_qc_state_for_product(session=qcdb_session, id_product=id_product) + qc_state = None if qc_state_db is None else QcState.from_orm(qc_state_db) + return PacBioWellFull(db_well=mlwh_well, qc_state=qc_state) @router.post( diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index 9e42a25..efd5abc 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -21,12 +21,11 @@ # this program. If not, see . from datetime import datetime -from typing import Optional +from typing import Any, Optional -from pydantic import BaseModel, ConfigDict, Field -from sqlalchemy.orm import Session +from pydantic import Field, model_validator +from pydantic.dataclasses import dataclass -from lang_qc.db.helper.qc import get_qc_state_for_product from lang_qc.db.mlwh_schema import PacBioRunWellMetrics from lang_qc.models.pacbio.experiment import PacBioExperiment from lang_qc.models.pacbio.qc_data import QCDataWell @@ -34,7 +33,8 @@ from lang_qc.models.qc_state import QcState -class PacBioWell(BaseModel, extra="forbid"): +@dataclass +class PacBioWell: """ A response model for a single PacBio well on a particular PacBio run. 
The class contains the attributes that uniquely define this well (`run_name` @@ -45,6 +45,8 @@ class PacBioWell(BaseModel, extra="forbid"): sequenced or QC metrics or assessment for such data. """ + db_well: PacBioRunWellMetrics = Field(init_var=True) + # Well identifies. id_product: str = Field(title="Product identifier") label: str = Field(title="Well label", description="The label of the PacBio well") @@ -80,19 +82,33 @@ class PacBioWell(BaseModel, extra="forbid"): """, ) - def copy_run_tracking_info(self, db_well: PacBioRunWellMetrics): + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: """ Populates this object with the run and well tracking information from a database row that is passed as an argument. """ - self.run_start_time = db_well.run_start - self.run_complete_time = db_well.run_complete - self.well_start_time = db_well.well_start - self.well_complete_time = db_well.well_complete - self.run_status = db_well.run_status - self.well_status = db_well.well_status - self.instrument_name = db_well.instrument_name - self.instrument_type = db_well.instrument_type + + # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi + mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] + assigned = dict() + assigned["id_product"] = mlwh_db_row.id_pac_bio_product + assigned["label"] = mlwh_db_row.well_label + assigned["plate_number"] = mlwh_db_row.plate_number + assigned["run_name"] = mlwh_db_row.pac_bio_run_name + assigned["run_start_time"] = mlwh_db_row.run_start + assigned["run_complete_time"] = mlwh_db_row.run_complete + assigned["well_start_time"] = mlwh_db_row.well_start + assigned["well_complete_time"] = mlwh_db_row.well_complete + assigned["run_status"] = mlwh_db_row.run_status + assigned["well_status"] = mlwh_db_row.well_status + assigned["instrument_name"] = mlwh_db_row.instrument_name + assigned["instrument_type"] = mlwh_db_row.instrument_type + + if "qc_state" in 
values.kwargs: + assigned["qc_state"] = values.kwargs["qc_state"] + + return assigned class PacBioPagedWells(PagedResponse, extra="forbid"): @@ -110,6 +126,7 @@ class PacBioPagedWells(PagedResponse, extra="forbid"): ) +@dataclass class PacBioWellFull(PacBioWell): """ A response model for a single PacBio well on a particular PacBio run. @@ -128,37 +145,22 @@ class PacBioWellFull(PacBioWell): Laboratory experiment tracking information for this well, if available. """, ) - model_config = ConfigDict(from_attributes=True, extra="forbid") - - @classmethod - def from_orm(cls, mlwh_db_row: PacBioRunWellMetrics, qc_session: Session): - - id_product = mlwh_db_row.id_pac_bio_product - obj = cls( - id_product=id_product, - run_name=mlwh_db_row.pac_bio_run_name, - plate_number=mlwh_db_row.plate_number, - label=mlwh_db_row.well_label, - metrics=QCDataWell.from_orm(mlwh_db_row), - ) - obj.copy_run_tracking_info(mlwh_db_row) - - experiment_info = [] - for row in mlwh_db_row.pac_bio_product_metrics: - exp_row = row.pac_bio_run - if exp_row: - experiment_info.append(exp_row) - else: - # Do not supply incomplete data. - experiment_info = [] - break - if len(experiment_info): - obj.experiment_tracking = PacBioExperiment.from_orm(experiment_info) - - qc_state_db = get_qc_state_for_product( - session=qc_session, id_product=id_product - ) - if qc_state_db is not None: - obj.qc_state = QcState.from_orm(qc_state_db) - - return obj + + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: + + assigned = super().pre_root(values) + mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] + + assigned["metrics"] = QCDataWell.from_orm(mlwh_db_row) + + product_metrics = mlwh_db_row.pac_bio_product_metrics + experiment_info = [ + pbr for pbr in [pm.pac_bio_run for pm in product_metrics] if pbr is not None + ] + # Occasionally product rows are not linked to LIMS rows. + # Go for all or nothing, do not supply incomplete data. 
+ if len(experiment_info) and (len(experiment_info) == len(product_metrics)): + assigned["experiment_tracking"] = PacBioExperiment.from_orm(experiment_info) + + return assigned diff --git a/tests/test_pac_well_full.py b/tests/test_pac_well_full.py index b1a700e..c8ff08c 100644 --- a/tests/test_pac_well_full.py +++ b/tests/test_pac_well_full.py @@ -1,5 +1,6 @@ from npg_id_generation.pac_bio import PacBioEntity +from lang_qc.db.helper.qc import get_qc_states_by_id_product_list from lang_qc.db.helper.wells import WellWh from lang_qc.models.pacbio.well import PacBioWellFull from tests.conftest import compare_dates, insert_from_yaml @@ -21,7 +22,7 @@ def test_creating_experiment_object( ).hash_product_id() well_row = helper.get_mlwh_well_by_product_id(id_product) - pb_well = PacBioWellFull.from_orm(well_row, qcdb_test_session) + pb_well = PacBioWellFull(db_well=well_row) assert pb_well.id_product == id_product assert pb_well.run_name == "TRACTION-RUN-92" assert pb_well.label == "A1" @@ -45,7 +46,12 @@ def test_creating_experiment_object( ).hash_product_id() well_row = helper.get_mlwh_well_by_product_id(id_product) - pb_well = PacBioWellFull.from_orm(well_row, qcdb_test_session) + qc_state = get_qc_states_by_id_product_list( + session=qcdb_test_session, + ids=[id_product], + sequencing_outcomes_only=True, + ) + pb_well = PacBioWellFull(db_well=well_row, qc_state=qc_state) assert pb_well.id_product == id_product assert pb_well.run_name == "TRACTION_RUN_1" assert pb_well.label == "B1" @@ -65,7 +71,7 @@ def test_creating_experiment_object( ).hash_product_id() well_row = helper.get_mlwh_well_by_product_id(id_product) - pb_well = PacBioWellFull.from_orm(well_row, qcdb_test_session) + pb_well = PacBioWellFull(db_well=well_row, qc_state=None) assert pb_well.id_product == id_product assert pb_well.run_name == "TRACTION_RUN_10" assert pb_well.label == "C1" From bcbabcd31530789f68e6530b1a28b20836e9fb7d Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 5 Mar 2024 13:40:56 +0000 
Subject: [PATCH 3/5] Make the dataclasses semi-immutable - no change for the values --- lang_qc/models/pacbio/well.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index efd5abc..1568842 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -33,7 +33,7 @@ from lang_qc.models.qc_state import QcState -@dataclass +@dataclass(kw_only=True, frozen=True) class PacBioWell: """ A response model for a single PacBio well on a particular PacBio run. @@ -126,7 +126,7 @@ class PacBioPagedWells(PagedResponse, extra="forbid"): ) -@dataclass +@dataclass(kw_only=True, frozen=True) class PacBioWellFull(PacBioWell): """ A response model for a single PacBio well on a particular PacBio run. From 064c2bb3a35bf04a14c99aa46adc01f8604dd017 Mon Sep 17 00:00:00 2001 From: mgcam Date: Wed, 6 Mar 2024 15:22:22 +0000 Subject: [PATCH 4/5] Auto-map column names to model fields --- lang_qc/models/pacbio/well.py | 63 ++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index 1568842..ad6acf8 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -33,6 +33,25 @@ from lang_qc.models.qc_state import QcState +def get_field_names(cls): + """Returns a list of field names for a class given as an argument. + + The fields that can only be used at the object initialisation step + are excluded. + """ + + field_names = [] + for field_name in cls.__dataclass_fields__: + field = cls.__dataclass_fields__[field_name] + if field.default.init_var is True: + continue + name = field.default.validation_alias + if name is None: + name = field.name + field_names.append(name) + return field_names + + @dataclass(kw_only=True, frozen=True) class PacBioWell: """ @@ -48,24 +67,36 @@ class PacBioWell: db_well: PacBioRunWellMetrics = Field(init_var=True) # Well identifies. 
- id_product: str = Field(title="Product identifier") - label: str = Field(title="Well label", description="The label of the PacBio well") + id_product: str = Field( + title="Product identifier", validation_alias="id_pac_bio_product" + ) + label: str = Field( + title="Well label", + description="The label of the PacBio well", + validation_alias="well_label", + ) plate_number: Optional[int] = Field( default=None, title="Plate number", description="Plate number, relevant for Revio instruments only", ) run_name: str = Field( - title="Run name", description="PacBio run name as registered in LIMS" + title="Run name", + description="PacBio run name as registered in LIMS", + validation_alias="pac_bio_run_name", ) # Run and well tracking information from SMRT Link - run_start_time: Optional[datetime] = Field(default=None, title="Run start time") + run_start_time: Optional[datetime] = Field( + default=None, title="Run start time", validation_alias="run_start" + ) run_complete_time: Optional[datetime] = Field( - default=None, title="Run complete time" + default=None, title="Run complete time", validation_alias="run_complete" + ) + well_start_time: Optional[datetime] = Field( + default=None, title="Well start time", validation_alias="well_start" ) - well_start_time: Optional[datetime] = Field(default=None, title="Well start time") well_complete_time: Optional[datetime] = Field( - default=None, title="Well complete time" + default=None, title="Well complete time", validation_alias="well_complete" ) run_status: Optional[str] = Field(default=None, title="Current PacBio run status") well_status: Optional[str] = Field(default=None, title="Current PacBio well status") @@ -91,19 +122,13 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] + + column_names = [column.key for column in 
PacBioRunWellMetrics.__table__.columns] + assigned = dict() - assigned["id_product"] = mlwh_db_row.id_pac_bio_product - assigned["label"] = mlwh_db_row.well_label - assigned["plate_number"] = mlwh_db_row.plate_number - assigned["run_name"] = mlwh_db_row.pac_bio_run_name - assigned["run_start_time"] = mlwh_db_row.run_start - assigned["run_complete_time"] = mlwh_db_row.run_complete - assigned["well_start_time"] = mlwh_db_row.well_start - assigned["well_complete_time"] = mlwh_db_row.well_complete - assigned["run_status"] = mlwh_db_row.run_status - assigned["well_status"] = mlwh_db_row.well_status - assigned["instrument_name"] = mlwh_db_row.instrument_name - assigned["instrument_type"] = mlwh_db_row.instrument_type + for field_name in get_field_names(cls): + if field_name in column_names: + assigned[field_name] = getattr(mlwh_db_row, field_name) if "qc_state" in values.kwargs: assigned["qc_state"] = values.kwargs["qc_state"] From 4d2efbde3c041ae6e29cda1f9f380c5b5d9ae543 Mon Sep 17 00:00:00 2001 From: mgcam Date: Wed, 6 Mar 2024 16:33:20 +0000 Subject: [PATCH 5/5] Update documentation --- lang_qc/models/pacbio/well.py | 45 ++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index ad6acf8..9deea4f 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -37,7 +37,8 @@ def get_field_names(cls): """Returns a list of field names for a class given as an argument. The fields that can only be used at the object initialisation step - are excluded. + are excluded. For fields, which have a validation_alias defined, + this alias is returned rather than the field name. """ field_names = [] @@ -54,14 +55,24 @@ def get_field_names(cls): @dataclass(kw_only=True, frozen=True) class PacBioWell: - """ - A response model for a single PacBio well on a particular PacBio run. 
- The class contains the attributes that uniquely define this well (`run_name` - and `label`), along with the time line and the current QC state of this well, - if any. + """A basic response model for a single PacBio well. + + `run_name`, `label`, `plate_number`, and `id_product` fields uniquely + identify the well. The model also has fields that reflect the time line + of the run and information about a PacBio instrument. The optional + `qc_state` field might contain the current QC state of the well. + + The best way to instantiate the model is via the constructor, supplying + an ORM object representing a database row with information about + the well and, optionally, the model representing the current QC state. - This model does not contain any information about data that was - sequenced or QC metrics or assessment for such data. + Examples: + well_model = PacBioWell(db_well=well_row) + well_model = PacBioWell(db_well=well_row, qc_state=current_qc_state) + + Mapping of the database values to this model's fields is performed by + a pre `__init__` hook. To enable automatic mapping, some fields of this + model have `validation_alias` set. """ db_well: PacBioRunWellMetrics = Field(init_var=True) @@ -137,9 +148,7 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: class PacBioPagedWells(PagedResponse, extra="forbid"): - """ - A response model for paged data about PacBio wells. - """ + """A response model for paged data about PacBio wells.""" wells: list[PacBioWell] = Field( default=[], @@ -153,11 +162,15 @@ class PacBioPagedWells(PagedResponse, extra="forbid"): @dataclass(kw_only=True, frozen=True) class PacBioWellFull(PacBioWell): - """ - A response model for a single PacBio well on a particular PacBio run. - The class contains the attributes that uniquely define this well (`run_name` - and `label`), along with the laboratory experiment and sequence run tracking - information, current QC state of this well and QC data for this well. 
+ """A full response model for a single PacBio well. + + The model has the fields that uniquely define the well (`run_name`, `label`, + `plate_number`, `id_product`), along with the laboratory experiment and + sequence run tracking information, current QC state of this well and + QC data for this well. + + Instance creation is described in the documentation of this class's parent + `PacBioWell`. """ metrics: QCDataWell = Field(