Merge pull request #212 from mgcam/study_name4summary

PacBioWellSummary as a response model for well summaries
wtsi-npg · Mar 22, 2024 · f599e37 · f599e37
2 parents 7de46f6 + a43b88a
commit f599e37
Show file tree

Hide file tree

Showing 23 changed files with 2,979 additions and 822 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,23 @@
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
+## [Unreleased]
+
+### Changed
+
+* To simplify object instantiation and fields' assignment for some
+  of the response models, converted `PacBioWell` and `PacBioWellFull`
+  models to pydantic dataclasses.
+
+### Added
+
+* A new response model `PacBioWellSummary`, which replaces `PacBioWell`
+  in the latter's capacity as the response model for a PacBio well
+  summary.
+* A new field, `study_names`, a potentially empty sorted array of
+  unique study names, is added to the response model for a PacBio
+  well summary.
+
 ## [2.0.0] - 2024-02-20
 
 ### Changed

diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py
@@ -33,7 +33,7 @@
 )
 from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
 from lang_qc.db.qc_schema import QcState, QcStateDict, QcType
-from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWell
+from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary
 from lang_qc.models.pager import PagedResponse
 from lang_qc.models.qc_flow_status import QcFlowStatusEnum
 from lang_qc.models.qc_state import QcState as QcStateModel
@@ -195,7 +195,7 @@ def create_for_qc_status(
         specified by the `page_size`, `page_number` object's attributes and
         `qc_flow_status` argument of this function..
 
-        The `PacBioWell` objects in `wells` attribute of the returned object
+        The `PacBioWellSummary` objects in `wells` attribute of the returned object
         are sorted in a way appropriate for the requested `qc_flow_status`.
         For the 'in progress' and 'on hold' requests the wells with most recently
         assigned QC states come first. For inbox requests the wells with least
@@ -230,7 +230,7 @@ def create_for_run(self, run_name: str) -> PacBioPagedWells:
         """
         Returns `PacBioPagedWells` object that corresponds to the criteria
         specified by the `page_size` and `page_number` attributes.
-        The `PacBioWell` objects in `wells` attribute of the returned object
+        The `PacBioWellSummary` objects in `wells` attribute of the returned object
         belong to runs specified by the `run_name` argument and are sorted
         by the run name and well label.
         """
@@ -281,7 +281,7 @@ def _retrieve_paged_qc_states(
 
     def _get_wells_for_status(
         self, qc_flow_status: QcFlowStatusEnum
-    ) -> List[PacBioWell]:
+    ) -> List[PacBioWellSummary]:
 
         wells = []
 
@@ -290,7 +290,7 @@ def _get_wells_for_status(
             id_product = qc_state_model.id_product
             mlwh_well = self.get_mlwh_well_by_product_id(id_product=id_product)
             if mlwh_well is not None:
-                pbw = PacBioWell(db_well=mlwh_well, qc_state=qc_state_model)
+                pbw = PacBioWellSummary(db_well=mlwh_well, qc_state=qc_state_model)
                 wells.append(pbw)
             else:
                 """
@@ -394,7 +394,7 @@ def _well_models(
             qc_state = None
             if id_product in qced_products:
                 qc_state = qced_products[id_product][0]
-            pb_well = PacBioWell(db_well=db_well, qc_state=qc_state)
+            pb_well = PacBioWellSummary(db_well=db_well, qc_state=qc_state)
             pb_wells.append(pb_well)
 
         return pb_wells

diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py
@@ -538,6 +538,27 @@ class PacBioRunWellMetrics(Base):
         "PacBioProductMetrics", back_populates="pac_bio_run_well_metrics"
     )
 
+    def get_experiment_info(self):
+        """Returns a list of PacBioRun mlwh database rows.
+
+        Returns LIMS information about the PacBio experiment
+        for this well, one pac_bio_run table row per sample
+        (product) in the well.
+
+        If any or all of the pac_bio_product_metrics rows linked
+        to this well record are not linked to the pac_bio_run
+        table, an empty array is returned, thus preventing incomplete
+        data being supplied to the client.
+        """
+        product_metrics = self.pac_bio_product_metrics
+        experiment_info = [
+            pbr for pbr in [pm.pac_bio_run for pm in product_metrics] if pbr is not None
+        ]
+        if len(experiment_info) != len(product_metrics):
+            experiment_info = []
+
+        return experiment_info
+
 
 class PacBioProductMetrics(Base):
     __tablename__ = "pac_bio_product_metrics"

diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py
@@ -60,7 +60,7 @@ class PacBioWell:
     `run_name`, `label`, `plate_number`, and `id_product` fields uniquely
     identify the well. The model also has fields that reflect the time line
     of the run and information about a PacBio instrument. The optional
-    `qc_state  field might contain the current QC state of the well.
+    `qc_state` field might contain the current QC state of the well.
 
     The best way to instantiate the model is via the constructor, supplying
     the an ORM object representing a database row with information about
@@ -133,6 +133,7 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]:
 
         # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi
         mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"]
+        assert mlwh_db_row
 
         column_names = [column.key for column in PacBioRunWellMetrics.__table__.columns]
 
@@ -147,14 +148,41 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]:
         return assigned
 
 
+@dataclass(kw_only=True, frozen=True)
+class PacBioWellSummary(PacBioWell):
+    """A response model for a summary about a single PacBio well.
+
+    Adds `study_names` to the list of attributes of the parent class `PacBioWell`.
+    Instance creation is described in the documentation of the parent class.
+
+    `get_experiment_info` method in this package is used to retrieve study
+    information, see its documentation for details.
+    """
+
+    study_names: list = Field(
+        title="An alphabetically sorted list of distinct study names",
+    )
+
+    @model_validator(mode="before")
+    def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]:
+
+        assigned = super().pre_root(values)
+        mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"]
+        assigned["study_names"] = sorted(
+            set([row.study.name for row in mlwh_db_row.get_experiment_info()])
+        )
+
+        return assigned
+
+
 class PacBioPagedWells(PagedResponse, extra="forbid"):
     """A response model for paged data about PacBio wells."""
 
-    wells: list[PacBioWell] = Field(
+    wells: list[PacBioWellSummary] = Field(
         default=[],
-        title="A list of PacBioWell objects",
+        title="A list of PacBioWellSummary objects",
         description="""
-        A list of `PacBioWell` objects that corresponds to the page number
+        A list of `PacBioWellSummary` objects that corresponds to the page number
         and size specified by the `page_size` and `page_number` attributes.
         """,
     )
@@ -164,13 +192,16 @@ class PacBioPagedWells(PagedResponse, extra="forbid"):
 class PacBioWellFull(PacBioWell):
     """A full response model for a single PacBio well.
 
-    The model has teh fields that uniquely define the well (`run_name`, `label`,
+    The model has the fields that uniquely define the well (`run_name`, `label`,
     `plate_number`, `id_product`), along with the laboratory experiment and
     sequence run tracking information, current QC state of this well and
     QC data for this well.
 
     Instance creation is described in the documentation of this class's parent
     `PacBioWell`.
+
+    `get_experiment_info` method in this package is used to retrieve information
+    about the experiment, see its documentation for details.
     """
 
     metrics: QCDataWell = Field(
@@ -189,16 +220,9 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]:
 
         assigned = super().pre_root(values)
         mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"]
-
         assigned["metrics"] = QCDataWell.from_orm(mlwh_db_row)
-
-        product_metrics = mlwh_db_row.pac_bio_product_metrics
-        experiment_info = [
-            pbr for pbr in [pm.pac_bio_run for pm in product_metrics] if pbr is not None
-        ]
-        # Occasionally product rows are not linked to LIMS rows.
-        # Go for all or nothing, do not supply incomplete data.
-        if len(experiment_info) and (len(experiment_info) == len(product_metrics)):
+        experiment_info = mlwh_db_row.get_experiment_info()
+        if len(experiment_info):
             assigned["experiment_tracking"] = PacBioExperiment.from_orm(experiment_info)
 
         return assigned
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -20,15 +20,6 @@
 test_ini = os.path.join(os.path.dirname(__file__), "testdb.ini")
 
 
-@pytest.fixture(scope="package")
-def config() -> configparser.ConfigParser:
-    # Database credentials for the test MySQL instance are stored here. This
-    # should be an instance in a container, discarded after each test run.
-    test_config = configparser.ConfigParser()
-    test_config.read(test_ini)
-    return test_config
-
-
 def mysql_url(
     config: configparser.ConfigParser,
     section: str,
@@ -70,6 +61,43 @@ def mysql_url(
     )
 
 
+def insert_from_yaml(session, dir_path, module_name):
+
+    # Load the schema module where the table ORM classes are defined.
+    module = importlib.import_module(module_name)
+
+    # Find all files in a given directory.
+    dir_obj = pathlib.Path(dir_path)
+    file_paths = list(str(f) for f in dir_obj.iterdir())
+    file_paths.sort()
+
+    for file_path in file_paths:
+        with open(file_path, "r") as f:
+            (head, file_name) = os.path.split(file_path)
+            # File name example: 200-PacBioRun.yml
+            m = re.match(r"\A\d+-([a-zA-Z]+)\.yml\Z", file_name)
+            if m is not None:
+                class_name = m.group(1)
+                table_class = getattr(module, class_name)
+                data = yaml.safe_load(f)
+                session.execute(insert(table_class), data)
+
+    session.commit()
+
+
+def compare_dates(date_obj, date_string):
+    assert date_obj.isoformat(sep=" ", timespec="seconds") == date_string
+
+
+@pytest.fixture(scope="package")
+def config() -> configparser.ConfigParser:
+    # Database credentials for the test MySQL instance are stored here. This
+    # should be an instance in a container, discarded after each test run.
+    test_config = configparser.ConfigParser()
+    test_config.read(test_ini)
+    return test_config
+
+
 @pytest.fixture(scope="module", name="mlwhdb_test_sessionfactory")
 def create_mlwhdb_test_sessionfactory(config):
     """Create a MLWH SQLAlchemy session factory, using credentials from config.
@@ -165,30 +193,8 @@ def override_get_qc_db():
     return client
 
 
-def insert_from_yaml(session, dir_path, module_name):
-
-    # Load the schema module where the table ORM classes are defined.
-    module = importlib.import_module(module_name)
-
-    # Find all files in a given directory.
-    dir_obj = pathlib.Path(dir_path)
-    file_paths = list(str(f) for f in dir_obj.iterdir())
-    file_paths.sort()
-
-    for file_path in file_paths:
-        with open(file_path, "r") as f:
-            (head, file_name) = os.path.split(file_path)
-            # File name example: 200-PacBioRun.yml
-            m = re.match(r"\A\d+-([a-zA-Z]+)\.yml\Z", file_name)
-            if m is None:
-                raise Exception(f"Unexpected file {file_path} in fixtures.")
-            class_name = m.group(1)
-            table_class = getattr(module, class_name)
-            data = yaml.safe_load(f)
-            session.execute(insert(table_class), data)
-
-    session.commit()
-
-
-def compare_dates(date_obj, date_string):
-    assert date_obj.isoformat(sep=" ", timespec="seconds") == date_string
+@pytest.fixture(scope="module", name="mlwhdb_load_runs")
+def mlwhdb_load_from_yaml(mlwhdb_test_session):
+    insert_from_yaml(
+        mlwhdb_test_session, "tests/data/mlwh_pb_runs", "lang_qc.db.mlwh_schema"
+    )