wtsi-npg · nerdstrike · Oct 2, 2023 · Sep 22, 2023 · Oct 2, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 
 ## [Unreleased]
 
+### Changed
+
+* Increased the look-back period for the inbox query from 4 weeks to
+  12 weeks. Introduced preliminary filtering by the QC state, which is
+  now available in ml warehouse. Since the ml warehouse QC state might not
+  be up-to-date, a final check against the LangQC database is retained.
+
 ## [1.4.1] - 2023-08-23
 
 ### Added

diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py
@@ -54,7 +54,7 @@ class WellWh(BaseModel):
         title="SQLAlchemy Session",
         description="A SQLAlchemy Session for the ml warehouse database",
     )
-    INBOX_LOOK_BACK_NUM_WEEKS: ClassVar = 4
+    INBOX_LOOK_BACK_NUM_WEEKS: ClassVar = 12
 
     class Config:
         allow_mutation = False
@@ -77,18 +77,16 @@ def get_mlwh_well_by_product_id(
 
     def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
         """
-        Get recent completed wells from the mlwh database.
-        The implementation of the inbox query might change when the QC outcomes
-        become available in mlwh.
+        Get recent completed wells that are not yet QC-ed from the mlwh database.
         """
 
         ######
         # It is important not to show aborted wells in the inbox.
         #
-        # The well can be complete as in Illumina 'run complete' but that's not
-        # the same as analysis complete which the other conditions are trying for.
-        # It potentially gets a bit easier with v11 but those conditions should
-        # still work ok.
+        # The well can be complete, but that's not the same as analysis
+        # complete which the other conditions are trying for.
+        # It potentially gets a bit easier with v11 but those conditions
+        # should still work ok.
         #
 
         # Using current local time.
@@ -97,11 +95,15 @@ def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
         my_date = date.today() - timedelta(weeks=self.INBOX_LOOK_BACK_NUM_WEEKS)
         look_back_min_date = datetime(my_date.year, my_date.month, my_date.day)
 
+        # Select the wells that have not been QC-ed, but later double-check against
+        # the LangQC database.
+
         # TODO: fall back to run_complete when well_complete is undefined
 
         query = (
             select(PacBioRunWellMetrics)
             .where(PacBioRunWellMetrics.well_status == "Complete")
+            .where(PacBioRunWellMetrics.qc_seq_state.is_(None))
             .where(PacBioRunWellMetrics.run_complete > look_back_min_date)
             .where(PacBioRunWellMetrics.polymerase_num_reads.is_not(None))
             .where(

diff --git a/tests/fixtures/well_data.py b/tests/fixtures/well_data.py
@@ -864,34 +864,40 @@ def load_data4well_retrieval(
     qcdb_test_session.commit()
 
     # We want some wells to be in the inbox. For that their run_complete dates
-    # should be within last four weeks. Therefore, we need to update the timestamps
-    # for these runs.
+    # should be within, for example, the last four weeks. Therefore, we need to
+    # update the timestamps for these runs.
     _update_timestamps4inbox()
 
     # Transform a list of lists into a list of hashes, which map to db rows.
     mlwh_data4insert = []
     for record in MLWH_DATA:
-        mlwh_data4insert.append(
-            {
-                "pac_bio_run_name": record[0],
-                "well_label": record[1],
-                "run_start": record[2],
-                "run_complete": record[3],
-                "well_start": record[4],
-                "well_complete": record[5],
-                "well_status": record[6],
-                "run_status": record[7],
-                "ccs_execution_mode": record[8],
-                "polymerase_num_reads": record[9],
-                "hifi_num_reads": record[10],
-                "id_pac_bio_product": PacBioEntity(
-                    run_name=record[0], well_label=record[1], plate_number=record[13]
-                ).hash_product_id(),
-                "instrument_name": record[11],
-                "instrument_type": record[12],
-                "plate_number": record[13],
-            }
-        )
+        data = {
+            "pac_bio_run_name": record[0],
+            "well_label": record[1],
+            "run_start": record[2],
+            "run_complete": record[3],
+            "well_start": record[4],
+            "well_complete": record[5],
+            "well_status": record[6],
+            "run_status": record[7],
+            "ccs_execution_mode": record[8],
+            "polymerase_num_reads": record[9],
+            "hifi_num_reads": record[10],
+            "id_pac_bio_product": PacBioEntity(
+                run_name=record[0], well_label=record[1], plate_number=record[13]
+            ).hash_product_id(),
+            "instrument_name": record[11],
+            "instrument_type": record[12],
+            "plate_number": record[13],
+        }
+        # Add QC state for two wells of one run.
+        if (data["pac_bio_run_name"] == "TRACTION_RUN_4") and (
+            data["well_label"] in ("A1", "B1")
+        ):
+            data["qc_seq_state"] = "Failed"
+            data["qc_seq_date"] = data["run_complete"]
+        mlwh_data4insert.append(data)
+
     mlwhdb_test_session.execute(insert(PacBioRunWellMetrics), mlwh_data4insert)
     mlwhdb_test_session.commit()
 

diff --git a/tests/test_wh_data_retrieval_pb.py b/tests/test_wh_data_retrieval_pb.py
@@ -17,8 +17,6 @@ def test_completed_wells_retrieval(mlwhdb_test_session, load_data4well_retrieval
         ["TRACTION_RUN_1", "B1"],
         ["TRACTION_RUN_1", "C1"],
         ["TRACTION_RUN_1", "D1"],
-        ["TRACTION_RUN_4", "A1"],
-        ["TRACTION_RUN_4", "B1"],
         ["TRACTION_RUN_4", "C1"],
         ["TRACTION_RUN_4", "D1"],
         ["TRACTION_RUN_3", "A1"],
@@ -42,7 +40,7 @@ def test_completed_wells_retrieval(mlwhdb_test_session, load_data4well_retrieval
         .scalars()
         .all()
     )
-    time = datetime.now() - timedelta(days=40)
+    time = datetime.now() - timedelta(days=130)
     for row in wells_to_update:
         if row.well_complete is not None:
             row.well_complete = time