diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b0b860..e3f7465 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased] +### Changed + +* Increased the look-back period for the inbox query from 4 weeks to + 12 weeks. Introduced a preliminary filtering by the QC state, which is + now available in ml warehouse. Since the ml warehouse QC state might not + be up-to-date, a final check against the LangQC database is retained. + ## [1.4.1] - 2023-08-23 ### Added diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 8e2e7af..226203b 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -54,7 +54,7 @@ class WellWh(BaseModel): title="SQLAlchemy Session", description="A SQLAlchemy Session for the ml warehouse database", ) - INBOX_LOOK_BACK_NUM_WEEKS: ClassVar = 4 + INBOX_LOOK_BACK_NUM_WEEKS: ClassVar = 12 class Config: allow_mutation = False @@ -77,18 +77,16 @@ def get_mlwh_well_by_product_id( def recent_completed_wells(self) -> List[PacBioRunWellMetrics]: """ - Get recent completed wells from the mlwh database. - The implementation of the inbox query might change when the QC outcomes - become available in mlwh. + Get recent not QC-ed completed wells from the mlwh database. """ ###### # It is important not to show aborted wells in the inbox. # - # The well can be complete as in Illumina 'run complete' but that's not - # the same as analysis complete which the other conditions are trying for. - # It potentially gets a bit easier with v11 but those conditions should - # still work ok. + # The well can be complete, but that's not the same as analysis + # complete which the other conditions are trying for. + # It potentially gets a bit easier with v11 but those conditions + # should still work ok. # # Using current local time. 
@@ -97,11 +95,15 @@ def recent_completed_wells(self) -> List[PacBioRunWellMetrics]: my_date = date.today() - timedelta(weeks=self.INBOX_LOOK_BACK_NUM_WEEKS) look_back_min_date = datetime(my_date.year, my_date.month, my_date.day) + # Select the wells that have not been QC-ed, but later double-check against + # the LangQC database. + # TODO: fall back to run_complete when well_complete is undefined query = ( select(PacBioRunWellMetrics) .where(PacBioRunWellMetrics.well_status == "Complete") + .where(PacBioRunWellMetrics.qc_seq_state.is_(None)) .where(PacBioRunWellMetrics.run_complete > look_back_min_date) .where(PacBioRunWellMetrics.polymerase_num_reads.is_not(None)) .where( diff --git a/tests/fixtures/well_data.py b/tests/fixtures/well_data.py index 8bca2c0..7899c8b 100644 --- a/tests/fixtures/well_data.py +++ b/tests/fixtures/well_data.py @@ -864,34 +864,40 @@ def load_data4well_retrieval( qcdb_test_session.commit() # We want some wells to be in the inbox. For that their run_complete dates - # should be within last four weeks. Therefore, we need to update the timestamps - # for these runs. + # should be within, for example, the last four weeks. Therefore, we need to + # update the timestamps for these runs. _update_timestamps4inbox() # Transform a list of lists into a list of hashes, which map to db rows. 
mlwh_data4insert = [] for record in MLWH_DATA: - mlwh_data4insert.append( - { - "pac_bio_run_name": record[0], - "well_label": record[1], - "run_start": record[2], - "run_complete": record[3], - "well_start": record[4], - "well_complete": record[5], - "well_status": record[6], - "run_status": record[7], - "ccs_execution_mode": record[8], - "polymerase_num_reads": record[9], - "hifi_num_reads": record[10], - "id_pac_bio_product": PacBioEntity( - run_name=record[0], well_label=record[1], plate_number=record[13] - ).hash_product_id(), - "instrument_name": record[11], - "instrument_type": record[12], - "plate_number": record[13], - } - ) + data = { + "pac_bio_run_name": record[0], + "well_label": record[1], + "run_start": record[2], + "run_complete": record[3], + "well_start": record[4], + "well_complete": record[5], + "well_status": record[6], + "run_status": record[7], + "ccs_execution_mode": record[8], + "polymerase_num_reads": record[9], + "hifi_num_reads": record[10], + "id_pac_bio_product": PacBioEntity( + run_name=record[0], well_label=record[1], plate_number=record[13] + ).hash_product_id(), + "instrument_name": record[11], + "instrument_type": record[12], + "plate_number": record[13], + } + # Add QC state for two wells of one run. 
+ if (data["pac_bio_run_name"] == "TRACTION_RUN_4") and ( + data["well_label"] in ("A1", "B1") + ): + data["qc_seq_state"] = "Failed" + data["qc_seq_date"] = data["run_complete"] + mlwh_data4insert.append(data) + mlwhdb_test_session.execute(insert(PacBioRunWellMetrics), mlwh_data4insert) mlwhdb_test_session.commit() diff --git a/tests/test_wh_data_retrieval_pb.py b/tests/test_wh_data_retrieval_pb.py index c3bd277..fc3c94c 100644 --- a/tests/test_wh_data_retrieval_pb.py +++ b/tests/test_wh_data_retrieval_pb.py @@ -17,8 +17,6 @@ def test_completed_wells_retrieval(mlwhdb_test_session, load_data4well_retrieval ["TRACTION_RUN_1", "B1"], ["TRACTION_RUN_1", "C1"], ["TRACTION_RUN_1", "D1"], - ["TRACTION_RUN_4", "A1"], - ["TRACTION_RUN_4", "B1"], ["TRACTION_RUN_4", "C1"], ["TRACTION_RUN_4", "D1"], ["TRACTION_RUN_3", "A1"], @@ -42,7 +40,7 @@ def test_completed_wells_retrieval(mlwhdb_test_session, load_data4well_retrieval .scalars() .all() ) - time = datetime.now() - timedelta(days=40) + time = datetime.now() - timedelta(days=130) for row in wells_to_update: if row.well_complete is not None: row.well_complete = time