Skip to content

Commit

Permalink
Increased the look-back period for inbox.
Browse files Browse the repository at this point in the history
Increased the look-back period for the inbox query
from 4 weeks to 12 weeks. Added preliminary filtering
by the QC state, which is now available in mlwh. Since
the mlwh QC state might not be up-to-date, a final check
against the LangQC database is retained.
  • Loading branch information
mgcam committed Sep 22, 2023
1 parent d4d98ab commit 3528f8a
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 34 deletions.
18 changes: 10 additions & 8 deletions lang_qc/db/helper/wells.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class WellWh(BaseModel):
title="SQLAlchemy Session",
description="A SQLAlchemy Session for the ml warehouse database",
)
INBOX_LOOK_BACK_NUM_WEEKS: ClassVar = 4
INBOX_LOOK_BACK_NUM_WEEKS: ClassVar = 12

class Config:
allow_mutation = False
Expand All @@ -77,18 +77,16 @@ def get_mlwh_well_by_product_id(

def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
"""
Get recent completed wells from the mlwh database.
The implementation of the inbox query might change when the QC outcomes
become available in mlwh.
Get recent not QC-ed completed wells from the mlwh database.
"""

######
# It is important not to show aborted wells in the inbox.
#
# The well can be complete as in Illumina 'run complete' but that's not
# the same as analysis complete which the other conditions are trying for.
# It potentially gets a bit easier with v11 but those conditions should
# still work ok.
# The well can be complete, but that's not the same as analysis
# complete which the other conditions are trying for.
# It potentially gets a bit easier with v11 but those conditions
# should still work ok.
#

# Using current local time.
Expand All @@ -97,11 +95,15 @@ def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
my_date = date.today() - timedelta(weeks=self.INBOX_LOOK_BACK_NUM_WEEKS)
look_back_min_date = datetime(my_date.year, my_date.month, my_date.day)

# Select the wells that have not been QC-ed, but later double-check against
# the LangQC database.

# TODO: fall back to run_complete when well_complete is undefined

query = (
select(PacBioRunWellMetrics)
.where(PacBioRunWellMetrics.well_status == "Complete")
.where(PacBioRunWellMetrics.qc_seq_state.is_(None))
.where(PacBioRunWellMetrics.run_complete > look_back_min_date)
.where(PacBioRunWellMetrics.polymerase_num_reads.is_not(None))
.where(
Expand Down
52 changes: 29 additions & 23 deletions tests/fixtures/well_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,34 +864,40 @@ def load_data4well_retrieval(
qcdb_test_session.commit()

# We want some wells to be in the inbox. For that their run_complete dates
# should be within last four weeks. Therefore, we need to update the timestamps
# for these runs.
# should be within, for example, the last four weeks. Therefore, we need to
# update the timestamps for these runs.
_update_timestamps4inbox()

# Transform a list of lists into a list of hashes, which map to db rows.
mlwh_data4insert = []
for record in MLWH_DATA:
mlwh_data4insert.append(
{
"pac_bio_run_name": record[0],
"well_label": record[1],
"run_start": record[2],
"run_complete": record[3],
"well_start": record[4],
"well_complete": record[5],
"well_status": record[6],
"run_status": record[7],
"ccs_execution_mode": record[8],
"polymerase_num_reads": record[9],
"hifi_num_reads": record[10],
"id_pac_bio_product": PacBioEntity(
run_name=record[0], well_label=record[1], plate_number=record[13]
).hash_product_id(),
"instrument_name": record[11],
"instrument_type": record[12],
"plate_number": record[13],
}
)
data = {
"pac_bio_run_name": record[0],
"well_label": record[1],
"run_start": record[2],
"run_complete": record[3],
"well_start": record[4],
"well_complete": record[5],
"well_status": record[6],
"run_status": record[7],
"ccs_execution_mode": record[8],
"polymerase_num_reads": record[9],
"hifi_num_reads": record[10],
"id_pac_bio_product": PacBioEntity(
run_name=record[0], well_label=record[1], plate_number=record[13]
).hash_product_id(),
"instrument_name": record[11],
"instrument_type": record[12],
"plate_number": record[13],
}
# Add QC state for two wells (A1 and B1) of one run.
if (data["pac_bio_run_name"] == "TRACTION_RUN_4") and (
data["well_label"] in ("A1", "B1")
):
data["qc_seq_state"] = "Failed"
data["qc_seq_date"] = data["run_complete"]
mlwh_data4insert.append(data)

mlwhdb_test_session.execute(insert(PacBioRunWellMetrics), mlwh_data4insert)
mlwhdb_test_session.commit()

Expand Down
4 changes: 1 addition & 3 deletions tests/test_wh_data_retrieval_pb.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ def test_completed_wells_retrieval(mlwhdb_test_session, load_data4well_retrieval
["TRACTION_RUN_1", "B1"],
["TRACTION_RUN_1", "C1"],
["TRACTION_RUN_1", "D1"],
["TRACTION_RUN_4", "A1"],
["TRACTION_RUN_4", "B1"],
["TRACTION_RUN_4", "C1"],
["TRACTION_RUN_4", "D1"],
["TRACTION_RUN_3", "A1"],
Expand All @@ -42,7 +40,7 @@ def test_completed_wells_retrieval(mlwhdb_test_session, load_data4well_retrieval
.scalars()
.all()
)
time = datetime.now() - timedelta(days=40)
time = datetime.now() - timedelta(days=130)
for row in wells_to_update:
if row.well_complete is not None:
row.well_complete = time
Expand Down

0 comments on commit 3528f8a

Please sign in to comment.