From 59c25027793658b82462c2ba1ea77df08dadcc4d Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Thu, 2 May 2024 13:39:17 +0000 Subject: [PATCH 01/16] Put in some syntactic sugar checksum types to enable differentiation between use of id_product in mlwh tables --- lang_qc/endpoints/pacbio_well.py | 9 +++++---- lang_qc/util/type_checksum.py | 16 ++++++++++++++++ tests/test_checksum_type.py | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index f9d4957..27534f6 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, 2023 Genome Research Ltd. +# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. # # Authors: # Adam Blanchet @@ -40,13 +40,14 @@ from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull from lang_qc.models.qc_flow_status import QcFlowStatusEnum from lang_qc.models.qc_state import QcState, QcStateBasic +from lang_qc.models.pacbio.qc_data import QCPoolMetrics from lang_qc.util.auth import check_user from lang_qc.util.errors import ( InconsistentInputError, InvalidDictValueError, RunNotFoundError, ) -from lang_qc.util.type_checksum import ChecksumSHA256 +from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256, PacBioProductSHA256 """ A collection of API endpoints that are specific to the PacBio sequencing @@ -173,7 +174,7 @@ def get_wells_in_run( response_model=PacBioWellFull, ) def get_seq_metrics( - id_product: ChecksumSHA256, + id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db), qcdb_session: Session = Depends(get_qc_db), ) -> PacBioWellFull: @@ -210,7 +211,7 @@ def get_seq_metrics( status_code=status.HTTP_201_CREATED, ) def claim_qc( - id_product: ChecksumSHA256, + id_product: PacBioWellSHA256, user: User = Depends(check_user), qcdb_session: Session = Depends(get_qc_db), mlwhdb_session: Session = Depends(get_mlwh_db), diff --git a/lang_qc/util/type_checksum.py b/lang_qc/util/type_checksum.py index a704d3e..15b8bdc 100644 --- a/lang_qc/util/type_checksum.py +++ b/lang_qc/util/type_checksum.py @@ -40,3 +40,19 @@ def validate(cls, v, _): def __repr__(self): return f"ChecksumSHA256({super().__repr__()})" + + +class PacBioWellSHA256(ChecksumSHA256): + """ + A checksum generated from the coordinates of a single well on a plate in a PacBio run + """ + pass + + +class PacBioProductSHA256(ChecksumSHA256): + """ + A checksum generated from the combination of run, well, plate and any tags required for deplexing + See `npg_id_generation.pac_bio.PacBioEntity`. + Tags only contribute to the checksum when samples are multiplexed. + """ + pass \ No newline at end of file diff --git a/tests/test_checksum_type.py b/tests/test_checksum_type.py index aba01f7..41474f7 100644 --- a/tests/test_checksum_type.py +++ b/tests/test_checksum_type.py @@ -7,7 +7,7 @@ class ChecksumSHA256User(BaseModel): - product_chcksm: ChecksumSHA256 + product_chcksm: ChecksumSHA256 | None = None def test_valid_checksum(): From 486faca2e5629578a77ba1741e64567e6145fb10 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Fri, 10 May 2024 15:44:11 +0000 Subject: [PATCH 02/16] Undo type fix, it seems to impact how the code works. Disturbing. --- tests/test_checksum_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_checksum_type.py b/tests/test_checksum_type.py index 41474f7..aba01f7 100644 --- a/tests/test_checksum_type.py +++ b/tests/test_checksum_type.py @@ -7,7 +7,7 @@ class ChecksumSHA256User(BaseModel): - product_chcksm: ChecksumSHA256 | None = None + product_chcksm: ChecksumSHA256 def test_valid_checksum(): From 2b9dc32659af72ac08afce95c36befa9ced07636 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Fri, 10 May 2024 15:47:04 +0000 Subject: [PATCH 03/16] Define a response model for pool metrics for a given well --- lang_qc/models/pacbio/qc_data.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py index 3fe13e5..8f64b84 100644 --- a/lang_qc/models/pacbio/qc_data.py +++ b/lang_qc/models/pacbio/qc_data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, 2023 Genome Research Ltd. +# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. # # Authors: # Marina Gourtovaia @@ -23,6 +23,7 @@ from pydantic import BaseModel, ConfigDict, Field from lang_qc.db.mlwh_schema import PacBioRunWellMetrics +from lang_qc.util.type_checksum import PacBioProductSHA256 # Pydantic prohibits us from defining these as @classmethod or @staticmethod @@ -153,3 +154,31 @@ def from_orm(cls, obj: PacBioRunWellMetrics): qc_data[name]["value"] = getattr(obj, name, None) return cls.model_validate(qc_data) + + +class SampleDeplexingStats(BaseModel): + """ + A representation of metrics for one product, some direct from the DB and others inferred + + For a long time tag2_name was null and tag1_name was silently used at both ends of the sequence. + As a result tag2_name will be None for most data in or before 2024. + """ + + id_product: PacBioProductSHA256 + tag1_name: str | None + tag2_name: str | None + hifi_read_bases: int | None + hifi_num_reads: int | None + hifi_read_length_mean: float | None + hifi_bases_percent: float | None + percentage_total_reads: float | None + + +class QCPoolMetrics(BaseModel): + pool_coeff_of_variance: float | None = Field( + title="Coefficient of variance for reads in the pool", + description="Percentage of the standard deviation w.r.t. mean, reported when the pool is larger than one", + ) + products: list[SampleDeplexingStats] = Field( + title="List of products and their metrics" + ) From 3b3c32aecb6a6a8223b74b9b09da93e82868e659 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 14 May 2024 16:07:47 +0000 Subject: [PATCH 04/16] Allow WellWh helper to compute pool metrics --- lang_qc/db/helper/wells.py | 45 ++++++- tests/fixtures/sample_data.py | 205 +++++++++++++++++++++++++++++ tests/test_pac_bio_qc_data_well.py | 32 +++++ 3 files changed, 280 insertions(+), 2 deletions(-) create mode 100644 tests/fixtures/sample_data.py diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 976dd63..48a3042 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, 2023 Genome Research Ltd. +# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. # # Authors: # Marina Gourtovaia @@ -21,6 +21,7 @@ import logging from datetime import date, datetime, timedelta +from statistics import mean, stdev from typing import ClassVar, List from pydantic import BaseModel, ConfigDict, Field @@ -33,11 +34,13 @@ ) from lang_qc.db.mlwh_schema import PacBioRunWellMetrics from lang_qc.db.qc_schema import QcState, QcStateDict, QcType +from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary from lang_qc.models.pager import PagedResponse from lang_qc.models.qc_flow_status import QcFlowStatusEnum from lang_qc.models.qc_state import QcState as QcStateModel from lang_qc.util.errors import EmptyListOfRunNamesError, RunNotFoundError +from lang_qc.util.type_checksum import PacBioWellSHA256 """ This package is using an undocumented feature of Pydantic, type @@ -64,7 +67,7 @@ class WellWh(BaseModel): # The TestClient seems to be keeping these instances alive and changing them. def get_mlwh_well_by_product_id( - self, id_product: str + self, id_product: PacBioWellSHA256 ) -> PacBioRunWellMetrics | None: """ Returns a well row record from the well metrics table or @@ -77,6 +80,44 @@ def get_mlwh_well_by_product_id( ) ).scalar_one_or_none() + def get_metrics_by_well_product_id( + self, id_product: PacBioWellSHA256 + ) -> QCPoolMetrics | None: + well = self.get_mlwh_well_by_product_id(id_product) + if well: + product_metrics = well.pac_bio_product_metrics + if len(product_metrics) == 1: + return None + + cov: float | None + if any(p.hifi_num_reads is None for p in product_metrics): + cov = None + else: + hifi_reads = [prod.hifi_num_reads for prod in product_metrics] + cov = stdev(hifi_reads) / mean(hifi_reads) * 100 + + return QCPoolMetrics( + pool_coeff_of_variance=cov, + products=[ + SampleDeplexingStats( + id_product=prod.id_pac_bio_product, + tag1_name=prod.pac_bio_run.tag_identifier, + tag2_name=prod.pac_bio_run.tag2_identifier, + hifi_read_bases=prod.hifi_read_bases, + hifi_num_reads=prod.hifi_num_reads, + hifi_read_length_mean=prod.hifi_read_length_mean, + hifi_bases_percent=prod.hifi_bases_percent, + percentage_total_reads=( + prod.hifi_num_reads / well.hifi_num_reads * 100 + if well.hifi_num_reads + else None + ), + ) + for prod in product_metrics + ], + ) + return None + def recent_completed_wells(self) -> List[PacBioRunWellMetrics]: """ Get recent not QC-ed completed wells from the mlwh database. diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py new file mode 100644 index 0000000..15ed224 --- /dev/null +++ b/tests/fixtures/sample_data.py @@ -0,0 +1,205 @@ +from datetime import datetime + +import pytest +from npg_id_generation.pac_bio import PacBioEntity + +from lang_qc.db.mlwh_schema import ( + PacBioProductMetrics, + PacBioRun, + PacBioRunWellMetrics, + Sample, + Study, +) + + +@pytest.fixture(scope="function", params=["AAAAAAAA", None]) +def simplex_run(mlwhdb_test_session): + """ + A single sample, well, run mlwh fixture that provides both an explicit tag1 + for the sample, and an implicit default tag (when the PacBio instrument is + run with default barcodes) + """ + run_name = "RUN" + well_label = "A1" + plate_number = 1 + tag1 = mlwhdb_test_session.param + + common_run_attribs = { + "recorded_at": datetime.now(), + "last_updated": datetime.now(), + "pipeline_id_lims": "nobody cares", + "cost_code": "probably ToL", + "id_lims": 1, + "plate_uuid_lims": "uuid1", + "well_uuid_lims": "uuid1", + "pac_bio_library_tube_id_lims": "id", + "pac_bio_library_tube_uuid": "uuid", + "pac_bio_library_tube_name": "bob", + } + + well_metrics_a1 = PacBioRunWellMetrics( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + instrument_type="Revio", + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + ).hash_product_id(), + ) + + study = Study( + id_lims="id", + id_study_lims="1", + ) + + # This run-well-plate has one singly tagged sample + simplex_run = PacBioRun( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + id_pac_bio_run_lims=0, + sample=Sample( + id_lims="id", + id_sample_lims="1", + ), + study=study, + plate_barcode="ABCD", + pac_bio_product_metrics=[ + PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1, + ).hash_product_id(), + qc=1, + hifi_read_bases=900, + hifi_num_reads=10, + hifi_read_length_mean=90, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=90.001, + pac_bio_run_well_metrics=well_metrics_a1, + ) + ], + **common_run_attribs + ) + mlwhdb_test_session.add(simplex_run) + mlwhdb_test_session.commit() + + +@pytest.fixture(scope="function") +def multiplexed_run(mlwhdb_test_session): + "runs for several (2) samples in one run-well-plate" + + run_name = "RUN" + well_label = "B1" + plate_number = 1 + tag1 = "AAAAAAA" + + common_run_attribs = { + "recorded_at": datetime.now(), + "last_updated": datetime.now(), + "pipeline_id_lims": "nobody cares", + "cost_code": "probably ToL", + "id_lims": 1, + "plate_uuid_lims": "uuid1", + "well_uuid_lims": "uuid1", + "pac_bio_library_tube_id_lims": "id", + "pac_bio_library_tube_uuid": "uuid", + "pac_bio_library_tube_name": "bob", + } + + study = Study( + id_lims="id", + id_study_lims="1", + ) + + tag1 = "TTTTTTTT" + tag1_2 = "GGGGGGGG" + well_metrics_b1 = PacBioRunWellMetrics( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + instrument_type="Revio", + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + ).hash_product_id(), + hifi_num_reads=30, + ) + + multiplex_run_1 = PacBioRun( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + id_pac_bio_run_lims=1, + sample=Sample( + id_lims="pooled_id_1", + id_sample_lims="2", + ), + study=study, + plate_barcode="ABCD", + pac_bio_product_metrics=[ + PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1, + ).hash_product_id(), + qc=1, + hifi_read_bases=900, + hifi_num_reads=20, + hifi_read_length_mean=45, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=90.001, + pac_bio_run_well_metrics=well_metrics_b1, + ), + ], + **common_run_attribs + ) + + multiplex_run_2 = PacBioRun( + pac_bio_run_name=run_name, + well_label=well_label, + plate_number=plate_number, + id_pac_bio_run_lims=2, + sample=Sample( + id_lims="pooled_id_2", + id_sample_lims="3", + ), + study=study, + plate_barcode="ABCD", + pac_bio_product_metrics=[ + PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1_2, + ).hash_product_id(), + qc=1, + hifi_read_bases=100, + hifi_num_reads=10, + hifi_read_length_mean=10, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=100.00, + pac_bio_run_well_metrics=well_metrics_b1, + ) + ], + **common_run_attribs + ) + + mlwhdb_test_session.add_all([multiplex_run_1, multiplex_run_2]) + mlwhdb_test_session.commit() + + +# Some runs use "default barcodes" and the tag1 fields in pac_bio_run are empty. When this is true, we also lose the deplex stats +# Show user "default" in the interface? +# Not all runs get any hifi stats in pac_bio_product_metrics. Not all runs use the hifi reads setting diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py index 701cce8..32a07df 100644 --- a/tests/test_pac_bio_qc_data_well.py +++ b/tests/test_pac_bio_qc_data_well.py @@ -2,6 +2,7 @@ from lang_qc.db.helper.wells import WellWh from lang_qc.models.pacbio.qc_data import QCDataWell +from tests.fixtures.sample_data import multiplexed_run, simplex_run def test_creating_qc_data_well(mlwhdb_test_session, mlwhdb_load_runs): @@ -98,3 +99,34 @@ def test_creating_qc_data_well(mlwhdb_test_session, mlwhdb_load_runs): assert ( qc.percentage_deplexed_reads["value"] == None ), "Absent metrics mean this is set to none" + + +def test_pool_metrics_from_single_sample_well(mlwhdb_test_session, simplex_run): + helper = WellWh(session=mlwhdb_test_session) + id = PacBioEntity(run_name="RUN", well_label="A1", plate_number=1).hash_product_id() + + metrics = helper.get_metrics_by_well_product_id(id) + assert metrics is None, "Got no metrics for a one-sample well" + + +def test_pool_metrics_from_well(mlwhdb_test_session, multiplexed_run): + helper = WellWh(session=mlwhdb_test_session) + id = PacBioEntity(run_name="RUN", well_label="B1", plate_number=1).hash_product_id() + metrics = helper.get_metrics_by_well_product_id(id) + + assert metrics, "Two samples means we get a metrics response" + assert ( + int(metrics.pool_coeff_of_variance) == 47 + ), "Variance between 20 and 10 is ~47%" + + assert metrics.products[0].hifi_read_bases == 100 + assert ( + metrics.products[1].hifi_read_bases == 900 + ), "hifi read base counts are faithfully copied" + + assert ( + int(metrics.products[0].percentage_total_reads) == 33 + ), "10 of 30 reads is 33.3%" + assert ( + int(metrics.products[1].percentage_total_reads) == 66 + ), "20 of 30 reads is 66.6%" From b092e194cb56e45d0c02076e147ff46a5ae09ac0 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 14 May 2024 16:23:40 +0000 Subject: [PATCH 05/16] fixture parametrisation not quite right --- tests/fixtures/sample_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index 15ed224..c359b26 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -13,7 +13,7 @@ @pytest.fixture(scope="function", params=["AAAAAAAA", None]) -def simplex_run(mlwhdb_test_session): +def simplex_run(request, mlwhdb_test_session): """ A single sample, well, run mlwh fixture that provides both an explicit tag1 for the sample, and an implicit default tag (when the PacBio instrument is @@ -22,7 +22,7 @@ def simplex_run(mlwhdb_test_session): run_name = "RUN" well_label = "A1" plate_number = 1 - tag1 = mlwhdb_test_session.param + tag1 = request.param common_run_attribs = { "recorded_at": datetime.now(), From 4382aa72462736efd1dba00e97f0d9cfb2bec814 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Fri, 17 May 2024 13:36:50 +0000 Subject: [PATCH 06/16] An (untested) endpoint for fetching pool stats --- lang_qc/endpoints/pacbio_well.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index 27534f6..f232488 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -47,7 +47,10 @@ InvalidDictValueError, RunNotFoundError, ) -from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256, PacBioProductSHA256 +from lang_qc.util.type_checksum import ( + ChecksumSHA256, + PacBioWellSHA256, +) """ A collection of API endpoints that are specific to the PacBio sequencing @@ -186,6 +189,28 @@ def get_seq_metrics( return PacBioWellFull(db_well=mlwh_well, qc_state=qc_state) +@router.get( + "/products/{id_product}/seq_level/pool", + summary="Get sample (deplexing) metrics for a multiplexed well product by the well ID", + responses={ + status.HTTP_404_NOT_FOUND: {"description": "Product not found"}, + status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"}, + }, + response_model=QCPoolMetrics, +) +def get_product_metrics( + id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db) +) -> QCPoolMetrics: + metrics = WellWh(mlwh_session=mlwhdb_session).get_metrics_by_well_product_id( + id_product + ) + if metrics is None: + raise HTTPException( + status_code=404, detail="Well does not have any pool metrics" + ) + return metrics + + @router.post( "/products/{id_product}/qc_claim", summary="Claim the well to start QC", From 7f329ffe60f2bc0b1ce07ee79b899fa4980715ae Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Mon, 20 May 2024 15:26:13 +0000 Subject: [PATCH 07/16] Make pool fixture self-cleaning --- tests/fixtures/sample_data.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index c359b26..7891a0d 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -12,7 +12,7 @@ ) -@pytest.fixture(scope="function", params=["AAAAAAAA", None]) +@pytest.fixture(scope="module", params=["AAAAAAAA", None]) def simplex_run(request, mlwhdb_test_session): """ A single sample, well, run mlwh fixture that provides both an explicit tag1 @@ -88,6 +88,10 @@ def simplex_run(request, mlwhdb_test_session): ) mlwhdb_test_session.add(simplex_run) mlwhdb_test_session.commit() + yield simplex_run + mlwhdb_test_session.delete(simplex_run) + mlwhdb_test_session.delete(study) + mlwhdb_test_session.commit() @pytest.fixture(scope="function") @@ -198,6 +202,11 @@ def multiplexed_run(mlwhdb_test_session): mlwhdb_test_session.add_all([multiplex_run_1, multiplex_run_2]) mlwhdb_test_session.commit() + yield (multiplex_run_1, multiplex_run_2) + mlwhdb_test_session.delete(multiplex_run_1) + mlwhdb_test_session.delete(multiplex_run_2) + mlwhdb_test_session.delete(study) + mlwhdb_test_session.commit() # Some runs use "default barcodes" and the tag1 fields in pac_bio_run are empty. When this is true, we also lose the deplex stats From 3c9b9bb275a0f7785fe968e3bf7fea274675a12e Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Mon, 20 May 2024 17:00:45 +0000 Subject: [PATCH 08/16] Add metrics from mlwh to a multi-sample well, and test pool API endpoint Black rides again --- lang_qc/util/type_checksum.py | 4 +- .../mlwh_pb_runs/300-PacBioProductMetrics.yml | 40 +++++++++---------- .../endpoints/test_single_well_qc_details.py | 18 +++++++++ 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/lang_qc/util/type_checksum.py b/lang_qc/util/type_checksum.py index 15b8bdc..4a72dce 100644 --- a/lang_qc/util/type_checksum.py +++ b/lang_qc/util/type_checksum.py @@ -46,6 +46,7 @@ class PacBioWellSHA256(ChecksumSHA256): """ A checksum generated from the coordinates of a single well on a plate in a PacBio run """ + pass @@ -55,4 +56,5 @@ class PacBioProductSHA256(ChecksumSHA256): See `npg_id_generation.pac_bio.PacBioEntity`. Tags only contribute to the checksum when samples are multiplexed. """ - pass \ No newline at end of file + + pass diff --git a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml index 0b6de2e..6485990 100644 --- a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml +++ b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml @@ -255,11 +255,11 @@ id_pac_bio_tmp: 120632 last_changed: 2024-02-28 11:10:15 qc: 1 -- barcode_quality_score_mean: ~ - hifi_bases_percent: ~ - hifi_num_reads: ~ - hifi_read_bases: ~ - hifi_read_length_mean: ~ +- barcode_quality_score_mean: 97 + hifi_bases_percent: 27.49 + hifi_num_reads: 1952224 + hifi_read_bases: 21504288522 + hifi_read_length_mean: 11015 hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30023 id_pac_bio_product: 74af5a311e15af654336aea65826a2c4974842d752e25875b0303ad5a3556167 @@ -267,11 +267,11 @@ id_pac_bio_tmp: 120633 last_changed: 2024-02-28 11:10:15 qc: 1 -- barcode_quality_score_mean: ~ - hifi_bases_percent: ~ - hifi_num_reads: ~ - hifi_read_bases: ~ - hifi_read_length_mean: ~ +- barcode_quality_score_mean: 96 + hifi_bases_percent: 19.62 + hifi_num_reads: 1139885 + hifi_read_bases: 15344650012 + hifi_read_length_mean: 13461 hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30024 id_pac_bio_product: 11022006a649937c570d100ccb382dddadf9a7174ee303903c8d2b7cd7efb328 @@ -279,11 +279,11 @@ id_pac_bio_tmp: 120634 last_changed: 2024-02-28 11:10:15 qc: 1 -- barcode_quality_score_mean: ~ - hifi_bases_percent: ~ - hifi_num_reads: ~ - hifi_read_bases: ~ - hifi_read_length_mean: ~ +- barcode_quality_score_mean: 96 + hifi_bases_percent: 23.7 + hifi_num_reads: 1751410 + hifi_read_bases: 18538781061 + hifi_read_length_mean: 10585 hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30025 id_pac_bio_product: e6a2157d0fda8faae1288025e99ce5f8133f1466b752a67809668e5b9b16d5b1 @@ -291,11 +291,11 @@ id_pac_bio_tmp: 120635 last_changed: 2024-02-28 11:10:15 qc: 1 -- barcode_quality_score_mean: ~ - hifi_bases_percent: ~ - hifi_num_reads: ~ - hifi_read_bases: ~ - hifi_read_length_mean: ~ +- barcode_quality_score_mean: 97 + hifi_bases_percent: 28.72 + hifi_num_reads: 1991282 + hifi_read_bases: 22462478066 + hifi_read_length_mean: 11280 hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30026 id_pac_bio_product: 9840280d97c98ff3ddda36ac95cf3b87f5810cc3be73a64c27d6ab92cfaab0ac diff --git a/tests/endpoints/test_single_well_qc_details.py b/tests/endpoints/test_single_well_qc_details.py index 7931e42..b9b8d62 100644 --- a/tests/endpoints/test_single_well_qc_details.py +++ b/tests/endpoints/test_single_well_qc_details.py @@ -1,4 +1,5 @@ from fastapi.testclient import TestClient +from npg_id_generation.pac_bio import PacBioEntity from tests.fixtures.well_data import load_data4well_retrieval, load_dicts_and_users @@ -165,3 +166,20 @@ def test_get_well_info( assert result["plate_number"] == 2 assert result["id_product"] == id_product assert result["qc_state"] is None + + +def test_get_pool_info(test_client: TestClient, mlwhdb_load_runs): + id_product = PacBioEntity( + run_name="TRACTION-RUN-1140", well_label="D1", plate_number=1 + ).hash_product_id() + response = test_client.get(f"/pacbio/products/{id_product}/seq_level/pool") + assert response.status_code == 200 + + data = response.json() + assert int(data["pool_coeff_of_variance"]) == 23, "variance is calculated" + assert {prod["tag1_name"] for prod in data["products"]} == { + "bc2036", + "bc2040", + "bc2054", + "bc2063", + }, "Correct products present" From 7a55cc6e3426f48567a0d247de1afac4430ee9d3 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Thu, 23 May 2024 14:54:01 +0000 Subject: [PATCH 09/16] parameterised fixture triggers unique condition in DB, so make more dynamic shorten some lines for flake8 --- lang_qc/models/pacbio/qc_data.py | 2 +- lang_qc/util/type_checksum.py | 4 +- tests/fixtures/sample_data.py | 119 +++++++++++++++-------------- tests/test_pac_bio_qc_data_well.py | 6 +- 4 files changed, 70 insertions(+), 61 deletions(-) diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py index 8f64b84..259a178 100644 --- a/lang_qc/models/pacbio/qc_data.py +++ b/lang_qc/models/pacbio/qc_data.py @@ -177,7 +177,7 @@ class SampleDeplexingStats(BaseModel): class QCPoolMetrics(BaseModel): pool_coeff_of_variance: float | None = Field( title="Coefficient of variance for reads in the pool", - description="Percentage of the standard deviation w.r.t. mean, reported when the pool is larger than one", + description="Percentage of the standard deviation w.r.t. mean, when pool is more than one", ) products: list[SampleDeplexingStats] = Field( title="List of products and their metrics" diff --git a/lang_qc/util/type_checksum.py b/lang_qc/util/type_checksum.py index 4a72dce..c78b099 100644 --- a/lang_qc/util/type_checksum.py +++ b/lang_qc/util/type_checksum.py @@ -52,8 +52,8 @@ class PacBioWellSHA256(ChecksumSHA256): class PacBioProductSHA256(ChecksumSHA256): """ - A checksum generated from the combination of run, well, plate and any tags required for deplexing - See `npg_id_generation.pac_bio.PacBioEntity`. + A checksum generated from the combination of run, well, plate and any tags required for + deplexing, see `npg_id_generation.pac_bio.PacBioEntity`. Tags only contribute to the checksum when samples are multiplexed. """ diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index 7891a0d..dd26780 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -19,7 +19,8 @@ def simplex_run(request, mlwhdb_test_session): for the sample, and an implicit default tag (when the PacBio instrument is run with default barcodes) """ - run_name = "RUN" + run_name = "RUN-9999" + run_name += request.param if request.param else "" well_label = "A1" plate_number = 1 tag1 = request.param @@ -49,6 +50,23 @@ def simplex_run(request, mlwhdb_test_session): ).hash_product_id(), ) + product = PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1, + ).hash_product_id(), + qc=1, + hifi_read_bases=900, + hifi_num_reads=10, + hifi_read_length_mean=90, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=90.001, + pac_bio_run_well_metrics=well_metrics_a1, + ) + study = Study( id_lims="id", id_study_lims="1", @@ -62,28 +80,11 @@ def simplex_run(request, mlwhdb_test_session): id_pac_bio_run_lims=0, sample=Sample( id_lims="id", - id_sample_lims="1", + id_sample_lims=request.param or "1", ), study=study, plate_barcode="ABCD", - pac_bio_product_metrics=[ - PacBioProductMetrics( - id_pac_bio_product=PacBioEntity( - run_name=run_name, - well_label=well_label, - plate_number=plate_number, - tags=tag1, - ).hash_product_id(), - qc=1, - hifi_read_bases=900, - hifi_num_reads=10, - hifi_read_length_mean=90, - barcode_quality_score_mean=34, - hifi_read_quality_mean=35, - hifi_bases_percent=90.001, - pac_bio_run_well_metrics=well_metrics_a1, - ) - ], + pac_bio_product_metrics=[product], **common_run_attribs ) mlwhdb_test_session.add(simplex_run) @@ -91,6 +92,8 @@ def simplex_run(request, mlwhdb_test_session): yield simplex_run mlwhdb_test_session.delete(simplex_run) mlwhdb_test_session.delete(study) + mlwhdb_test_session.delete(product) + mlwhdb_test_session.delete(well_metrics_a1) mlwhdb_test_session.commit() @@ -101,7 +104,6 @@ def multiplexed_run(mlwhdb_test_session): run_name = "RUN" well_label = "B1" plate_number = 1 - tag1 = "AAAAAAA" common_run_attribs = { "recorded_at": datetime.now(), @@ -136,6 +138,23 @@ def multiplexed_run(mlwhdb_test_session): hifi_num_reads=30, ) + product_1 = PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1, + ).hash_product_id(), + qc=1, + hifi_read_bases=900, + hifi_num_reads=20, + hifi_read_length_mean=45, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=90.001, + pac_bio_run_well_metrics=well_metrics_b1, + ) + multiplex_run_1 = PacBioRun( pac_bio_run_name=run_name, well_label=well_label, @@ -147,27 +166,27 @@ def multiplexed_run(mlwhdb_test_session): ), study=study, plate_barcode="ABCD", - pac_bio_product_metrics=[ - PacBioProductMetrics( - id_pac_bio_product=PacBioEntity( - run_name=run_name, - well_label=well_label, - plate_number=plate_number, - tags=tag1, - ).hash_product_id(), - qc=1, - hifi_read_bases=900, - hifi_num_reads=20, - hifi_read_length_mean=45, - barcode_quality_score_mean=34, - hifi_read_quality_mean=35, - hifi_bases_percent=90.001, - pac_bio_run_well_metrics=well_metrics_b1, - ), - ], + pac_bio_product_metrics=[product_1], **common_run_attribs ) + product_2 = PacBioProductMetrics( + id_pac_bio_product=PacBioEntity( + run_name=run_name, + well_label=well_label, + plate_number=plate_number, + tags=tag1_2, + ).hash_product_id(), + qc=1, + hifi_read_bases=100, + hifi_num_reads=10, + hifi_read_length_mean=10, + barcode_quality_score_mean=34, + hifi_read_quality_mean=35, + hifi_bases_percent=100.00, + pac_bio_run_well_metrics=well_metrics_b1, + ) + multiplex_run_2 = PacBioRun( pac_bio_run_name=run_name, well_label=well_label, @@ -179,24 +198,7 @@ def multiplexed_run(mlwhdb_test_session): ), study=study, plate_barcode="ABCD", - pac_bio_product_metrics=[ - PacBioProductMetrics( - id_pac_bio_product=PacBioEntity( - run_name=run_name, - well_label=well_label, - plate_number=plate_number, - tags=tag1_2, - ).hash_product_id(), - qc=1, - hifi_read_bases=100, - hifi_num_reads=10, - hifi_read_length_mean=10, - barcode_quality_score_mean=34, - hifi_read_quality_mean=35, - hifi_bases_percent=100.00, - pac_bio_run_well_metrics=well_metrics_b1, - ) - ], + pac_bio_product_metrics=[product_2], **common_run_attribs ) @@ -206,6 +208,9 @@ def multiplexed_run(mlwhdb_test_session): mlwhdb_test_session.delete(multiplex_run_1) mlwhdb_test_session.delete(multiplex_run_2) mlwhdb_test_session.delete(study) + mlwhdb_test_session.delete(well_metrics_b1) + mlwhdb_test_session.delete(product_1) + mlwhdb_test_session.delete(product_2) mlwhdb_test_session.commit() diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py index 32a07df..bd4318a 100644 --- a/tests/test_pac_bio_qc_data_well.py +++ b/tests/test_pac_bio_qc_data_well.py @@ -103,7 +103,11 @@ def test_creating_qc_data_well(mlwhdb_test_session, mlwhdb_load_runs): def test_pool_metrics_from_single_sample_well(mlwhdb_test_session, simplex_run): helper = WellWh(session=mlwhdb_test_session) - id = PacBioEntity(run_name="RUN", well_label="A1", plate_number=1).hash_product_id() + id = PacBioEntity( + run_name=simplex_run.pac_bio_run_name, + well_label=simplex_run.well_label, + plate_number=simplex_run.plate_number, + ).hash_product_id() metrics = helper.get_metrics_by_well_product_id(id) assert metrics is None, "Got no metrics for a one-sample well" From 686481bc11762f889ae98a967bfde15adccf35f2 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Thu, 23 May 2024 15:00:13 +0000 Subject: [PATCH 10/16] Stop fixture polluting other tests in module import sort --- lang_qc/endpoints/pacbio_well.py | 7 ++----- tests/fixtures/sample_data.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index f232488..7e15724 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -37,20 +37,17 @@ from lang_qc.db.mlwh_connection import get_mlwh_db from lang_qc.db.qc_connection import get_qc_db from lang_qc.db.qc_schema import User +from lang_qc.models.pacbio.qc_data import QCPoolMetrics from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull from lang_qc.models.qc_flow_status import QcFlowStatusEnum from lang_qc.models.qc_state import QcState, QcStateBasic -from lang_qc.models.pacbio.qc_data import QCPoolMetrics from lang_qc.util.auth import check_user from lang_qc.util.errors import ( InconsistentInputError, InvalidDictValueError, RunNotFoundError, ) -from lang_qc.util.type_checksum import ( - ChecksumSHA256, - PacBioWellSHA256, -) +from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256 """ A collection of API endpoints that are specific to the PacBio sequencing diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index dd26780..8a9b64e 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -12,7 +12,7 @@ ) -@pytest.fixture(scope="module", params=["AAAAAAAA", None]) +@pytest.fixture(scope="function", params=["AAAAAAAA", None]) def simplex_run(request, mlwhdb_test_session): """ A single sample, well, run mlwh fixture that provides both an explicit tag1 From 2b6ae77d84f9d8386006d418355c740e69181b7f Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Thu, 23 May 2024 16:18:07 +0000 Subject: [PATCH 11/16] Data not needed for defunct mlwh column --- .../mlwh_pb_runs/300-PacBioProductMetrics.yml | 24 ------------------- tests/fixtures/sample_data.py | 3 --- 2 files changed, 27 deletions(-) diff --git a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml index 6485990..f46e1d1 100644 --- a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml +++ b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml @@ -176,7 +176,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30016 id_pac_bio_product: 3b37d8c1a317f229a3aae182f160f8e4f4856607fb15f1ab0588dde66640afda id_pac_bio_rw_metrics_tmp: 6206 @@ -188,7 +187,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30017 id_pac_bio_product: 2b9048414306eb7683056bd91f6ec81f0b2dbf69484b3dd2dbe39932b52bedbb id_pac_bio_rw_metrics_tmp: 6206 @@ -200,7 +198,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30018 id_pac_bio_product: f50319c97e28f2e0a67ebbc736080c4e98f23cdf6e5b7cec964349ffb13ae797 id_pac_bio_rw_metrics_tmp: 6207 @@ -212,7 +209,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30019 id_pac_bio_product: 080733cab28898fcd69d1a418c7675cba38a548c9c20ac2da48a84c5658ee6b2 id_pac_bio_rw_metrics_tmp: 6207 @@ -224,7 +220,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30020 id_pac_bio_product: 14be4b6a6bb857c0967d56c90d2b57edc1401cdb5f95379312fb8e5ca71e09fa id_pac_bio_rw_metrics_tmp: 6207 @@ -236,7 +231,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30021 id_pac_bio_product: 4153f3a64e39588bf626c4dda42e5ee74b424bba67d69bb74bb029adda2e642c id_pac_bio_rw_metrics_tmp: 6208 @@ -248,7 +242,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30022 id_pac_bio_product: fbbcd5cac5d086ce64b3a37646e261b4c784fce6755fd65d6d41f048d2267c61 id_pac_bio_rw_metrics_tmp: 6208 @@ -260,7 +253,6 @@ hifi_num_reads: 1952224 hifi_read_bases: 21504288522 hifi_read_length_mean: 11015 - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30023 id_pac_bio_product: 74af5a311e15af654336aea65826a2c4974842d752e25875b0303ad5a3556167 id_pac_bio_rw_metrics_tmp: 6209 @@ -272,7 +264,6 @@ hifi_num_reads: 1139885 hifi_read_bases: 15344650012 hifi_read_length_mean: 13461 - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30024 id_pac_bio_product: 11022006a649937c570d100ccb382dddadf9a7174ee303903c8d2b7cd7efb328 id_pac_bio_rw_metrics_tmp: 6209 @@ -284,7 +275,6 @@ hifi_num_reads: 1751410 hifi_read_bases: 18538781061 hifi_read_length_mean: 10585 - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30025 id_pac_bio_product: e6a2157d0fda8faae1288025e99ce5f8133f1466b752a67809668e5b9b16d5b1 id_pac_bio_rw_metrics_tmp: 6209 @@ -296,7 +286,6 @@ hifi_num_reads: 1991282 hifi_read_bases: 22462478066 hifi_read_length_mean: 11280 - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30026 id_pac_bio_product: 9840280d97c98ff3ddda36ac95cf3b87f5810cc3be73a64c27d6ab92cfaab0ac id_pac_bio_rw_metrics_tmp: 6209 @@ -308,7 +297,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30027 id_pac_bio_product: 81141cdff1f57c0fc0fc5f88856fa7c6d2945acc5fa6e53e7d1214d17a00c410 id_pac_bio_rw_metrics_tmp: 6210 @@ -320,7 +308,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30028 id_pac_bio_product: 4145bf889c130ecaadcd4d757d0a3ca98d68629556427a27ebc08840ffdd0e0f id_pac_bio_rw_metrics_tmp: 6210 @@ -332,7 +319,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30029 id_pac_bio_product: 5b99ad09c31afd4917da39d44fc6cc40e1915572e80c20acbfda6d6c031e74c5 id_pac_bio_rw_metrics_tmp: 6211 @@ -344,7 +330,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30030 id_pac_bio_product: 0152d7945c4f74fac3ff828012ad2c01a95574df213d7664e7989e1039727cb5 id_pac_bio_rw_metrics_tmp: 6211 @@ -356,7 +341,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30031 id_pac_bio_product: 110e4562a6d28dd96973a98fcc1464d6c82dc413296b95d0c71727d21fa2a193 id_pac_bio_rw_metrics_tmp: 6212 @@ -368,7 +352,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30032 id_pac_bio_product: af65875cfecca04ee585c67525661f57a07d7f1427aa15ca39e158c791d63aa5 id_pac_bio_rw_metrics_tmp: 6212 @@ -380,7 +363,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30033 id_pac_bio_product: c24d50afb4c048f38dca230a03fb4880912713adf7db7a3ec4d5f57ee3c4cdec id_pac_bio_rw_metrics_tmp: 6212 @@ -392,7 +374,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30034 id_pac_bio_product: baa1e87601ca9c16d95b7fda9d9346557de4aaf4adb5c15383d0f8d9366692bf id_pac_bio_rw_metrics_tmp: 6213 @@ -404,7 +385,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30035 id_pac_bio_product: f88bcfb888f075442a005368c070ba83d895b07c013c68e1cb292fce4aaa40f2 id_pac_bio_rw_metrics_tmp: 6213 @@ -416,7 +396,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30036 id_pac_bio_product: 61d2c6fc72d593949cf7b60812a0076c9af57b0fa71b394f0669e410e040458e id_pac_bio_rw_metrics_tmp: 6213 @@ -428,7 +407,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30037 id_pac_bio_product: 252c8d3dc0b4c81e6d7359b0808ba962013e7b320eb9b979da526cecf5fdd019 id_pac_bio_rw_metrics_tmp: 6213 @@ -440,7 +418,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30153 id_pac_bio_product: 2135bf0b32c6b987042e67e062647aa21ac956c1d3385627b7a1d4cd670c355f id_pac_bio_rw_metrics_tmp: 6306 @@ -452,7 +429,6 @@ hifi_num_reads: ~ hifi_read_bases: ~ hifi_read_length_mean: ~ - hifi_read_quality_mean: ~ id_pac_bio_pr_metrics_tmp: 30154 id_pac_bio_product: 790e8882c97615d79ebe27b782eefa87eede2cecda8ebd960cdd88300059f196 id_pac_bio_rw_metrics_tmp: 6307 diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index 8a9b64e..818142f 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -62,7 +62,6 @@ def simplex_run(request, mlwhdb_test_session): hifi_num_reads=10, hifi_read_length_mean=90, barcode_quality_score_mean=34, - hifi_read_quality_mean=35, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_a1, ) @@ -150,7 +149,6 @@ def multiplexed_run(mlwhdb_test_session): hifi_num_reads=20, hifi_read_length_mean=45, barcode_quality_score_mean=34, - hifi_read_quality_mean=35, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_b1, ) @@ -182,7 +180,6 @@ def multiplexed_run(mlwhdb_test_session): hifi_num_reads=10, hifi_read_length_mean=10, barcode_quality_score_mean=34, - hifi_read_quality_mean=35, hifi_bases_percent=100.00, pac_bio_run_well_metrics=well_metrics_b1, ) From 3df42faf70a9a549a8a426d90a83b5640145c7aa Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 11 Jun 2024 13:52:02 +0000 Subject: [PATCH 12/16] Update mlwh model to include new barcode4deplexing column --- lang_qc/db/mlwh_schema.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py index 395916f..d445120 100644 --- a/lang_qc/db/mlwh_schema.py +++ b/lang_qc/db/mlwh_schema.py @@ -609,6 +609,11 @@ class PacBioProductMetrics(Base): hifi_read_length_mean = Column( mysqlINTEGER(unsigned=True), nullable=True, comment="The mean HiFi read length" ) + barcode4deplexing = Column( + mysqlVARCHAR(62), + nullable=True, + comment="The barcode recorded in producing deplexed metrics for this product", + ) barcode_quality_score_mean = Column( mysqlSMALLINT(unsigned=True), nullable=True, From 11be2e055ef3894a9dcfa978deb5b50e09d5a51a Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Tue, 11 Jun 2024 14:04:13 +0000 Subject: [PATCH 13/16] Supplement fixture with barcode IDs --- .../mlwh_pb_runs/300-PacBioProductMetrics.yml | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml index f46e1d1..c02bb36 100644 --- a/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml +++ b/tests/data/mlwh_pb_runs/300-PacBioProductMetrics.yml @@ -172,6 +172,7 @@ id_pac_bio_rw_metrics_tmp: 1735 id_pac_bio_tmp: 99008 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2020--bc2020 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -183,6 +184,7 @@ last_changed: 2024-02-28 14:10:14 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2011--bc2011 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -194,6 +196,7 @@ last_changed: 2024-02-28 14:10:14 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc1011_BAK8A_OA--bc1011_BAK8A_OA hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -205,6 +208,7 @@ last_changed: 2024-03-05 15:10:36 qc: 0 - barcode_quality_score_mean: ~ + barcode4deplexing: bc1022_BAK8B_OA--bc1022_BAK8B_OA hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -216,6 +220,7 @@ last_changed: 2024-03-05 15:10:36 qc: 0 - barcode_quality_score_mean: ~ + barcode4deplexing: bc1001_BAK8A_OA--bc1001_BAK8A_OA hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -227,6 +232,7 @@ last_changed: 2024-03-05 15:10:36 qc: 0 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2035--bc2035 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -238,6 +244,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2052--bc2052 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -249,6 +256,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: 97 + barcode4deplexing: bc2036--bc2036 hifi_bases_percent: 27.49 hifi_num_reads: 1952224 hifi_read_bases: 21504288522 @@ -260,6 +268,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: 96 + barcode4deplexing: bc2040--bc2040 hifi_bases_percent: 19.62 hifi_num_reads: 1139885 hifi_read_bases: 15344650012 @@ -271,6 +280,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: 96 + barcode4deplexing: bc2054--bc2054 hifi_bases_percent: 23.7 hifi_num_reads: 1751410 hifi_read_bases: 18538781061 @@ -282,6 +292,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: 97 + barcode4deplexing: bc2063--bc2063 hifi_bases_percent: 28.72 hifi_num_reads: 1991282 hifi_read_bases: 22462478066 @@ -293,6 +304,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2016--bc2016 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -304,6 +316,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2096--bc2096 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -315,6 +328,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2056--bc2056 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -326,6 +340,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2072--bc2072 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -337,6 +352,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2021--bc2021 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -348,6 +364,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2011--bc2011 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -359,6 +376,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2015--bc2015 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -370,6 +388,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2083--bc2083 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -381,6 +400,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2084--bc2084 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -392,6 +412,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2085--bc2085 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -403,6 +424,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2094--bc2094 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -414,6 +436,7 @@ last_changed: 2024-02-28 11:10:15 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2070--bc2070 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ @@ -425,6 +448,7 @@ last_changed: 2024-03-08 12:10:14 qc: 1 - barcode_quality_score_mean: ~ + barcode4deplexing: bc2055--bc2055 hifi_bases_percent: ~ hifi_num_reads: ~ hifi_read_bases: ~ From 6e5472a0d5bcf7b8521600b6b99e7b2f04aaa737 Mon Sep 17 00:00:00 2001 From: Kieron Taylor Date: Wed, 12 Jun 2024 12:51:48 +0000 Subject: [PATCH 14/16] Add deplexing barcodes and modes to test data. Check deplexing mode to determine whether to run stats or not. --- lang_qc/db/helper/wells.py | 5 ++--- lang_qc/models/pacbio/qc_data.py | 1 + tests/fixtures/sample_data.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 48a3042..69fac7e 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -84,10 +84,8 @@ def get_metrics_by_well_product_id( self, id_product: PacBioWellSHA256 ) -> QCPoolMetrics | None: well = self.get_mlwh_well_by_product_id(id_product) - if well: + if well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode: product_metrics = well.pac_bio_product_metrics - if len(product_metrics) == 1: - return None cov: float | None if any(p.hifi_num_reads is None for p in product_metrics): @@ -103,6 +101,7 @@ def get_metrics_by_well_product_id( id_product=prod.id_pac_bio_product, tag1_name=prod.pac_bio_run.tag_identifier, tag2_name=prod.pac_bio_run.tag2_identifier, + deplexing_barcode=prod.barcode4deplexing, hifi_read_bases=prod.hifi_read_bases, hifi_num_reads=prod.hifi_num_reads, hifi_read_length_mean=prod.hifi_read_length_mean, diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py index 259a178..fb9a874 100644 --- a/lang_qc/models/pacbio/qc_data.py +++ b/lang_qc/models/pacbio/qc_data.py @@ -167,6 +167,7 @@ class SampleDeplexingStats(BaseModel): id_product: PacBioProductSHA256 tag1_name: str | None tag2_name: str | None + deplexing_barcode: str | None hifi_read_bases: int | None hifi_num_reads: int | None hifi_read_length_mean: float | None diff --git a/tests/fixtures/sample_data.py b/tests/fixtures/sample_data.py index 818142f..e86fbb5 100644 --- a/tests/fixtures/sample_data.py +++ b/tests/fixtures/sample_data.py @@ -48,6 +48,7 @@ def simplex_run(request, mlwhdb_test_session): well_label=well_label, plate_number=plate_number, ).hash_product_id(), + demultiplex_mode=None, ) product = PacBioProductMetrics( @@ -64,6 +65,7 @@ def simplex_run(request, mlwhdb_test_session): barcode_quality_score_mean=34, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_a1, + barcode4deplexing=None, ) study = Study( @@ -135,6 +137,7 @@ def multiplexed_run(mlwhdb_test_session): plate_number=plate_number, ).hash_product_id(), hifi_num_reads=30, + demultiplex_mode="OnInstrument", ) product_1 = PacBioProductMetrics( @@ -151,6 +154,7 @@ def multiplexed_run(mlwhdb_test_session): barcode_quality_score_mean=34, hifi_bases_percent=90.001, pac_bio_run_well_metrics=well_metrics_b1, + barcode4deplexing="bc10--bc10", ) multiplex_run_1 = PacBioRun( @@ -182,6 +186,7 @@ def multiplexed_run(mlwhdb_test_session): barcode_quality_score_mean=34, hifi_bases_percent=100.00, pac_bio_run_well_metrics=well_metrics_b1, + barcode4deplexing="bc11--bc11", ) multiplex_run_2 = PacBioRun( From 2f9be8aec83686f16c8bbfa7ebd1c2f4d29a08cd Mon Sep 17 00:00:00 2001 From: mgcam Date: Wed, 12 Jun 2024 16:59:42 +0100 Subject: [PATCH 15/16] Added a check for unlinked data. --- lang_qc/db/helper/wells.py | 27 ++++++++++++++++++--------- tests/test_pac_bio_qc_data_well.py | 13 +++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 69fac7e..57d7b01 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -85,7 +85,15 @@ def get_metrics_by_well_product_id( ) -> QCPoolMetrics | None: well = self.get_mlwh_well_by_product_id(id_product) if well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode: + product_metrics = well.pac_bio_product_metrics + lib_lims_data = [ + row + for row in map(lambda product: product.pac_bio_run, product_metrics) + if row is not None + ] + if len(lib_lims_data) != len(product_metrics): + raise Exception("Partially linked LIMS data or no linked LIMS data") cov: float | None if any(p.hifi_num_reads is None for p in product_metrics): @@ -94,13 +102,13 @@ def get_metrics_by_well_product_id( hifi_reads = [prod.hifi_num_reads for prod in product_metrics] cov = stdev(hifi_reads) / mean(hifi_reads) * 100 - return QCPoolMetrics( - pool_coeff_of_variance=cov, - products=[ + sample_stats = [] + for (i, prod) in enumerate(product_metrics): + sample_stats.append( SampleDeplexingStats( id_product=prod.id_pac_bio_product, - tag1_name=prod.pac_bio_run.tag_identifier, - tag2_name=prod.pac_bio_run.tag2_identifier, + tag1_name=lib_lims_data[i].tag_identifier, + tag2_name=lib_lims_data[i].tag2_identifier, deplexing_barcode=prod.barcode4deplexing, hifi_read_bases=prod.hifi_read_bases, hifi_num_reads=prod.hifi_num_reads, @@ -108,13 +116,14 @@ def get_metrics_by_well_product_id( hifi_bases_percent=prod.hifi_bases_percent, percentage_total_reads=( prod.hifi_num_reads / well.hifi_num_reads * 100 - if well.hifi_num_reads + if (well.hifi_num_reads and prod.hifi_num_reads) else None ), ) - for prod in product_metrics - ], - ) + ) + + return QCPoolMetrics(pool_coeff_of_variance=cov, products=sample_stats) + return None def recent_completed_wells(self) -> List[PacBioRunWellMetrics]: diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py index bd4318a..3be9de9 100644 --- a/tests/test_pac_bio_qc_data_well.py +++ b/tests/test_pac_bio_qc_data_well.py @@ -1,3 +1,4 @@ +import pytest from npg_id_generation.pac_bio import PacBioEntity from lang_qc.db.helper.wells import WellWh @@ -134,3 +135,15 @@ def test_pool_metrics_from_well(mlwhdb_test_session, multiplexed_run): assert ( int(metrics.products[1].percentage_total_reads) == 66 ), "20 of 30 reads is 66.6%" + + +def test_pool_metrics_from_well(mlwhdb_test_session): + + id = PacBioEntity( + run_name="TRACTION-RUN-1140", well_label="C1", plate_number=2 + ).hash_product_id() + helper = WellWh(session=mlwhdb_test_session) + with pytest.raises( + Exception, match=r"Partially linked LIMS data or no linked LIMS data" + ): + helper.get_metrics_by_well_product_id(id) From 372fc567c64a4f70806079291b8e48d0390afa60 Mon Sep 17 00:00:00 2001 From: mgcam Date: Thu, 13 Jun 2024 12:09:32 +0100 Subject: [PATCH 16/16] Simplified getting linked lims data. --- lang_qc/db/helper/wells.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py index 57d7b01..4c8f089 100644 --- a/lang_qc/db/helper/wells.py +++ b/lang_qc/db/helper/wells.py @@ -88,9 +88,9 @@ def get_metrics_by_well_product_id( product_metrics = well.pac_bio_product_metrics lib_lims_data = [ - row - for row in map(lambda product: product.pac_bio_run, product_metrics) - if row is not None + product.pac_bio_run + for product in product_metrics + if product.pac_bio_run is not None ] if len(lib_lims_data) != len(product_metrics): raise Exception("Partially linked LIMS data or no linked LIMS data")