Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Back end support for pool metrics #224

Merged
merged 18 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 51 additions & 2 deletions lang_qc/db/helper/wells.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Marina Gourtovaia <[email protected]>
Expand All @@ -21,6 +21,7 @@

import logging
from datetime import date, datetime, timedelta
from statistics import mean, stdev
from typing import ClassVar, List

from pydantic import BaseModel, ConfigDict, Field
Expand All @@ -33,11 +34,13 @@
)
from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
from lang_qc.db.qc_schema import QcState, QcStateDict, QcType
from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats
from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary
from lang_qc.models.pager import PagedResponse
from lang_qc.models.qc_flow_status import QcFlowStatusEnum
from lang_qc.models.qc_state import QcState as QcStateModel
from lang_qc.util.errors import EmptyListOfRunNamesError, RunNotFoundError
from lang_qc.util.type_checksum import PacBioWellSHA256

"""
This package is using an undocumented feature of Pydantic, type
Expand All @@ -64,7 +67,7 @@ class WellWh(BaseModel):
# The TestClient seems to be keeping these instances alive and changing them.

def get_mlwh_well_by_product_id(
self, id_product: str
self, id_product: PacBioWellSHA256
) -> PacBioRunWellMetrics | None:
"""
Returns a well row record from the well metrics table or
Expand All @@ -77,6 +80,52 @@ def get_mlwh_well_by_product_id(
)
).scalar_one_or_none()

def get_metrics_by_well_product_id(
    self, id_product: PacBioWellSHA256
) -> QCPoolMetrics | None:
    """
    Returns deplexing metrics for the samples pooled in a well or `None`
    if pool metrics are not available for this well.

    `None` is returned when no well row exists for `id_product` or when
    the `demultiplex_mode` of the well is unset or does not contain the
    string "Instrument".

    The pool coefficient of variance is the sample standard deviation of
    the per-product HiFi read counts expressed as a percentage of their
    mean. It is left undefined (`None`) when any of the read counts is
    missing, when the pool has fewer than two products, or when the mean
    read count is zero, since the statistic cannot be computed in these
    cases.

    Raises an `Exception` if at least one of the product rows has no
    linked LIMS record.
    """
    # TODO(review): a reviewer suggested moving this logic to a class method
    # of the metrics model, since other QC metrics in
    # lang_qc/models/pacbio/qc_data.py populate themselves that way.

    well = self.get_mlwh_well_by_product_id(id_product)
    if not (
        well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode
    ):
        return None

    product_metrics = well.pac_bio_product_metrics
    lib_lims_data = [
        product.pac_bio_run
        for product in product_metrics
        if product.pac_bio_run is not None
    ]
    if len(lib_lims_data) != len(product_metrics):
        raise Exception("Partially linked LIMS data or no linked LIMS data")

    hifi_reads = [prod.hifi_num_reads for prod in product_metrics]
    cov: float | None = None
    # stdev() needs at least two data points and the ratio needs a non-zero
    # mean; otherwise the coefficient stays undefined rather than raising.
    if len(hifi_reads) > 1 and all(count is not None for count in hifi_reads):
        mean_reads = mean(hifi_reads)
        if mean_reads:
            cov = stdev(hifi_reads) / mean_reads * 100

    # The length check above guarantees a one-to-one, same-order pairing
    # of product rows and their LIMS records.
    sample_stats = [
        SampleDeplexingStats(
            id_product=prod.id_pac_bio_product,
            tag1_name=lims.tag_identifier,
            tag2_name=lims.tag2_identifier,
            deplexing_barcode=prod.barcode4deplexing,
            hifi_read_bases=prod.hifi_read_bases,
            hifi_num_reads=prod.hifi_num_reads,
            hifi_read_length_mean=prod.hifi_read_length_mean,
            hifi_bases_percent=prod.hifi_bases_percent,
            percentage_total_reads=(
                prod.hifi_num_reads / well.hifi_num_reads * 100
                if (well.hifi_num_reads and prod.hifi_num_reads)
                else None
            ),
        )
        for prod, lims in zip(product_metrics, lib_lims_data)
    ]

    return QCPoolMetrics(pool_coeff_of_variance=cov, products=sample_stats)

def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
"""
Get recent not QC-ed completed wells from the mlwh database.
Expand Down
5 changes: 5 additions & 0 deletions lang_qc/db/mlwh_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,11 @@ class PacBioProductMetrics(Base):
hifi_read_length_mean = Column(
mysqlINTEGER(unsigned=True), nullable=True, comment="The mean HiFi read length"
)
barcode4deplexing = Column(
mysqlVARCHAR(62),
nullable=True,
comment="The barcode recorded in producing deplexed metrics for this product",
)
barcode_quality_score_mean = Column(
mysqlSMALLINT(unsigned=True),
nullable=True,
Expand Down
31 changes: 27 additions & 4 deletions lang_qc/endpoints/pacbio_well.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Adam Blanchet
Expand Down Expand Up @@ -37,6 +37,7 @@
from lang_qc.db.mlwh_connection import get_mlwh_db
from lang_qc.db.qc_connection import get_qc_db
from lang_qc.db.qc_schema import User
from lang_qc.models.pacbio.qc_data import QCPoolMetrics
from lang_qc.models.pacbio.well import (
PacBioPagedWells,
PacBioWellFull,
Expand All @@ -51,7 +52,7 @@
MissingLimsDataError,
RunNotFoundError,
)
from lang_qc.util.type_checksum import ChecksumSHA256
from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256

"""
A collection of API endpoints that are specific to the PacBio sequencing
Expand Down Expand Up @@ -204,7 +205,7 @@ def get_well_lims_info(
response_model=PacBioWellFull,
)
def get_seq_metrics(
id_product: ChecksumSHA256,
id_product: PacBioWellSHA256,
mlwhdb_session: Session = Depends(get_mlwh_db),
qcdb_session: Session = Depends(get_qc_db),
) -> PacBioWellFull:
Expand All @@ -216,6 +217,28 @@ def get_seq_metrics(
return PacBioWellFull(db_well=mlwh_well, qc_state=qc_state)


@router.get(
    "/products/{id_product}/seq_level/pool",
    summary="Get sample (deplexing) metrics for a multiplexed well product by the well ID",
    responses={
        status.HTTP_404_NOT_FOUND: {"description": "Product not found"},
        status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"},
    },
    response_model=QCPoolMetrics,
)
def get_product_metrics(
    id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db)
) -> QCPoolMetrics:
    # Looks up pool (deplexing) metrics for the well via the WellWh helper.
    # The helper returns None when it has no pool metrics for the well;
    # this is reported to the client as a 404 response.
    well_helper = WellWh(mlwh_session=mlwhdb_session)
    pool_metrics = well_helper.get_metrics_by_well_product_id(id_product)
    if pool_metrics is not None:
        return pool_metrics

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="Well does not have any pool metrics",
    )


@router.post(
"/products/{id_product}/qc_claim",
summary="Claim the well to start QC",
Expand All @@ -241,7 +264,7 @@ def get_seq_metrics(
status_code=status.HTTP_201_CREATED,
)
def claim_qc(
id_product: ChecksumSHA256,
id_product: PacBioWellSHA256,
user: User = Depends(check_user),
qcdb_session: Session = Depends(get_qc_db),
mlwhdb_session: Session = Depends(get_mlwh_db),
Expand Down
32 changes: 31 additions & 1 deletion lang_qc/models/pacbio/qc_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Marina Gourtovaia <[email protected]>
Expand All @@ -23,6 +23,7 @@
from pydantic import BaseModel, ConfigDict, Field

from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
from lang_qc.util.type_checksum import PacBioProductSHA256


# Pydantic prohibits us from defining these as @classmethod or @staticmethod
Expand Down Expand Up @@ -153,3 +154,32 @@ def from_orm(cls, obj: PacBioRunWellMetrics):
qc_data[name]["value"] = getattr(obj, name, None)

return cls.model_validate(qc_data)


class SampleDeplexingStats(BaseModel):
    """
    A representation of metrics for one product, some direct from the DB and others inferred

    For a long time tag2_name was null and tag1_name was silently used at both ends of the sequence.
    As a result tag2_name will be None for most data in or before 2024.
    """

    # Identifier of the product (one sample of the pool) the metrics belong to.
    id_product: PacBioProductSHA256
    # Tag identifiers taken from the LIMS record linked to the product.
    tag1_name: str | None
    tag2_name: str | None
    # The barcode recorded in producing deplexed metrics for this product.
    deplexing_barcode: str | None
    # HiFi metrics copied from the product metrics row.
    hifi_read_bases: int | None
    hifi_num_reads: int | None
    hifi_read_length_mean: float | None
    hifi_bases_percent: float | None
    # Inferred value: the product's HiFi read count as a percentage of the
    # well's HiFi read count; None when either count is missing or zero.
    percentage_total_reads: float | None


# Pool-level deplexing metrics for a multiplexed well: one summary statistic
# for the whole pool plus the per-product (per-sample) metrics.
class QCPoolMetrics(BaseModel):
    # None when the statistic is not available for the pool.
    pool_coeff_of_variance: float | None = Field(
        title="Coefficient of variance for reads in the pool",
        description="Percentage of the standard deviation w.r.t. mean, when pool is more than one",
    )
    # One entry per product of the pool.
    products: list[SampleDeplexingStats] = Field(
        title="List of products and their metrics"
    )
18 changes: 18 additions & 0 deletions lang_qc/util/type_checksum.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,21 @@ def validate(cls, v, _):

def __repr__(self):
return f"ChecksumSHA256({super().__repr__()})"


class PacBioWellSHA256(ChecksumSHA256):
    """
    A SHA256 checksum that identifies a single well on a plate in a PacBio run;
    it is generated from the coordinates of that well.
    """


class PacBioProductSHA256(ChecksumSHA256):
    """
    A SHA256 checksum that identifies a PacBio product. It is generated from
    the combination of run, well, plate and any tags required for deplexing,
    see `npg_id_generation.pac_bio.PacBioEntity`.
    Tags contribute to the checksum only when samples are multiplexed.
    """
Loading