Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Well libs #229

Merged
merged 5 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 33 additions & 3 deletions lang_qc/db/mlwh_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,30 @@
from sqlalchemy.dialects.mysql import SMALLINT as mysqlSMALLINT
from sqlalchemy.dialects.mysql import TINYINT as mysqlTINYINT
from sqlalchemy.dialects.mysql import VARCHAR as mysqlVARCHAR
from sqlalchemy.orm import declarative_base, relationship
from sqlalchemy.orm import DeclarativeBase, relationship

Base = declarative_base()

class Base(DeclarativeBase):
    """
    A base class for declarative class definitions for the ml warehouse database.
    """

    def _get_row_description(self, fields: list[str]) -> str:
        """
        Returns a printable representation of the database table row. Interprets
        a list of strings given as the `fields` argument as a list of column
        names. Combines the name of the class, names of the given columns
        and respective values into a row description. The columns for which
        the row has a NULL value are omitted from the description.

        Args:
            fields: attribute (column) names to include, reported in the
                order given.

        Returns:
            A string of the form `<module>.<class>: name1=value1, name2=value2`.

        Raises:
            AttributeError: if a name in `fields` is not an attribute of
                this object.
        """

        # Use the getattr builtin for attribute lookup rather than invoking
        # the __getattribute__ dunder directly.
        pairs = [
            f"{name}={value}"
            for name in fields
            if (value := getattr(self, name)) is not None
        ]
        description = ", ".join(pairs)
        return f"{self.__module__}.{self.__class__.__name__}: {description}"


class Sample(Base):
Expand Down Expand Up @@ -538,7 +559,16 @@ class PacBioRunWellMetrics(Base):
"PacBioProductMetrics", back_populates="pac_bio_run_well_metrics"
)

def get_experiment_info(self):
"""Custom or customised methods are added below"""

def __repr__(self):
    """
    Returns a printable representation of the database row.

    The description is built by `_get_row_description` from the run name,
    well label, plate number and product ID attributes of this row.
    """

    reported_columns = [
        "pac_bio_run_name",
        "well_label",
        "plate_number",
        "id_pac_bio_product",
    ]
    return self._get_row_description(reported_columns)

def get_experiment_info(self) -> list[PacBioRun]:
"""Returns a list of PacBioRun mlwh database rows.

Returns LIMS information about the PacBio experiment
Expand Down
33 changes: 32 additions & 1 deletion lang_qc/endpoints/pacbio_well.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,18 @@
from lang_qc.db.mlwh_connection import get_mlwh_db
from lang_qc.db.qc_connection import get_qc_db
from lang_qc.db.qc_schema import User
from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull
from lang_qc.models.pacbio.well import (
PacBioPagedWells,
PacBioWellFull,
PacBioWellLibraries,
)
from lang_qc.models.qc_flow_status import QcFlowStatusEnum
from lang_qc.models.qc_state import QcState, QcStateBasic
from lang_qc.util.auth import check_user
from lang_qc.util.errors import (
InconsistentInputError,
InvalidDictValueError,
MissingLimsDataError,
RunNotFoundError,
)
from lang_qc.util.type_checksum import ChecksumSHA256
Expand Down Expand Up @@ -163,6 +168,32 @@ def get_wells_in_run(
return response


@router.get(
    "/wells/{id_product}/libraries",
    summary="Get well summary and LIMS data for all libraries",
    responses={
        status.HTTP_404_NOT_FOUND: {"description": "Well product does not exist"},
        status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"},
        status.HTTP_409_CONFLICT: {"description": "Missing or incomplete LIMS data"},
    },
    response_model=PacBioWellLibraries,
)
def get_well_lims_info(
    id_product: ChecksumSHA256,
    mlwhdb_session: Session = Depends(get_mlwh_db),
) -> PacBioWellLibraries:

    # Looks up the well for the given product ID and returns a
    # PacBioWellLibraries model built from that database row.
    #
    # Documented with comments rather than a docstring on purpose: FastAPI
    # publishes an endpoint's docstring as its OpenAPI description.
    #
    # NOTE(review): _find_well_product_or_error is defined elsewhere in this
    # module; presumably it raises the 404 listed in the responses table when
    # the product is unknown - confirm.
    db_well = _find_well_product_or_error(id_product, mlwhdb_session)
    try:
        return PacBioWellLibraries(db_well=db_well)
    except MissingLimsDataError as err:
        # 409 - Request conflicts with the current state of the server.
        # The named constant matches the responses table above; `from err`
        # keeps the original error as the explicit cause.
        raise HTTPException(status.HTTP_409_CONFLICT, detail=str(err)) from err


@router.get(
"/products/{id_product}/seq_level",
summary="Get full sequencing QC metrics and state for a product",
Expand Down
160 changes: 121 additions & 39 deletions lang_qc/models/pacbio/experiment.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 Genome Research Ltd.
# Copyright (c) 2023, 2024 Genome Research Ltd.
#
# Authors:
# Marina Gourtovaia <[email protected]>
Expand All @@ -19,14 +19,96 @@
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

from typing import List
from typing import Any

from pydantic import BaseModel, ConfigDict, Field
from pydantic import Field, model_validator
from pydantic.dataclasses import dataclass

from lang_qc.db.mlwh_schema import PacBioRun


class PacBioExperiment(BaseModel):
@dataclass(kw_only=True, frozen=True)
class PacBioLibrary:
nerdstrike marked this conversation as resolved.
Show resolved Hide resolved
"""
This model represents LIMS data associated with a PacBio library.

The fields of the model can be assigned directly via the constructor.
However, if the `db_library` field, a single row of the PacBioRun table
class, is set via the constructor, the rest of the fields are populated
using this database row object, while any other information passed to the
constructor is disregarded.

The `db_library` field is not present in the model instance that is
returned by the constructor.
"""

db_library: PacBioRun = Field(init_var=True)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Spot check - a "library" is the prep of one sample, therefore 1:1 with a row in pac_bio_run? As distinct from pool, which is a pool of libraries?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct. A pool might contain one library as long as the library is indexed. The term pool is only used with indexed libraries.


study_id: str = Field(
title="LIMS-specific study identifier",
)
study_name: str = Field(
title="Study name",
)
sample_id: str = Field(
title="LIMS-specific Sample identifier",
)
sample_name: str = Field(
title="Sample name",
)
tag_sequence: list = Field(
title="Tag sequence",
description="""
Tag sequences as a list. An empty list for a non-indexed library.
""",
)
library_type: str | None = Field(
default=None,
title="Library type",
)
pool_name: str | None = Field(
default=None,
title="Pool name",
description="""
The pac_bio_library_tube_barcode from TRACTION, AKA pool name
""",
)

@model_validator(mode="before")
def pre_root(cls, values: Any) -> dict[str, Any]:
    """
    Populates the fields of this object with information available
    in the LIMS system.

    If `db_library` is not among the keyword arguments given to the
    constructor, the keyword arguments are returned unchanged and are
    validated as ordinary field values. If `db_library` is given, the
    returned field values are taken from that database row only; any other
    keyword arguments are disregarded.

    Args:
        values: the raw constructor input; an object exposing the keyword
            arguments as its `kwargs` attribute (see the link below).

    Returns:
        A dictionary of field names and values to validate.

    Raises:
        ValueError: if `db_library` is explicitly set to None.
    """

    # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi
    # `kwargs` is declared there as `dict[str, Any] | None`. Fall back to an
    # empty dictionary so that a constructor call without keyword arguments
    # is reported as missing fields rather than failing with a TypeError on
    # the membership test below.
    kwargs = values.kwargs or {}
    if "db_library" not in kwargs:
        return kwargs
    db_row: PacBioRun = kwargs["db_library"]
    if db_row is None:
        raise ValueError("None db_library value is not allowed.")

    study = db_row.study
    sample = db_row.sample
    return {
        "study_name": study.name,
        "study_id": study.id_study_lims,
        "sample_name": sample.name,
        "sample_id": sample.id_sample_lims,
        "library_type": db_row.pipeline_id_lims,
        "pool_name": db_row.pac_bio_library_tube_barcode,
        # Empty or NULL tags are left out; a non-indexed library gets an
        # empty list.
        "tag_sequence": [
            tag for tag in (db_row.tag_sequence, db_row.tag2_sequence) if tag
        ],
    }


@dataclass(kw_only=True, frozen=True)
class PacBioExperiment:
"""
A response model that contains laboratory tracking information
about the PacBio wells and samples prior to the start of the
Expand All @@ -43,28 +125,30 @@ class PacBioExperiment(BaseModel):
(library).
"""

db_libraries: list[PacBioRun] = Field(init_var=True)

study_id: list = Field(
title="Study identifier",
description="""
Study identifiers as a sorted list of unique strings (to cover
an unlikely case of multiple studies).
""",
)
study_name: str = Field(
study_name: str | None = Field(
default=None,
title="Study name",
description="""
Study name, is not set in case of multiple studies.
""",
)
sample_id: str = Field(
sample_id: str | None = Field(
default=None,
title="Sample identifier",
description="""
Sample identifier, is not set in case of multiple samples.
""",
)
sample_name: str = Field(
sample_name: str | None = Field(
default=None,
title="Sample name",
description="""
Expand Down Expand Up @@ -94,59 +178,57 @@ class PacBioExperiment(BaseModel):
unlikely case of multiple library types.
""",
)
pool_name: str = Field(
pool_name: str | None = Field(
default=None,
title="Pool name",
description="""
The pac_bio_library_tube_barcode from TRACTION, AKA pool name
""",
)
model_config = ConfigDict(from_attributes=True, extra="forbid")

@classmethod
def from_orm(cls, lims_db_rows: List[PacBioRun]):
@model_validator(mode="before")
def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]:
"""
A factory method, creates an instance of the PacBioLimsData class.
Should be given a non-empty list of PacBioRun table row objects as
an argument.
Populates the fields of this object with information available
in the LIMS system.
Errors if the `db_libraries` attribute is not set via the constructor.
"""

lims_db_rows: list[PacBioRun] = values.kwargs["db_libraries"]
num_samples = len(lims_db_rows)
if num_samples == 0:
raise Exception("Cannot create PacBioLimsData object, no data.")
if any(row is None for row in lims_db_rows):
raise Exception("Cannot create PacBioLimsData object, None row.")
raise ValueError("Empty db_libraries list is not allowed.")

lib_objects = [PacBioLibrary(db_library=row) for row in lims_db_rows]

# Using sets for some data instead of lists because we do not
# want repetitions.
lims_data = {
"num_samples": num_samples,
"study_id": set(),
"library_type": set(),
"tag_sequence": [],
}
study_name = None
for row in lims_db_rows:
lims_data["study_id"].add(row.study.id_study_lims)
lims_data["library_type"].add(row.pipeline_id_lims)
study_name = row.study.name
if pool_name := row.pac_bio_library_tube_barcode:
lims_data["pool_name"] = pool_name
if num_samples == 1:
if tag := row.tag_sequence:
lims_data["tag_sequence"].append(tag)
if tag := row.tag2_sequence:
lims_data["tag_sequence"].append(tag)
lims_data["sample_id"] = row.sample.id_sample_lims
lims_data["sample_name"] = row.sample.name
lims_data["study_name"] = row.study.name

lims_data["study_id"] = {o.study_id for o in lib_objects} # returns a set
lims_data["library_type"] = {
o.library_type if o.library_type is not None else "UNKNOWN"
for o in lib_objects
}

pool_names = {o.pool_name for o in lib_objects}
if len(pool_names) > 1:
raise ValueError("Multiple pool names.")
lims_data["pool_name"] = pool_names.pop()

o = lib_objects[0]
if num_samples == 1:
lims_data["tag_sequence"] = o.tag_sequence
lims_data["sample_id"] = o.sample_id
lims_data["sample_name"] = o.sample_name
lims_data["study_name"] = o.study_name
if len(lims_data["study_id"]) == 1:
lims_data["study_name"] = study_name
lims_data["study_name"] = o.study_name

# Convert sets back to lists and sort so that the list items are
# Convert sets back to lists and sort so that the items are
# in a predictable order.
for key in ("library_type", "study_id"):
lims_data[key] = sorted(lims_data[key])

return cls.model_validate(lims_data)
return lims_data
Loading