-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Well libs #229
Well libs #229
Changes from 4 commits
5c77509
fbff9a7
ed54cab
00df2ed
fd3e9ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Copyright (c) 2023 Genome Research Ltd. | ||
# Copyright (c) 2023, 2024 Genome Research Ltd. | ||
# | ||
# Authors: | ||
# Marina Gourtovaia <[email protected]> | ||
|
@@ -19,14 +19,96 @@ | |
# You should have received a copy of the GNU General Public License along with | ||
# this program. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
from typing import List | ||
from typing import Any | ||
|
||
from pydantic import BaseModel, ConfigDict, Field | ||
from pydantic import Field, model_validator | ||
from pydantic.dataclasses import dataclass | ||
|
||
from lang_qc.db.mlwh_schema import PacBioRun | ||
|
||
|
||
class PacBioExperiment(BaseModel): | ||
@dataclass(kw_only=True, frozen=True) | ||
class PacBioLibrary: | ||
nerdstrike marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
This model represents LIMS data associated with a PacBio library. | ||
|
||
The fields of the model can be assigned directly via the constructor. | ||
However, if the `db_library` field, a single row of the PacBioRun table | ||
class, is set via the constructor, the rest of the fields are populated | ||
using this database row object, while any other information passed to the | ||
constructor is disregarded. | ||
|
||
The `db_library` field is not present in the model instance that is | ||
returned by the constructor. | ||
""" | ||
|
||
db_library: PacBioRun = Field(init_var=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Spot check - a "library" is the prep of one sample, therefore 1:1 with a row in There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Correct. A pool might contain one library as long as the library is indexed. The term |
||
|
||
study_id: str = Field( | ||
title="LIMS-specific study identifier", | ||
) | ||
study_name: str = Field( | ||
title="Study name", | ||
) | ||
sample_id: str = Field( | ||
title="LIMS-specific Sample identifier", | ||
) | ||
sample_name: str = Field( | ||
title="Sample name", | ||
) | ||
tag_sequence: list = Field( | ||
title="Tag sequence", | ||
description=""" | ||
Tag sequences as a list. An empty list for a non-indexed library. | ||
""", | ||
) | ||
library_type: str | None = Field( | ||
default=None, | ||
title="Library type", | ||
) | ||
pool_name: str | None = Field( | ||
default=None, | ||
title="Pool name", | ||
description=""" | ||
The pac_bio_library_tube_barcode from TRACTION, AKA pool name | ||
""", | ||
) | ||
|
||
@model_validator(mode="before")
@classmethod
def pre_root(cls, values: Any) -> Any:
    """
    Populates the fields of this object with information available
    in the LIMS system.

    `values` is the raw constructor input. It is expected to expose the
    keyword arguments via a `kwargs` attribute (see the pydantic-core
    link below), so it is not a plain dictionary.

    If `db_library` is not among the keyword arguments, the keyword
    arguments are returned unchanged so that the fields can be assigned
    directly via the constructor. If `db_library` is given, all fields
    are derived from this database row and any other keyword arguments
    are disregarded. `db_library` itself is not part of the returned
    data.

    Raises a ValueError if `db_library` is explicitly set to None.
    """

    # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi
    kwargs = values.kwargs
    if kwargs is None:
        # Nothing was passed by keyword; let the regular validation
        # report the missing fields instead of failing on `in` below.
        return values
    if "db_library" not in kwargs:
        return kwargs
    db_row: PacBioRun = kwargs["db_library"]
    if db_row is None:
        raise ValueError("None db_library value is not allowed.")

    study = db_row.study
    sample = db_row.sample
    return {
        "study_name": study.name,
        "study_id": study.id_study_lims,
        "sample_name": sample.name,
        "sample_id": sample.id_sample_lims,
        "library_type": db_row.pipeline_id_lims,
        "pool_name": db_row.pac_bio_library_tube_barcode,
        # Only tags that are set are listed, the first tag first;
        # the list is empty for a non-indexed library.
        "tag_sequence": [
            tag for tag in (db_row.tag_sequence, db_row.tag2_sequence) if tag
        ],
    }
|
||
|
||
@dataclass(kw_only=True, frozen=True) | ||
class PacBioExperiment: | ||
""" | ||
A response model that contains laboratory tracking information | ||
about the PacBio wells and samples prior to the start of the | ||
|
@@ -43,28 +125,30 @@ class PacBioExperiment(BaseModel): | |
(library). | ||
""" | ||
|
||
db_libraries: list[PacBioRun] = Field(init_var=True) | ||
|
||
study_id: list = Field( | ||
title="Study identifier", | ||
description=""" | ||
Study identifiers as a sorted list of unique strings (to cover | ||
an unlikely case of multiple studies). | ||
""", | ||
) | ||
study_name: str = Field( | ||
study_name: str | None = Field( | ||
default=None, | ||
title="Study name", | ||
description=""" | ||
Study name, is not set in case of multiple studies. | ||
""", | ||
) | ||
sample_id: str = Field( | ||
sample_id: str | None = Field( | ||
default=None, | ||
title="Sample identifier", | ||
description=""" | ||
Sample identifier, is not set in case of multiple samples. | ||
""", | ||
) | ||
sample_name: str = Field( | ||
sample_name: str | None = Field( | ||
default=None, | ||
title="Sample name", | ||
description=""" | ||
|
@@ -94,59 +178,57 @@ class PacBioExperiment(BaseModel): | |
unlikely case of multiple library types. | ||
""", | ||
) | ||
pool_name: str = Field( | ||
pool_name: str | None = Field( | ||
default=None, | ||
title="Pool name", | ||
description=""" | ||
The pac_bio_library_tube_barcode from TRACTION, AKA pool name | ||
""", | ||
) | ||
model_config = ConfigDict(from_attributes=True, extra="forbid") | ||
|
||
@classmethod | ||
def from_orm(cls, lims_db_rows: List[PacBioRun]): | ||
@model_validator(mode="before") | ||
def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: | ||
""" | ||
A factory method, creates an instance of the PacBioLimsData class. | ||
Should be given a non-empty list of PacBioRun table row objects as | ||
an argument. | ||
Populates the fields of this object with information available | ||
in the LIMS system. | ||
Errors if the `db_libraries` attribute is not set via the constructor. | ||
""" | ||
|
||
lims_db_rows: list[PacBioRun] = values.kwargs["db_libraries"] | ||
num_samples = len(lims_db_rows) | ||
if num_samples == 0: | ||
raise Exception("Cannot create PacBioLimsData object, no data.") | ||
if any(row is None for row in lims_db_rows): | ||
raise Exception("Cannot create PacBioLimsData object, None row.") | ||
raise ValueError("Empty db_libraries list is not allowed.") | ||
|
||
lib_objects = [PacBioLibrary(db_library=row) for row in lims_db_rows] | ||
|
||
# Using sets for some data instead of lists because we do not | ||
# want repetitions. | ||
lims_data = { | ||
"num_samples": num_samples, | ||
"study_id": set(), | ||
"library_type": set(), | ||
"tag_sequence": [], | ||
} | ||
study_name = None | ||
for row in lims_db_rows: | ||
lims_data["study_id"].add(row.study.id_study_lims) | ||
lims_data["library_type"].add(row.pipeline_id_lims) | ||
study_name = row.study.name | ||
if pool_name := row.pac_bio_library_tube_barcode: | ||
lims_data["pool_name"] = pool_name | ||
if num_samples == 1: | ||
if tag := row.tag_sequence: | ||
lims_data["tag_sequence"].append(tag) | ||
if tag := row.tag2_sequence: | ||
lims_data["tag_sequence"].append(tag) | ||
lims_data["sample_id"] = row.sample.id_sample_lims | ||
lims_data["sample_name"] = row.sample.name | ||
lims_data["study_name"] = row.study.name | ||
|
||
lims_data["study_id"] = {o.study_id for o in lib_objects} # returns a set | ||
lims_data["library_type"] = { | ||
o.library_type if o.library_type is not None else "UNKNOWN" | ||
for o in lib_objects | ||
} | ||
|
||
pool_names = {o.pool_name for o in lib_objects} | ||
if len(pool_names) > 1: | ||
raise ValueError("Multiple pool names.") | ||
lims_data["pool_name"] = pool_names.pop() | ||
|
||
o = lib_objects[0] | ||
if num_samples == 1: | ||
lims_data["tag_sequence"] = o.tag_sequence | ||
lims_data["sample_id"] = o.sample_id | ||
lims_data["sample_name"] = o.sample_name | ||
lims_data["study_name"] = o.study_name | ||
if len(lims_data["study_id"]) == 1: | ||
lims_data["study_name"] = study_name | ||
lims_data["study_name"] = o.study_name | ||
|
||
# Convert sets back to lists and sort so that the list items are | ||
# Convert sets back to lists and sort so that the items are | ||
# in a predictable order. | ||
for key in ("library_type", "study_id"): | ||
lims_data[key] = sorted(lims_data[key]) | ||
|
||
return cls.model_validate(lims_data) | ||
return lims_data |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe make this guy private?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done