feat: revamp somatic SV calling & purity/ploidy estimation #582

Draft · wants to merge 27 commits into main

Commits (27)
ef6c75a · feat: clean-up manta. TODO: enable exome & rna (ericblanc20, Dec 11, 2024)
422a16d · feat: initial implementation of a simple wrapper class for all snappy… (ericblanc20, Dec 13, 2024)
beaad3c · feat: initial implementation of a simple wrapper class for all snappy… (ericblanc20, Dec 13, 2024)
c61c336 · feat: clean-up manta. TODO: enable exome & rna (ericblanc20, Dec 11, 2024)
b8045a5 · Merge branch '573-revamp-somatic-sv-calling-purityploidy-estimation' … (tedil, Dec 13, 2024)
4a6c2ec · chore: utility snappy wrapper class for code deduplication (#583) (tedil, Dec 13, 2024)
50108de · feat: initial implementation of a simple wrapper class for all snappy… (ericblanc20, Dec 13, 2024)
049e146 · feat: clean-up manta. TODO: enable exome & rna (ericblanc20, Dec 11, 2024)
4b6b5f0 · update to use ShellWrapper (tedil, Dec 13, 2024)
3431c5d · merge (tedil, Dec 13, 2024)
dbeb3ed · feat: Add snakemake parameter --ignore-incomplete to snappy-pipeline … (ErikaZ95, Dec 17, 2024)
5036b7d · feat: clean-up manta. TODO: enable exome & rna (ericblanc20, Dec 11, 2024)
3958c8b · feat: initial implementation of a simple wrapper class for all snappy… (ericblanc20, Dec 13, 2024)
82109a9 · refactor: complete re-write of ASCAT support (ericblanc20, Dec 17, 2024)
ba593ea · --amend (ericblanc20, Dec 17, 2024)
3c7b9bd · style: use getattr for attributes (ericblanc20, Dec 17, 2024)
48f98cf · refactor: abstract wrapper, common models & generic ignore_chroms (ericblanc20, Dec 17, 2024)
56e0159 · refactor: use SnappyWrapper rather than SimpleWrapper (ericblanc20, Dec 17, 2024)
9555dfc · docs: correct misleading comment (ericblanc20, Dec 17, 2024)
3363996 · refactor: merged SimpleWrapper into SnappyWrapper (ericblanc20, Dec 17, 2024)
ba3ea51 · refactor: proper naming of wrapper scripts for logs (ericblanc20, Dec 17, 2024)
dbadee4 · refactor: sample sheets to pandas dataframe (ericblanc20, Dec 18, 2024)
5f532a4 · feat: Add basic guess_sex step, required for CNV (ericblanc20, Dec 18, 2024)
fd714d6 · refactor: adapt ascat to use the new guess_sex step (ericblanc20, Dec 18, 2024)
2282879 · refactor: added ratios to coverage table & improved cutoffs (for WGS) (ericblanc20, Dec 19, 2024)
fd8a152 · refactor: revert to piping into R rather than Rscript, because when u… (ericblanc20, Dec 19, 2024)
fbf8aaf · fix: numerous smaller bug fixes (ericblanc20, Dec 19, 2024)
9 changes: 8 additions & 1 deletion snappy_pipeline/apps/snappy_snake.py
@@ -22,6 +22,7 @@
    cbioportal_export,
    gene_expression_quantification,
    gene_expression_report,
    guess_sex,
    helper_gcnv_model_targeted,
    helper_gcnv_model_wgs,
    hla_typing,
@@ -70,9 +71,10 @@
#: Mapping from step name to module
STEP_TO_MODULE = {
    "adapter_trimming": adapter_trimming,
    "cbioportal_export": cbioportal_export,
    "gene_expression_quantification": gene_expression_quantification,
    "gene_expression_report": gene_expression_report,
    "cbioportal_export": cbioportal_export,
    "guess_sex": guess_sex,
    "helper_gcnv_model_targeted": helper_gcnv_model_targeted,
    "helper_gcnv_model_wgs": helper_gcnv_model_wgs,
    "hla_typing": hla_typing,
@@ -154,6 +156,8 @@ def run(wrapper_args):  # noqa: C901
        snakemake_argv.append("--unlock")
    if wrapper_args.rerun_incomplete:
        snakemake_argv.append("--rerun-incomplete")
    if wrapper_args.ignore_incomplete:
        snakemake_argv.append("--ignore-incomplete")
    if wrapper_args.touch:
        snakemake_argv.append("--touch")
    if wrapper_args.detailed_summary:
@@ -254,6 +258,9 @@ def main(argv=None):
    group.add_argument(
        "--rerun-incomplete", action="store_true", default=False, help="Rerun incomplete jobs"
    )
    group.add_argument(
        "--ignore-incomplete", action="store_true", default=False, help="Ignore incomplete jobs"
    )
    group.add_argument(
        "--cleanup-metadata",
        action="store_true",
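For reviewers, a minimal runnable sketch of the flag-forwarding pattern this hunk extends: the new `--ignore-incomplete` option is parsed by argparse and appended to the snakemake argument vector. The option names and help texts come from the diff; the parser scaffolding below is illustrative, not the actual `snappy_snake.py` code.

```python
# Sketch only: mirrors how snappy_snake.py forwards boolean CLI flags to snakemake.
# The option names are taken from the diff; the parser setup here is illustrative.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group("snakemake")
group.add_argument("--rerun-incomplete", action="store_true", default=False, help="Rerun incomplete jobs")
group.add_argument("--ignore-incomplete", action="store_true", default=False, help="Ignore incomplete jobs")

args = parser.parse_args(["--ignore-incomplete"])

snakemake_argv = []
if args.rerun_incomplete:
    snakemake_argv.append("--rerun-incomplete")
if args.ignore_incomplete:
    snakemake_argv.append("--ignore-incomplete")

print(snakemake_argv)  # ['--ignore-incomplete']
```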
71 changes: 71 additions & 0 deletions snappy_pipeline/models/common.py
@@ -0,0 +1,71 @@
import enum

from typing import Annotated
from pydantic import Field, model_validator

from snappy_pipeline.models import SnappyModel


class LibraryKitEntry(SnappyModel):
"""
Mapping from enrichment kit to target region BED file, for either computing per--target
region coverage or selecting targeted exons.

The following will match both the stock IDT library kit and the ones
with spike-ins seen fromr Yale genomics. The path above would be
mapped to the name "default".
- name: IDT_xGen_V1_0
pattern: "xGen Exome Research Panel V1\\.0*"
path: "path/to/targets.bed"
"""

name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])]

pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])]

path: Annotated[str, Field(examples=["path/to/targets.bed"])]


class LibraryKit(SnappyModel):
path_target_interval_list_mapping: list[LibraryKitEntry] = []
"""Connects sample-based library kit in sample sheets with corresponding bed files"""


class SexValue(enum.StrEnum):
MALE = "male"
FEMALE = "female"


class SexOrigin(enum.StrEnum):
AUTOMATIC = "auto"
SAMPLESHEET = "samplesheet"
CONFIG = "config"


class Sex(SnappyModel):
    source: SexOrigin = SexOrigin.AUTOMATIC
    """Where the sex information is taken from: "auto" (guessed from the data), "samplesheet" or "config" (single value for the whole cohort)"""

    path_guess_sex: str | None = None
    """Path to the ``guess_sex`` step, where the decision files can be found"""

    guess_sex_tool: str | None = None
    """Tool used to guess the sex"""

    cohort: SexValue | None = None
    """Sex of the cohort"""

    column_name: str | None = None
    """Column name of the sex information in the sample sheet"""

    @model_validator(mode="after")
    def ensure_valid_values(self):
        if self.source == SexOrigin.CONFIG and not self.cohort:
            raise ValueError("Undefined cohort sex value in configuration file")
        if self.source == SexOrigin.SAMPLESHEET and not self.column_name:
            raise ValueError("Undefined column name for sex information")
        if self.source == SexOrigin.AUTOMATIC and (
            not self.path_guess_sex or not self.guess_sex_tool
        ):
            raise ValueError("Path to or tool used by the 'guess_sex' step is missing")
        return self
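A minimal sketch of how the `Sex` validator behaves, assuming `SnappyModel` is a pydantic `BaseModel` subclass (as the `model_validator` usage suggests); the snippet is illustrative and not part of the diff.

```python
# Illustrative only (assumes SnappyModel behaves like a pydantic BaseModel).
import pydantic

from snappy_pipeline.models.common import Sex, SexOrigin, SexValue

# Cohort-wide sex taken from the configuration: valid because ``cohort`` is set.
ok = Sex(source=SexOrigin.CONFIG, cohort=SexValue.FEMALE)

# source="auto" without path_guess_sex/guess_sex_tool is rejected by ensure_valid_values().
try:
    Sex(source=SexOrigin.AUTOMATIC)
except pydantic.ValidationError as e:
    print(e)
```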
2 changes: 2 additions & 0 deletions snappy_pipeline/workflow_model.py
@@ -10,6 +10,7 @@
    GeneExpressionQuantification,
)
from snappy_pipeline.workflows.gene_expression_report.model import GeneExpressionReport
from snappy_pipeline.workflows.guess_sex.model import GuessSex
from snappy_pipeline.workflows.helper_gcnv_model_targeted.model import HelperGcnvModelTargeted
from snappy_pipeline.workflows.helper_gcnv_model_wgs.model import HelperGcnvModelWgs
from snappy_pipeline.workflows.hla_typing.model import HlaTyping
@@ -100,6 +101,7 @@ class StepConfig(TypedDict, total=False):
    cbioportal_export: CbioportalExport
    gene_expression_quantification: GeneExpressionQuantification
    gene_expression_report: GeneExpressionReport
    guess_sex: GuessSex
    helper_gcnv_model_targeted: HelperGcnvModelTargeted
    helper_gcnv_model_wgs: HelperGcnvModelWgs
    hla_typing: HlaTyping
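For context, a small self-contained sketch of the `TypedDict(total=False)` registry pattern that the new `guess_sex` entry slots into; the `GuessSex` stand-in class below is a placeholder, not the real model from the diff.

```python
# Illustrative only: a TypedDict(total=False) step registry where every step is optional.
from typing import TypedDict


class GuessSex:  # stand-in for snappy_pipeline.workflows.guess_sex.model.GuessSex
    pass


class StepConfig(TypedDict, total=False):
    guess_sex: GuessSex


config: StepConfig = {"guess_sex": GuessSex()}
print("guess_sex" in config)  # True; other steps may be absent because total=False
```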
62 changes: 62 additions & 0 deletions snappy_pipeline/workflows/common/samplesheet.py
@@ -0,0 +1,62 @@
import pandas as pd

from biomedsheets.models import NGSLibrary, Sheet


def sample_sheets(sheets: list[Sheet]) -> pd.DataFrame:
"""Creates a pandas data frame from snappy's list of samples sheets

:param sheets: list of sheets provided by the abstract BaseStep class.
The list **MUST** be ``self.sheets`` and **NOT** ``self.shortcut_sheets``,
(because the latter diverges for cancer, germline & generic sheets).
:returns: a pandas data frame containing all extra info columns.
The data frame is guaranteed to have at least 4 columns with their entry names
(``bio_entity``, ``bio_sample``, ``test_sample`` & ``ngs_library``).
The data frame is indexed by the ngs library name.
No duplicated rows not duplicated ngs librari names are allowed.
Duplicate lables for extra info are also forbidden.
"""
table: pd.DataFrame = None

for sheet in sheets:
for bio_entity in sheet.bio_entities.values():
if bio_entity.disabled:
continue
for bio_sample in bio_entity.bio_samples.values():
if bio_sample.disabled:
continue
for test_sample in bio_sample.test_samples.values():
if test_sample.disabled:
continue
for ngs_library in test_sample.ngs_libraries.values():
if ngs_library.disabled:
continue
d = _ngs_library_to_df(ngs_library)
table = pd.concat([table, d], axis=0, ignore_index=True)

assert not any(table.duplicated()), "Duplicated entries in sample sheets"
assert not any(table["ngs_library"].duplicated()), "Duplicated NGS libraries"

table.set_index("ngs_library", drop=False, inplace=True)
return table


def _ngs_library_to_df(ngs_library: NGSLibrary) -> pd.DataFrame:
    test_sample = ngs_library.test_sample
    bio_sample = test_sample.bio_sample
    bio_entity = bio_sample.bio_entity

    d = {
        "bio_entity": bio_entity.name,
        "bio_sample": bio_sample.name,
        "test_sample": test_sample.name,
        "ngs_library": ngs_library.name,
    }

    for o in (bio_entity, bio_sample, test_sample, ngs_library):
        extra_infos = getattr(o, "extra_infos")
        for k, v in extra_infos.items():
            assert k not in d, f"Extra info '{k}' already present elsewhere in {ngs_library.name}"
            d[k] = v

    return pd.DataFrame.from_dict({k: [v] for k, v in d.items()})
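A hypothetical usage sketch for `sample_sheets()`; the helper function and the `sex` extra-info column below are invented for illustration and depend on what the sample sheet actually contains.

```python
# Hypothetical helper: ``sheets`` is a BaseStep's ``self.sheets`` list (NOT shortcut_sheets).
from snappy_pipeline.workflows.common.samplesheet import sample_sheets


def describe_cohort(sheets):
    table = sample_sheets(sheets)  # indexed by NGS library name

    # The four guaranteed name columns are always present:
    print(table[["bio_entity", "bio_sample", "test_sample", "ngs_library"]])

    # Extra-info columns (e.g. a "sex" column, if the sheet defines one) sit alongside them:
    if "sex" in table.columns:
        print(table["sex"].value_counts())
```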
73 changes: 73 additions & 0 deletions snappy_pipeline/workflows/guess_sex/Snakefile
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
"""CUBI Pipeline guess_sex step Snakefile"""

import os

from snappy_pipeline import expand_ref
from snappy_pipeline.workflows.guess_sex import GuessSexWorkflow

__author__ = "Eric Blanc <[email protected]>"


# Configuration ===============================================================


configfile: "config.yaml"


# Expand "$ref" JSON pointers in configuration (also works for YAML)
config, lookup_paths, config_paths = expand_ref("config.yaml", config)

# WorkflowImpl Object Setup ===================================================

wf = GuessSexWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())

# Rules =======================================================================


localrules:
    # Linking files from work/ to output/ should be done locally
    guess_sex_link_out_run,


rule all:
    input:
        wf.get_result_files(),


# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Generic linking out ---------------------------------------------------------


rule guess_sex_link_out_run:
    input:
        wf.get_input_files("link_out", "run"),
    output:
        wf.get_output_files("link_out", "run"),
    run:
        shell(wf.get_shell_cmd("link_out", "run", wildcards))


# Actual actions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Samtools --------------------------------------------------------------------


rule guess_sex_samtools_run:
    input:
        unpack(wf.get_input_files("samtools", "run")),
    output:
        **wf.get_output_files("samtools", "run"),
    params:
        **{"args": wf.get_args("samtools", "run")},
    threads: wf.get_resource("samtools", "run", "threads")
    resources:
        time=wf.get_resource("samtools", "run", "time"),
        memory=wf.get_resource("samtools", "run", "memory"),
        partition=wf.get_resource("samtools", "run", "partition"),
        tmpdir=wf.get_resource("samtools", "run", "tmpdir"),
    log:
        **wf.get_log_file("samtools", "run"),
    wrapper:
        wf.wrapper_path("samtools/guess_sex")
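To illustrate how this step is meant to be consumed downstream (e.g. by the reworked ASCAT support), a hedged sketch of a `Sex` configuration pointing at `guess_sex` output; the path below is hypothetical, while the field names and the `samtools` tool name come from the model and the rule above.

```python
# Illustrative wiring only: the path is hypothetical, but the fields and the
# "samtools" tool name match the Sex model and the guess_sex_samtools_run rule.
from snappy_pipeline.models.common import Sex, SexOrigin

sex_config = Sex(
    source=SexOrigin.AUTOMATIC,
    path_guess_sex="../guess_sex",  # hypothetical location of the guess_sex step output
    guess_sex_tool="samtools",      # tool used by the guess_sex_samtools_run rule
)
print(sex_config.source)  # SexOrigin.AUTOMATIC
```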