feat: revamp somatic SV calling & purity/ploidy estimation #582

Draft · wants to merge 27 commits into main

Commits (27)
ef6c75a · feat: clean-up manta. TODO: enable exome & rna (ericblanc20, Dec 11, 2024)
422a16d · feat: initial implementation of a simple wrapper class for all snappy… (ericblanc20, Dec 13, 2024)
beaad3c · feat: initial implementation of a simple wrapper class for all snappy… (ericblanc20, Dec 13, 2024)
c61c336 · feat: clean-up manta. TODO: enable exome & rna (ericblanc20, Dec 11, 2024)
b8045a5 · Merge branch '573-revamp-somatic-sv-calling-purityploidy-estimation' … (tedil, Dec 13, 2024)
4a6c2ec · chore: utility snappy wrapper class for code deduplication (#583) (tedil, Dec 13, 2024)
50108de · feat: initial implementation of a simple wrapper class for all snappy… (ericblanc20, Dec 13, 2024)
049e146 · feat: clean-up manta. TODO: enable exome & rna (ericblanc20, Dec 11, 2024)
4b6b5f0 · update to use ShellWrapper (tedil, Dec 13, 2024)
3431c5d · merge (tedil, Dec 13, 2024)
dbeb3ed · feat: Add snakemake parameter --ignore-incomplete to snappy-pipeline … (ErikaZ95, Dec 17, 2024)
5036b7d · feat: clean-up manta. TODO: enable exome & rna (ericblanc20, Dec 11, 2024)
3958c8b · feat: initial implementation of a simple wrapper class for all snappy… (ericblanc20, Dec 13, 2024)
82109a9 · refactor: complete re-write of ASCAT support (ericblanc20, Dec 17, 2024)
ba593ea · --amend (ericblanc20, Dec 17, 2024)
3c7b9bd · style: use getattr for attributes (ericblanc20, Dec 17, 2024)
48f98cf · refactor: abstract wrapper, common models & generic ignore_chroms (ericblanc20, Dec 17, 2024)
56e0159 · refactor: use SnappyWrapper rather than SimpleWrapper (ericblanc20, Dec 17, 2024)
9555dfc · docs: correct misleading comment (ericblanc20, Dec 17, 2024)
3363996 · refactor: merged SimpleWrapper into SnappyWrapper (ericblanc20, Dec 17, 2024)
ba3ea51 · refactor: proper naming of wrapper scripts for logs (ericblanc20, Dec 17, 2024)
dbadee4 · refactor: sample sheets to pandas dataframe (ericblanc20, Dec 18, 2024)
5f532a4 · feat: Add basic guess_sex step, required for CNV (ericblanc20, Dec 18, 2024)
fd714d6 · refactor: adapt ascat to use the new guess_sex step (ericblanc20, Dec 18, 2024)
2282879 · refactor: added ratios to coverage table & improved cutoffs (for WGS) (ericblanc20, Dec 19, 2024)
fd8a152 · refactor: revert to piping into R rather than Rscript, because when u… (ericblanc20, Dec 19, 2024)
fbf8aaf · fix: numerous smaller bug fixes (ericblanc20, Dec 19, 2024)
9 changes: 8 additions & 1 deletion snappy_pipeline/apps/snappy_snake.py
@@ -22,6 +22,7 @@
    cbioportal_export,
    gene_expression_quantification,
    gene_expression_report,
    guess_sex,
    helper_gcnv_model_targeted,
    helper_gcnv_model_wgs,
    hla_typing,
@@ -70,9 +71,10 @@
#: Mapping from step name to module
STEP_TO_MODULE = {
    "adapter_trimming": adapter_trimming,
    "cbioportal_export": cbioportal_export,
    "gene_expression_quantification": gene_expression_quantification,
    "gene_expression_report": gene_expression_report,
    "cbioportal_export": cbioportal_export,
    "guess_sex": guess_sex,
    "helper_gcnv_model_targeted": helper_gcnv_model_targeted,
    "helper_gcnv_model_wgs": helper_gcnv_model_wgs,
    "hla_typing": hla_typing,
@@ -154,6 +156,8 @@ def run(wrapper_args):  # noqa: C901
        snakemake_argv.append("--unlock")
    if wrapper_args.rerun_incomplete:
        snakemake_argv.append("--rerun-incomplete")
    if wrapper_args.ignore_incomplete:
        snakemake_argv.append("--ignore-incomplete")
    if wrapper_args.touch:
        snakemake_argv.append("--touch")
    if wrapper_args.detailed_summary:
@@ -254,6 +258,9 @@ def main(argv=None):
    group.add_argument(
        "--rerun-incomplete", action="store_true", default=False, help="Rerun incomplete jobs"
    )
    group.add_argument(
        "--ignore-incomplete", action="store_true", default=False, help="Ignore incomplete jobs"
    )
    group.add_argument(
        "--cleanup-metadata",
        action="store_true",
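For reviewers, a minimal runnable sketch of the flag-forwarding pattern this hunk extends: the new `--ignore-incomplete` option is parsed by argparse and appended to the snakemake argument vector. The option names and help texts come from the diff; the parser scaffolding below is illustrative, not the actual `snappy_snake.py` code.

```python
# Sketch only: mirrors how snappy_snake.py forwards boolean CLI flags to snakemake.
# The option names are taken from the diff; the parser setup here is illustrative.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group("snakemake")
group.add_argument("--rerun-incomplete", action="store_true", default=False, help="Rerun incomplete jobs")
group.add_argument("--ignore-incomplete", action="store_true", default=False, help="Ignore incomplete jobs")

args = parser.parse_args(["--ignore-incomplete"])

snakemake_argv = []
if args.rerun_incomplete:
    snakemake_argv.append("--rerun-incomplete")
if args.ignore_incomplete:
    snakemake_argv.append("--ignore-incomplete")

print(snakemake_argv)  # ['--ignore-incomplete']
```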
71 changes: 71 additions & 0 deletions snappy_pipeline/models/common.py
@@ -0,0 +1,71 @@
import enum

from typing import Annotated
from pydantic import Field, model_validator

from snappy_pipeline.models import SnappyModel


class LibraryKitEntry(SnappyModel):
"""
Mapping from enrichment kit to target region BED file, for either computing per--target
region coverage or selecting targeted exons.

The following will match both the stock IDT library kit and the ones
with spike-ins seen fromr Yale genomics. The path above would be
mapped to the name "default".
- name: IDT_xGen_V1_0
pattern: "xGen Exome Research Panel V1\\.0*"
path: "path/to/targets.bed"
"""

name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])]

pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])]

path: Annotated[str, Field(examples=["path/to/targets.bed"])]


class LibraryKit(SnappyModel):
path_target_interval_list_mapping: list[LibraryKitEntry] = []
"""Connects sample-based library kit in sample sheets with corresponding bed files"""


class SexValue(enum.StrEnum):
MALE = "male"
FEMALE = "female"


class SexOrigin(enum.StrEnum):
AUTOMATIC = "auto"
SAMPLESHEET = "samplesheet"
CONFIG = "config"


class Sex(SnappyModel):
    source: SexOrigin = SexOrigin.AUTOMATIC
    """Where the sex information is taken from: "auto" (guessed from the data), "samplesheet" or "config" (single value for the whole cohort)"""

    path_guess_sex: str | None = None
    """Path to the ``guess_sex`` step, where the decision files can be found"""

    guess_sex_tool: str | None = None
    """Tool used to guess the sex"""

    cohort: SexValue | None = None
    """Sex of the cohort"""

    column_name: str | None = None
    """Column name of the sex information in the sample sheet"""

    @model_validator(mode="after")
    def ensure_valid_values(self):
        if self.source == SexOrigin.CONFIG and not self.cohort:
            raise ValueError("Undefined cohort sex value in configuration file")
        if self.source == SexOrigin.SAMPLESHEET and not self.column_name:
            raise ValueError("Undefined column name for sex information")
        if self.source == SexOrigin.AUTOMATIC and (
            not self.path_guess_sex or not self.guess_sex_tool
        ):
            raise ValueError("Path to or tool used by the 'guess_sex' step is missing")
        return self
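A minimal sketch of how the `Sex` validator behaves, assuming `SnappyModel` is a pydantic `BaseModel` subclass (as the `model_validator` usage suggests); the snippet is illustrative and not part of the diff.

```python
# Illustrative only (assumes SnappyModel behaves like a pydantic BaseModel).
import pydantic

from snappy_pipeline.models.common import Sex, SexOrigin, SexValue

# Cohort-wide sex taken from the configuration: valid because ``cohort`` is set.
ok = Sex(source=SexOrigin.CONFIG, cohort=SexValue.FEMALE)

# source="auto" without path_guess_sex/guess_sex_tool is rejected by ensure_valid_values().
try:
    Sex(source=SexOrigin.AUTOMATIC)
except pydantic.ValidationError as e:
    print(e)
```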
2 changes: 2 additions & 0 deletions snappy_pipeline/workflow_model.py
@@ -10,6 +10,7 @@
    GeneExpressionQuantification,
)
from snappy_pipeline.workflows.gene_expression_report.model import GeneExpressionReport
from snappy_pipeline.workflows.guess_sex.model import GuessSex
from snappy_pipeline.workflows.helper_gcnv_model_targeted.model import HelperGcnvModelTargeted
from snappy_pipeline.workflows.helper_gcnv_model_wgs.model import HelperGcnvModelWgs
from snappy_pipeline.workflows.hla_typing.model import HlaTyping
@@ -100,6 +101,7 @@ class StepConfig(TypedDict, total=False):
    cbioportal_export: CbioportalExport
    gene_expression_quantification: GeneExpressionQuantification
    gene_expression_report: GeneExpressionReport
    guess_sex: GuessSex
    helper_gcnv_model_targeted: HelperGcnvModelTargeted
    helper_gcnv_model_wgs: HelperGcnvModelWgs
    hla_typing: HlaTyping
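For context, a small self-contained sketch of the `TypedDict(total=False)` registry pattern that the new `guess_sex` entry slots into; the `GuessSex` stand-in class below is a placeholder, not the real model from the diff.

```python
# Illustrative only: a TypedDict(total=False) step registry where every step is optional.
from typing import TypedDict


class GuessSex:  # stand-in for snappy_pipeline.workflows.guess_sex.model.GuessSex
    pass


class StepConfig(TypedDict, total=False):
    guess_sex: GuessSex


config: StepConfig = {"guess_sex": GuessSex()}
print("guess_sex" in config)  # True; other steps may be absent because total=False
```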
62 changes: 62 additions & 0 deletions snappy_pipeline/workflows/common/samplesheet.py
@@ -0,0 +1,62 @@
import pandas as pd

from biomedsheets.models import NGSLibrary, Sheet


def sample_sheets(sheets: list[Sheet]) -> pd.DataFrame:
"""Creates a pandas data frame from snappy's list of samples sheets

:param sheets: list of sheets provided by the abstract BaseStep class.
The list **MUST** be ``self.sheets`` and **NOT** ``self.shortcut_sheets``,
(because the latter diverges for cancer, germline & generic sheets).
:returns: a pandas data frame containing all extra info columns.
The data frame is guaranteed to have at least 4 columns with their entry names
(``bio_entity``, ``bio_sample``, ``test_sample`` & ``ngs_library``).
The data frame is indexed by the ngs library name.
No duplicated rows not duplicated ngs librari names are allowed.
Duplicate lables for extra info are also forbidden.
"""
table: pd.DataFrame = None

for sheet in sheets:
for bio_entity in sheet.bio_entities.values():
if bio_entity.disabled:
continue
for bio_sample in bio_entity.bio_samples.values():
if bio_sample.disabled:
continue
for test_sample in bio_sample.test_samples.values():
if test_sample.disabled:
continue
for ngs_library in test_sample.ngs_libraries.values():
if ngs_library.disabled:
continue
d = _ngs_library_to_df(ngs_library)
table = pd.concat([table, d], axis=0, ignore_index=True)

assert not any(table.duplicated()), "Duplicated entries in sample sheets"
assert not any(table["ngs_library"].duplicated()), "Duplicated NGS libraries"

table.set_index("ngs_library", drop=False, inplace=True)
return table


def _ngs_library_to_df(ngs_library: NGSLibrary) -> pd.DataFrame:
    test_sample = ngs_library.test_sample
    bio_sample = test_sample.bio_sample
    bio_entity = bio_sample.bio_entity

    d = {
        "bio_entity": bio_entity.name,
        "bio_sample": bio_sample.name,
        "test_sample": test_sample.name,
        "ngs_library": ngs_library.name,
    }

    for o in (bio_entity, bio_sample, test_sample, ngs_library):
        extra_infos = getattr(o, "extra_infos")
        for k, v in extra_infos.items():
            assert k not in d, f"Extra info '{k}' already present elsewhere in {ngs_library.name}"
            d[k] = v

    return pd.DataFrame.from_dict({k: [v] for k, v in d.items()})
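A hypothetical usage sketch for `sample_sheets()`; the helper function and the `sex` extra-info column below are invented for illustration and depend on what the sample sheet actually contains.

```python
# Hypothetical helper: ``sheets`` is a BaseStep's ``self.sheets`` list (NOT shortcut_sheets).
from snappy_pipeline.workflows.common.samplesheet import sample_sheets


def describe_cohort(sheets):
    table = sample_sheets(sheets)  # indexed by NGS library name

    # The four guaranteed name columns are always present:
    print(table[["bio_entity", "bio_sample", "test_sample", "ngs_library"]])

    # Extra-info columns (e.g. a "sex" column, if the sheet defines one) sit alongside them:
    if "sex" in table.columns:
        print(table["sex"].value_counts())
```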
73 changes: 73 additions & 0 deletions snappy_pipeline/workflows/guess_sex/Snakefile
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
"""CUBI Pipeline guess_sex step Snakefile"""

import os

from snappy_pipeline import expand_ref
from snappy_pipeline.workflows.guess_sex import GuessSexWorkflow

__author__ = "Eric Blanc <[email protected]>"


# Configuration ===============================================================


configfile: "config.yaml"


# Expand "$ref" JSON pointers in configuration (also works for YAML)
config, lookup_paths, config_paths = expand_ref("config.yaml", config)

# WorkflowImpl Object Setup ===================================================

wf = GuessSexWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())

# Rules =======================================================================


localrules:
    # Linking files from work/ to output/ should be done locally
    guess_sex_link_out_run,


rule all:
    input:
        wf.get_result_files(),


# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Generic linking out ---------------------------------------------------------


rule guess_sex_link_out_run:
    input:
        wf.get_input_files("link_out", "run"),
    output:
        wf.get_output_files("link_out", "run"),
    run:
        shell(wf.get_shell_cmd("link_out", "run", wildcards))


# Actual actions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Samtools --------------------------------------------------------------------


rule guess_sex_samtools_run:
    input:
        unpack(wf.get_input_files("samtools", "run")),
    output:
        **wf.get_output_files("samtools", "run"),
    params:
        **{"args": wf.get_args("samtools", "run")},
    threads: wf.get_resource("samtools", "run", "threads")
    resources:
        time=wf.get_resource("samtools", "run", "time"),
        memory=wf.get_resource("samtools", "run", "memory"),
        partition=wf.get_resource("samtools", "run", "partition"),
        tmpdir=wf.get_resource("samtools", "run", "tmpdir"),
    log:
        **wf.get_log_file("samtools", "run"),
    wrapper:
        wf.wrapper_path("samtools/guess_sex")
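To illustrate how this step is meant to be consumed downstream (e.g. by the reworked ASCAT support), a hedged sketch of a `Sex` configuration pointing at `guess_sex` output; the path below is hypothetical, while the field names and the `samtools` tool name come from the model and the rule above.

```python
# Illustrative wiring only: the path is hypothetical, but the fields and the
# "samtools" tool name match the Sex model and the guess_sex_samtools_run rule.
from snappy_pipeline.models.common import Sex, SexOrigin

sex_config = Sex(
    source=SexOrigin.AUTOMATIC,
    path_guess_sex="../guess_sex",  # hypothetical location of the guess_sex step output
    guess_sex_tool="samtools",      # tool used by the guess_sex_samtools_run rule
)
print(sex_config.source)  # SexOrigin.AUTOMATIC
```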