bihealth · ericblanc20 · Dec 23, 2024 · Dec 23, 2024 · Jan 7, 2025 · Jan 7, 2025
diff --git a/docs/somatic_cnv.rst b/docs/somatic_cnv.rst
@@ -9,11 +9,39 @@ Somatic variant calling is implemented differently for exome and whole genome da
 The whole genome data "branch" is currently under review, as GRCh38 support in ``Control-FREEC`` (the main workhorse for WGS CNV calling) is not complete.
 CNV calling in WGS data can also be done using ``cnvkit``, but its pipeline implementation is also incomplete.
 
-The following documentation is restricted to the tools currently implemented to process exome data: ``cnvkit``, ``purecn`` & ``sequenza``.
+The WGS and WES branches will be merged and extended in many ways, so that, for example, somatic and germline small variants can be used
+to determine tumor sample purity, improve segmentation and identify Loss Of Heterozigocity (LOH).
+This work in on-going, and some preliminary steps are already complete. This steps are briefly outlined below, followed by some description of
+the current WES "branch". The latter will be completely amended as soon as the merged code is enabled, but until then, it is still of interest.
+
+Small variants file generation
+==============================
+
+The flexibility of ``vcf`` files is here a problem, since each implemented CNV calling tool has different requirements for it.
+This is why specific ``snappy`` pileline steps were needed to generate a ``vcf`` file which contains enough information to be used by all tools.
+
+Generally, the CNV calling needs two types of variants: germline variants to help segmentation & determine LOH, and the somatic variants
+to estimate tumor purity.
+
+The ``germline_variants`` step provide a set of will-supported SNPs (sufficient read depth from well mapped reads, and balanced support for
+the reference and alternative alleles). The current implementation very primitive, done using ``bcftools`` commands suite.
+The step is agnostic about the sample status and sample sheet type (``generic``, ``cancer_matched`` or ``germline_variants``) (unlike the
+``variant_calling`` step, which performs the same operation only for ``germline_variants`` sample sheets). When run on its own, the step
+will call variants on all samples, including the tumors. This is unnecessary, and for CNV calling purposes, running the step on normal samples only
+is quite sufficient.
+
+Most tools require that the somatic variants be flagged with ``SOMATIC``, and that the support (read depth) for each allele is provided.
+The genotype of the locus in normal & tumor samples is also needed by some tools, as flags that label known variants, typically those
+frequently encountered in human populations. The ``somatic_variants_for_cnv`` step merges the germline variants identified in the normal samples at
+the ``germline_variants`` step, and merges them with somatic variants from either the ``somatic_variant_calling`` step, or the ``somatic_variant_filtration``
+step if only filtered variants must be used. It can also flag variants if they are found in human variation databases, such as ``gnomAD`` or ``dbSNP``.
+
 
 Output & performance
 ====================
 
+The following documentation is restricted to the tools currently implemented to process exome data: ``cnvkit``, ``purecn`` & ``sequenza``.
+
 The 3 methods generally broadly agree on the log ratio of coverage between tumor & normal samples. 
 
 However, the segmentation and the number of copies assigned to a segment can be quite different between the algorithms.

diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py
@@ -22,6 +22,8 @@
     cbioportal_export,
     gene_expression_quantification,
     gene_expression_report,
+    germline_snvs,
+    guess_sex,
     helper_gcnv_model_targeted,
     helper_gcnv_model_wgs,
     hla_typing,
@@ -41,6 +43,7 @@
     somatic_variant_calling,
     somatic_variant_filtration,
     somatic_variant_signatures,
+    somatic_variants_for_cnv,
     somatic_wgs_cnv_calling,
     somatic_wgs_sv_calling,
     sv_calling_targeted,
@@ -70,9 +73,11 @@
 #: Mapping from step name to module
 STEP_TO_MODULE = {
     "adapter_trimming": adapter_trimming,
+    "cbioportal_export": cbioportal_export,
     "gene_expression_quantification": gene_expression_quantification,
     "gene_expression_report": gene_expression_report,
-    "cbioportal_export": cbioportal_export,
+    "germline_snvs": germline_snvs,
+    "guess_sex": guess_sex,
     "helper_gcnv_model_targeted": helper_gcnv_model_targeted,
     "helper_gcnv_model_wgs": helper_gcnv_model_wgs,
     "hla_typing": hla_typing,
@@ -92,6 +97,7 @@
     "somatic_variant_calling": somatic_variant_calling,
     "somatic_variant_filtration": somatic_variant_filtration,
     "somatic_variant_signatures": somatic_variant_signatures,
+    "somatic_variants_for_cnv": somatic_variants_for_cnv,
     "somatic_wgs_cnv_calling": somatic_wgs_cnv_calling,
     "somatic_wgs_sv_calling": somatic_wgs_sv_calling,
     "sv_calling_targeted": sv_calling_targeted,

diff --git a/snappy_pipeline/models/bcftools.py b/snappy_pipeline/models/bcftools.py
@@ -0,0 +1,236 @@
+import enum
+import re
+
+from typing import Annotated, Any
+
+from pydantic import Field, AliasGenerator
+
+from snappy_pipeline.models import SnappyModel
+
+# TODO: These regex need to be checked & solidified!!
+VCF_TAG_PATTERN = re.compile(
+    r"""
+^(
+(?P<info>INFO/([A-Za-z_][0-9A-Za-z_.]*|1000G))            # INFO allows 1000G as tag (see https://samtools.github.io/hts-specs/VCFv4.5.pdf, section 1.6.1.8)
+|
+(?P<fmt_or_id>((FORMAT|FMT)/)?([A-Za-z_][0-9A-Za-z_.]*))  # FORMAT doesn't (see section 1.6.2), & IDs without INFO or FORMAT are allowed by bcftools
+)$
+""",
+    re.VERBOSE,
+)
+ANNOTATION_VCF_TAG_PATTERN = re.compile(
+    r"""
+^
+(?P<prefix>([\^=~\+\.-]|\.\+)?)                                                                 # Prefix: ^, =~, +, -, .+
+(?P<target>(INFO/([A-Za-z_][0-9A-Za-z_.]*|1000G)|((FORMAT|FMT)/)?([A-Za-z_][0-9A-Za-z_.]*)))    # VCF_TAG_PATTERN
+(:=(?P<src>(INFO/([A-Za-z_][0-9A-Za-z_.]*|1000G)|((FORMAT|FMT)/)?([A-Za-z_][0-9A-Za-z_.]*))))?  # Source VCF_TAG PATTERN (when source & dest are different tags)
+$""",
+    re.VERBOSE,
+)
+# TODO: create the pattern from BcftoolsMarkSitePresentAbsent enum
+PRESENT_ABSENT_PATTERN = re.compile("^([+-])(.+)$")
+
+
+def BcftoolsModelAliasGenerator(option: str, exceptions: dict[str, str] = {}) -> str:
+    """
+    The alias generator should replace underscores with dashes for an option,
+    unless it belongs to the list of exceptions.
+    In that case, the alias is generated from the exceptions dict.
+
+    Currently, there is no replacement at all, because the automatic alias generation
+    seems to interfere with the re-validation occurring when a sub-workflow is registered.
+    """
+    if option in exceptions:
+        return exceptions[option]
+    return option.replace("_", "_")
+
+
+class BcftoolsModel(SnappyModel):
+    model_config = {
+        "alias_generator": AliasGenerator(
+            serialization_alias=lambda x: BcftoolsModelAliasGenerator(x)
+        )
+    }
+
+    extra_args: Annotated[
+        dict[str, Any],
+        Field(
+            examples=[
+                {"'max-depth'": 5000},
+                {"columns": ["FORMAT/DP", "-INFO/SCBZ"]},
+                {"'illumina1.3+'": True},
+            ],
+            alias="extra_args",
+        ),
+    ] = {}
+    """
+    Placeholder for arguments available to the tool, but not present in the model.
+    The arguments must be input as 'option: value'. Values should not contain the apostrophe (') character.
+
+    The user is responsible to check that the arguments make sense
+    (for example that the maximum allowed value is greater that the minimum allowed value).
+
+    To turn off a flag set in the model, the user should add the flag to the extra arguments and set it to False.
+    """
+
+
+# class BcftoolsBamFlag(enum.StrEnum):
+#     PAIRED = enum.auto()
+#     PROPER_PAIR = enum.auto()
+#     UNMAP = enum.auto()
+#     MUNMAP = enum.auto()
+#     REVERSE = enum.auto()
+#     MREVERSE = enum.auto()
+#     READ1 = enum.auto()
+#     READ2 = enum.auto()
+#     SECONDARY = enum.auto()
+#     QCFAIL = enum.auto()
+#     DUP = enum.auto()
+#     SUPPLEMENTARY = enum.auto()
+#
+#
+# def check_enum_name(x: Any) -> list[int]:
+#     if isinstance(x, BcftoolsBamFlags):
+#         return [x.value]
+#     if isinstance(x, str):
+#         if x not in BcftoolsBamFlags.__members__:
+#             raise ValueError(f"Illegal flag '{x}'")
+#         return [BcftoolsBamFlags[x].value]
+#     if isinstance(x, int):
+#         all_flags = 0
+#         l = []
+#         for v in BcftoolsBamFlags:
+#             all_flags |= v.value
+#             if x & v != 0:
+#                 l.append(v.value)
+#         if x & all_flags != 0:
+#             raise ValueError(f"Unexpected flags in '{x}'")
+#         return list(set(l))
+#     else:
+#         raise ValueError(f"Illegal object '{x}'")
+#
+#
+# def convert_enum_names(x: list[Any] | BcftoolsBamFlags | str | int) -> list[int]:
+#     if isinstance(x, list):
+#         l = []
+#         for v in x:
+#             l += check_enum_name(v)
+#         l = list(set(l))
+#     else:
+#         l = check_enum_name(x)
+#     return l
+#
+#
+# class BcftoolsBamModel(BcftoolsModel):
+#     skip_all_set: list[BcftoolsBamFlag | int] | int = []
+#     """Skip reads with all of the bits set"""
+#     skip_any_set: list[BcftoolsBamFlag | int] | int = [
+#         BcftoolsBamFlag.UNMAP,
+#         BcftoolsBamFlag.SECONDARY,
+#         BcftoolsBamFlag.QCFAIL,
+#         BcftoolsBamFlag.DUP
+#     ]
+#     """Skip reads with any of the bits set [UNMAP,SECONDARY,QCFAIL,DUP]"""
+#     skip_all_unset: list[BcftoolsBamFlag | int] | int = []
+#     """Skip reads with all of the bits unset"""
+#     skip_any_unset: list[BcftoolsBamFlag | int] | int = []
+#     ""Skip reads with any of the bits unset"""
+#
+#     validate_skip_all_set = field_validator("skip_all_set", mode="before")(convert_enum_names)
+#     validate_skip_any_set = field_validator("skip_any_set", mode="before")(convert_enum_names)
+#     validate_skip_all_unset = field_validator("skip_all_unset", mode="before")(convert_enum_names)
+#     validate_skip_any_unset = field_validator("skip_any_unset", mode="before")(convert_enum_names)
+
+
+class BcftoolsBamFlag(enum.StrEnum):
+    PAIRED = "PAIRED"
+    PROPER_PAIR = "PROPER_PAIR"
+    UNMAP = "UNMAP"
+    MUNMAP = "MUNMAP"
+    REVERSE = "REVERSE"
+    MREVERSE = "MREVERSE"
+    READ1 = "READ1"
+    READ2 = "READ2"
+    SECONDARY = "SECONDARY"
+    QCFAIL = "QCFAIL"
+    DUP = "DUP"
+    SUPPLEMENTARY = "SUPPLEMENTARY"
+
+
+def ensure_unique(x: list[BcftoolsBamFlag], set_type: str = "?"):
+    if len(x) > len(set(x)):
+        raise ValueError("Duplicated flags '{}' in set '{}'".format(", ".join(x), set_type))
+
+
+def ensure_no_common_elements(
+    x: list[BcftoolsBamFlag], y: list[BcftoolsBamFlag], set_type: str = "?"
+):
+    intersection = set(x) & set(y)
+    if len(intersection) > 0:
+        raise ValueError(
+            "Flag(s) '{}' are common to '{}' sets".format(", ".join(intersection), set_type)
+        )
+
+
+def ensure_valid_bit_sets(model: BcftoolsModel):
+    for a in ("skip_all_set", "skip_any_set", "skip_all_unset", "skip_any_unset"):
+        try:
+            getattr(model, a)
+        except AttributeError:
+            raise ValueError(f"Model doesn't contain option '{a}'")
+
+    ensure_unique(model.skip_all_set, set_type="all-set")
+    ensure_unique(model.skip_any_set, set_type="any-set")
+    ensure_unique(model.skip_all_unset, set_type="all-unset")
+    ensure_unique(model.skip_any_unset, set_type="any-unset")
+
+    ensure_no_common_elements(model.skip_all_set, model.skip_any_set, set_type="set")
+    ensure_no_common_elements(model.skip_all_unset, model.skip_any_unset, set_type="unset")
+
+    ensure_no_common_elements(
+        set(model.skip_all_set + model.skip_any_set),
+        set(model.skip_all_unset + model.skip_any_unset),
+        set_type="complete",
+    )
+
+    return model
+
+
+class BcftoolsCaller(enum.StrEnum):
+    CONSENSUS = "consensus"
+    MULTIALLELIC = "multiallelic"
+
+
+class BcftoolsPloidy(enum.StrEnum):
+    GRCh37 = "GRCh37"
+    GRCh38 = "GRCh38"
+    X = "X"
+    Y = "Y"
+    HAPLOID = "1"
+    DIPLOID = "2"
+
+
+class BcftoolsMarkSitePresentAbsent(enum.StrEnum):
+    PRESENT = "+"
+    ABSENT = "-"
+
+
+def ensure_valid_mark_sites_tag(option_str: str):
+    if option_str is not None:
+        m = PRESENT_ABSENT_PATTERN.match(option_str)
+        if not m:
+            raise ValueError(
+                f"Mark sites tag '{option_str} must be prefixed with "
+                f"'{BcftoolsMarkSitePresentAbsent.PRESENT}' (mark present sites) or "
+                f"'{BcftoolsMarkSitePresentAbsent.ABSENT}' (mark absent sites)"
+            )
+        if not VCF_TAG_PATTERN.match(m.group(2)):
+            raise ValueError(f"Illegal Mark sites tag '{m.group(2)}'")
+    return option_str
+
+
+def ensure_valid_tags(tags: list[str]):
+    for tag in tags:
+        if not ANNOTATION_VCF_TAG_PATTERN.match(tag):
+            raise ValueError(f"Illegal annotation tag pattern '{tag}'")
+    return tags