Skip to content

Commit

Permalink
Merge branch 'pydantic' of github.com:bihealth/snappy-pipeline into pydantic
Browse files Browse the repository at this point in the history
  • Loading branch information
tedil committed Jun 3, 2024
2 parents c68e056 + 7e1f359 commit 257c542
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 72 deletions.
29 changes: 15 additions & 14 deletions snappy_pipeline/workflows/somatic_variant_signatures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,23 +268,24 @@ def get_result_files(self):
assert len(mappers) > 0, "No valid mapper"
callers = set(config.tools_somatic_variant_calling) & set(SOMATIC_VARIANT_CALLERS_MATCHED)
assert len(callers) > 0, "No valid somatic variant caller"

anno_callers = []
filters = []
regions = []
if config.is_filtered:
anno_callers = set(config.tools_somatic_variant_annotation) & set(ANNOTATION_TOOLS)
assert len(anno_callers) > 0, "No valid somatic variant annotation tool"
filters = list(
self.w_config.step_config["somatic_variant_filtration"].filter_sets.keys()
)
filters.append("no_filter")
filters = set(filters) & set(config.filters)
regions = list(
self.w_config.step_config["somatic_variant_filtration"].exon_lists.keys()
)
regions.append("genome_wide")
regions = set(regions) & set(config.filtered_regions)
else:
anno_callers = []
filters = []
regions = []
if len(config.filters) > 0:
filters = list(
self.w_config.step_config["somatic_variant_filtration"].filter_sets.keys()
)
filters.append("no_filter")
filters = set(filters) & set(config.filters)
regions = list(
self.w_config.step_config["somatic_variant_filtration"].exon_lists.keys()
)
regions.append("genome_wide")
regions = set(regions) & set(config.filtered_regions)

yield from self._yield_result_files_matched(
os.path.join("output", name_pattern, "out", name_pattern + ".tsv"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def variant_type(x, args=None):
return [variant_type]


strip_sequence_version_pattern = re.compile("\.[0-9]+$") # noqa: W605
strip_sequence_version_pattern = re.compile(r"\.[0-9]+$")


def strip_sequence_version(x, args):
Expand Down
2 changes: 1 addition & 1 deletion snappy_wrappers/wrappers/vcf2maf/vcf_to_table/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class VcfParser:
pattern = re.compile("^([^\[\]\s]+)(\[([0-9]+|REF)\])?$") # noqa: W605
pattern = re.compile(r"^([^\[\]\s]+)(\[([0-9]+|REF)\])?$")

def __init__(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,28 +167,14 @@ class ProteinMutationFormatException(Exception):

@functools.lru_cache
def _build_protein_pattern():
prefix = "^(([A-z0-9_\.\(\)-]+):)?p\.\(?" # noqa: W605
postfix = "\)?$" # noqa: W605
prefix = r"^(([A-z0-9_\.\(\)-]+):)?p\.\(?"
postfix = r"\)?$"

aa = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + ")"
aaTer = (
"("
+ "|".join(aa_codes_short)
+ "|"
+ "|".join(aa_codes_long)
+ "|\*|Ter" # noqa: W605
+ ")"
)
aaAll = (
"("
+ "|".join(aa_codes_short)
+ "|"
+ "|".join(aa_codes_long)
+ "|\*|Ter|=|\?" # noqa: W605
+ ")"
)
aaTer = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + r"|\*|Ter" + ")"
aaAll = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + r"|\*|Ter|=|\?" + ")"
nb = "([0-9]+)"
nb_unknown = "([0-9]+|\?)" # noqa: W605
nb_unknown = r"([0-9]+|\?)"

interval = aa + nb + "_" + aa + nb
one_or_interval = aa + nb + "(_" + aa + nb + ")?"
Expand All @@ -202,17 +188,17 @@ def _build_protein_pattern():

delins = one_or_interval + "delins" + "(" + aa + "*)" + aaTer

frameshift = aaTer + nb + aa + "fs" + "(Ter|\*)" + nb_unknown # noqa: W605
frameshift = aaTer + nb + aa + "fs" + r"(Ter|\*)" + nb_unknown

extensionN = "(Met|M)1" + "ext(-[0-9]+|\?)" # noqa: W605
extensionC = "(Ter|\*)" + nb + aa + "ext" + "(Ter|\*)" + nb_unknown # noqa: W605
extensionN = "(Met|M)1" + r"ext(-[0-9]+|\?)"
extensionC = r"(Ter|\*)" + nb + aa + "ext" + r"(Ter|\*)" + nb_unknown

pattern = (
prefix
+ "("
+ "|".join(
[
"(0\??|=|\?)", # noqa: W605
r"(0\??|=|\?)",
substitution, # 4: ref, 5: position, 6: alt
duplication, # 7: start, 8: start pos, 10: end, 11: end pos
deletion, # 12: start, 13: start pos, 15: end, 16: end pos
Expand All @@ -232,18 +218,11 @@ def _build_protein_pattern():

@functools.lru_cache
def _build_silent_dinucleotide():
prefix = "^(([A-z0-9_\.\(\)-]+):)?p\.\(?" # noqa: W605
postfix = "\)?$" # noqa: W605
prefix = r"^(([A-z0-9_\.\(\)-]+):)?p\.\(?"
postfix = r"\)?$"

aa = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + ")"
aaTer = (
"("
+ "|".join(aa_codes_short)
+ "|"
+ "|".join(aa_codes_long)
+ "|\*|Ter" # noqa: W605
+ ")"
)
aaTer = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + r"|\*|Ter" + ")"
nb = "([0-9]+)"

return re.compile(prefix + aa + aaTer + "*" + nb + "=" + postfix)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ def somatic_variant_signatures_workflow(
def test_tabulate_vcf_step_part_get_input_files(somatic_variant_signatures_workflow):
"""Tests TabulateVariantsStepPart.get_input_files()"""
base_name = (
"SOMATIC_VARIANT_FILTRATION/output/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter}.{region}/out/"
"{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter}.{region}"
"SOMATIC_VARIANT_FILTRATION/output/{mapper}.{var_caller}.{anno_caller}.filtered.{tumor_library}/out/"
"{mapper}.{var_caller}.{anno_caller}.filtered.{tumor_library}"
)
expected = {
"vcf": base_name + ".vcf.gz",
Expand All @@ -104,8 +104,8 @@ def test_tabulate_vcf_step_part_get_output_files(somatic_variant_signatures_work
"""Tests TabulateVariantsStepPart.get_output_files()"""
expected = {
"tsv": (
"work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}/out/"
"{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}.tsv"
"work/{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}/out/"
"{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}.tsv"
)
}
actual = somatic_variant_signatures_workflow.get_output_files("tabulate_vcf", "run")
Expand All @@ -114,7 +114,7 @@ def test_tabulate_vcf_step_part_get_output_files(somatic_variant_signatures_work

def test_tabulate_vcf_step_part_get_log_file(somatic_variant_signatures_workflow):
"""Tests TabulateVariantsStepPart.get_log_file()"""
expected = "work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}/log/snakemake.tabulate_vcf.log"
expected = "work/{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}/log/snakemake.tabulate_vcf.log"
actual = somatic_variant_signatures_workflow.get_log_file("tabulate_vcf", "run")
assert actual == expected

Expand Down Expand Up @@ -145,8 +145,8 @@ def test_deconstruct_sigs_step_part_get_input_files(somatic_variant_signatures_w
"""Tests DeconstructSigsStepPart.get_input_files()"""
expected = {
"tsv": (
"work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}/out/"
"{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}.tsv"
"work/{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}/out/"
"{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}.tsv"
)
}
actual = somatic_variant_signatures_workflow.get_input_files("deconstruct_sigs", "run")
Expand All @@ -156,8 +156,8 @@ def test_deconstruct_sigs_step_part_get_input_files(somatic_variant_signatures_w
def test_deconstruct_sigs_step_part_get_output_files(somatic_variant_signatures_workflow):
"""Tests DeconstructSigsStepPart.get_output_files()"""
base_name_out = (
"work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.deconstruct_sigs.{tumor_library}.{filter}.{region}/out/"
"{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.deconstruct_sigs.{tumor_library}.{filter}.{region}"
"work/{mapper}.{var_caller}.{anno_caller}.filtered.deconstruct_sigs.{tumor_library}/out/"
"{mapper}.{var_caller}.{anno_caller}.filtered.deconstruct_sigs.{tumor_library}"
)
expected = {
"tsv": base_name_out + ".tsv",
Expand All @@ -170,7 +170,7 @@ def test_deconstruct_sigs_step_part_get_output_files(somatic_variant_signatures_
def test_deconstruct_sigs_step_part_get_log_file(somatic_variant_signatures_workflow):
"""Tests DeconstructSigsStepPart.get_log_file()"""
expected = (
"work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.deconstruct_sigs.{tumor_library}.{filter}.{region}/log/"
"work/{mapper}.{var_caller}.{anno_caller}.filtered.deconstruct_sigs.{tumor_library}/log/"
"snakemake.deconstruct_sigs.log"
)
actual = somatic_variant_signatures_workflow.get_log_file("deconstruct_sigs", "run")
Expand Down Expand Up @@ -201,7 +201,7 @@ def test_somatic_variant_signatures_workflow(somatic_variant_signatures_workflow
assert actual == expected

# Check result file construction
name_pattern = "{mapper}.{caller}.{annotator}.dkfz_bias_filter.eb_filter.deconstruct_sigs.P00{i}-T{t}-DNA1-WGS1.{filt}.{region}"
name_pattern = "{mapper}.{caller}.{annotator}.filtered.deconstruct_sigs.P00{i}-T{t}-DNA1-WGS1"
tpl = "output/" + name_pattern + "/out/" + name_pattern + ".tsv"
expected = [
tpl.format(
Expand All @@ -210,21 +210,11 @@ def test_somatic_variant_signatures_workflow(somatic_variant_signatures_workflow
annotator=annotator,
i=i,
t=t,
filt=filt,
region=region,
)
for i, t in ((1, 1), (2, 1), (2, 2))
for mapper in ("bwa",)
for caller in ("mutect", "scalpel")
for annotator in ("vep", "jannovar")
for filt in (
"no_filter",
"dkfz_only",
"dkfz_and_ebfilter",
"dkfz_and_ebfilter_and_oxog",
"dkfz_and_oxog",
)
for region in ("genome_wide",)
for caller in ("mutect",)
for annotator in ("vep",)
]
expected = set(expected)
actual = set(somatic_variant_signatures_workflow.get_result_files())
Expand Down

0 comments on commit 257c542

Please sign in to comment.