# common.smk
import itertools
import json
import jsonschema
import os

containers = {
    'bcftools': 'docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4',
    'bedtools-2.26-python-2.7': 'docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0',
    'bwa-0.7.17-samtools-1.10': 'docker://quay.io/biocontainers/mulled-v2-ad317f19f5881324e963f6a6d464d696a2825ab6:c59b7a73c87a9fe81737d5d628e10a3b5807f453-0',
    'chunked-scatter': 'docker://quay.io/biocontainers/chunked-scatter:1.0.0--py_0',
    'cutadapt': 'docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0',
    'debian': 'docker://debian:buster-slim',
    'fastqc': 'docker://quay.io/biocontainers/fastqc:0.11.7--4',
    'gatk': 'docker://broadinstitute/gatk3:3.7-0',
    'gvcf2coverage': 'docker://lumc/gvcf2coverage:0.1-dirty-2',
    'multiqc': 'docker://quay.io/biocontainers/multiqc:1.8--py_2',
    'picard': 'docker://quay.io/biocontainers/picard:2.22.8--0',
    'python3': 'docker://python:3.6-slim',
    'vtools': 'docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0'
}
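
# A minimal sketch of how these URIs are typically consumed; the rule below is
# hypothetical (not part of this workflow) and only illustrates the Snakemake
# `container` directive:
#
#   rule example:
#       output: 'example.txt'
#       container: containers['debian']
#       shell: 'touch {output}'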


def process_config():
    """ Process the config file and set the default values """
    def set_default(key, value):
        """Set a default config value if the key is absent"""
        if key not in config:
            config[key] = value

    # Read the json schema
    with open(srcdir('config/schema.json'), 'rt') as fin:
        schema = json.load(fin)

    # Validate the config against the schema
    try:
        jsonschema.validate(config, schema)
    except jsonschema.ValidationError as e:
        raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')

    # If you specify a baitsfile, you also have to specify a targetsfile for
    # picard
    if 'baitsfile' in config and 'targetsfile' not in config:
        msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
        raise jsonschema.ValidationError(msg)

    # If you specify a targetsfile but no baitsfile, we use the targets as
    # baits. This is needed because picard CollectHsMetrics requires both a
    # baits file and a targets file as input
    if 'targetsfile' in config and 'baitsfile' not in config:
        set_default('baitsfile', config['targetsfile'])

    # A sample name cannot be a substring of another sample name, since that
    # breaks the parsing of picard metrics by multiqc
    msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
    for s1, s2 in itertools.permutations(config['samples'], 2):
        if s1 in s2:
            raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))

    # Set the default config values
    set_default('scatter_size', 1000000000)
    set_default('female_threshold', 0.6)
    set_default('multisample_vcf', False)

    # Hide the absolute path so the snakemake linter doesn't cry about it
    set_default('gatk_jar', os.path.join(os.path.sep, 'usr', 'GenomeAnalysisTK.jar'))
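
# For illustration, a config that would pass the checks above might look like
# the sketch below. The authoritative definition lives in config/schema.json;
# the sample names, paths and thresholds here are hypothetical:
#
#   {
#       "samples": {
#           "sample1": {"read_groups": {"lib1": {...}}}
#       },
#       "targetsfile": "targets.bed",
#       "coverage_threshold": [10, 30]
#   }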


def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv",
                   sample=config["samples"])
    return files if "refflat" in config else []


def coverage_files(wildcards):
    """ Return a list of all coverage files

    The coverage is calculated for each sample, for each specified threshold
    """
    # We only calculate the coverage when this is specified in the
    # configuration
    if 'coverage_threshold' not in config:
        return list()

    # Fetch the values we need from the configuration
    samples = config['samples']
    thresholds = config['coverage_threshold']

    files = list()
    for sample, threshold in itertools.product(samples, thresholds):
        files.append(f'{sample}/vcf/{sample}_{threshold}.bed')
    return files
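
# For example, with samples "s1" and "s2" and coverage_threshold [10, 30]
# (hypothetical values), itertools.product yields every sample/threshold pair:
#   ['s1/vcf/s1_10.bed', 's1/vcf/s1_30.bed',
#    's2/vcf/s2_10.bed', 's2/vcf/s2_30.bed']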


def sample_bamfiles(wildcards):
    """ Determine the bam files for a sample (one for each readgroup)
    """
    files = list()
    sample = config['samples'][wildcards.sample]
    sample_name = wildcards.sample
    for read_group in sample['read_groups']:
        files.append(f'{sample_name}/bams/{sample_name}-{read_group}.sorted.bam')
    return files
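
# For a sample "s1" with read groups "lib1" and "lib2" (hypothetical names),
# this returns ['s1/bams/s1-lib1.sorted.bam', 's1/bams/s1-lib2.sorted.bam'].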


def gather_gvcf(wildcards):
    """ Gather the gvcf files based on the scatterregions checkpoint

    This depends on the 'scatter_size' parameter and the reference genome
    used
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
                  i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


def gather_gvcf_tbi(wildcards):
    """ Gather the gvcf index files based on the scatterregions checkpoint

    This depends on the 'scatter_size' parameter and the reference genome
    used
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
                  i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


def gather_vcf(wildcards):
    """ Gather the vcf files based on the scatterregions checkpoint

    This depends on the 'scatter_size' parameter and the reference genome
    used
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
                  i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


def gather_vcf_tbi(wildcards):
    """ Gather the vcf index files based on the scatterregions checkpoint

    This depends on the 'scatter_size' parameter and the reference genome
    used
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
                  i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
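
# These gather functions are only evaluated after the scatterregions
# checkpoint has run, at which point glob_wildcards can see which
# scatter-{i}.bed chunks were actually produced. A minimal sketch of how such
# a function is consumed as a rule input (the rule name and output path below
# are hypothetical):
#
#   rule merge_gvcf:
#       input:
#           gvcfs = gather_gvcf,
#           tbis = gather_gvcf_tbi,
#       output:
#           '{sample}/vcf/{sample}.g.vcf.gz'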


def sample_cutadapt_files(wildcards):
    """ Determine the cutadapt log files for a sample (one for each
    readgroup).
    """
    files = list()
    sample = config['samples'][wildcards.sample]
    sample_name = wildcards.sample
    for read_group in sample['read_groups']:
        files.append(f'{sample_name}/pre_process/{sample_name}-{read_group}.txt')
    return files


def all_trimmed_fastqc(wildcards):
    """ Determine the fastqc '.done' flag files for the trimmed fastq files
    of every sample (one per readgroup)
    """
    fastq_files = list()
    for sample in config['samples']:
        for read_group in config['samples'][sample]['read_groups']:
            fastq_files.append(f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done")
    return fastq_files
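
# With samples "s1" (read groups "lib1", "lib2") and "s2" (read group "lib1"),
# all hypothetical, this yields:
#   ['s1/pre_process/trimmed-s1-lib1/.done',
#    's1/pre_process/trimmed-s1-lib2/.done',
#    's2/pre_process/trimmed-s2-lib1/.done']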