Merge pull request #34 from ghga-de/20-output-proper-vcf
re-arrange resources for dkfz cluster
kubranarci authored Jun 11, 2024
2 parents 955868b + cebc260 commit a4e5bad
Showing 25 changed files with 375 additions and 230 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/main.yml
@@ -39,6 +39,9 @@ jobs:
- name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs
run: |
if [[ $NXF_VER = '' ]]; then sleep 1200; fi
- name: BASIC Run the basic pipeline only for SNV calling with docker
- name: BASIC Run the basic pipeline with docker
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker
- name: BASIC Run the basic pipeline when --runcontig set to "ALT_HLA" (contigs does not exist in bam)
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --runcontigs "ALT_HLA"
3 changes: 3 additions & 0 deletions .gitignore
@@ -10,9 +10,12 @@
__pycache__
output/
results/
result/
results37/
test.xml
test_output/
tests/data/
work/
testdata_hg37/
.github/CODEOWNERS-tmp
bin/vcfparser.pyc
27 changes: 21 additions & 6 deletions assets/config/convertToStdVCF.json
@@ -42,12 +42,12 @@
"dbSNP": {
"number": "0",
"type": "Flag",
"description": "variant in dbSNP database used: dbsnp_snv"
"description": "variant in dbSNP database used: dbsnp_indel"
},
"DB": {
"number": "0",
"type": "Flag",
"description": "variant in 1000Genomes (k_genomes) or dbSNP (dbsnp_snv)"
"description": "variant in 1000Genomes (k_genomes) or dbSNP (dbsnp_indel)"
},
"HSDEPTH": {
"number": "0",
@@ -72,7 +72,7 @@
"FREQ": {
"number": "0",
"type": "Flag",
"description": "variant allele frequency below ?"
"description": "Variant allele frequency above 0.001 in GnomAD or above 0.001 in local control database"
},
"TAR": {
"number": "0",
@@ -102,7 +102,7 @@
"ALTC": {
"number": "0",
"type": "Flag",
"description": "Alternative reads in control"
"description": "Alternative reads in control and other filter not PASS"
},
"ALTCFR": {
"number": "0",
@@ -122,7 +122,7 @@
"VAF": {
"number": "0",
"type": "Flag",
"description": "Variant allele frequency in tumor < ' + str(args.newpun) + ' times allele frequency in control"
"description": "Variant allele frequency in tumor < 10% and other filter not PASS"
},
"TSR_ctrl": {
"number": "0",
@@ -172,7 +172,7 @@
"QD": {
"number": "1",
"type": "Float",
"description": "Variant-quality/read-depth for this variant"
"description": "Variants fail quality/depth filter."
},
"Q20": {
"number": "0",
@@ -203,6 +203,16 @@
"number": "0",
"type": "Flag",
"description": "Too many haplotypes are supported by the data in this region."
},
"MQ": {
"number": "0",
"type": "Flag",
"description": "Root-mean-square mapping quality across calling region is low"
},
"SC": {
"number": "0",
"type": "Flag",
"description": "Variants fail sequence-context filter. Surrounding sequence is low-complexity"
}
},

@@ -815,6 +825,11 @@
"type": "Integer",
"description": "Sequencing Bias Check 2",
"new_info_id": "SB2"
},
"MAFCommon": {
"number": "1",
"type": "String",
"description": "Indicates that the variant is common in the gnomAD or local controls."
}
}
}
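For orientation: each entry in this config carries an ID together with Number, Type and Description, the same fields a VCF meta-information line needs. The sketch below is not part of this commit and makes no assumption about which section of the config (FILTER vs INFO) an ID belongs to; it only illustrates how an entry such as the new MQ definition could be rendered as a header line. The actual conversion is done by bin/convertToStdVCF.py.

# Illustrative sketch only; values copied from the MQ entry above.
entry_id = "MQ"
entry = {"number": "0", "type": "Flag",
         "description": "Root-mean-square mapping quality across calling region is low"}

# INFO-style meta line (Number and Type belong to INFO/FORMAT lines):
info_line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
    entry_id, entry["number"], entry["type"], entry["description"])

# FILTER-style meta line (FILTER lines carry only ID and Description):
filter_line = '##FILTER=<ID={0},Description="{1}">'.format(entry_id, entry["description"])

print(info_line)
print(filter_line)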
3 changes: 2 additions & 1 deletion assets/samplesheet_hg38_WGS.csv
@@ -1,2 +1,3 @@
sample,tumor,tumor_index,control,control_index
SEQC2_LL1,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/tumor01_SEQC2_LL1_merged.mdup.bam,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/tumor01_SEQC2_LL1_merged.mdup.bam.bai,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/control01_SEQC2_LL1_merged.mdup.bam,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/control01_SEQC2_LL1_merged.mdup.bam.bai
SEQC2_LL1,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/tumor01_SEQC2_LL1_merged.mdup.bam,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/tumor01_SEQC2_LL1_merged.mdup.bam.bai,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/control01_SEQC2_LL1_merged.mdup.bam,/omics/odcf/analysis/OE0526_projects/public_data_analyses/seqc2/sequencing/whole_genome_sequencing/results_per_pid/SEQC2_LL1/alignment/control01_SEQC2_LL1_merged.mdup.bam.bai
SEQC2_LL2,/omics/odcf/project/public_data/seqc2/sequencing/whole_genome_sequencing/view-by-pid/SEQC2_IL2/tumor01/paired/merged-alignment/tumor01_SEQC2_IL2_merged.mdup.bam,/omics/odcf/project/public_data/seqc2/sequencing/whole_genome_sequencing/view-by-pid/SEQC2_IL2/tumor01/paired/merged-alignment/tumor01_SEQC2_IL2_merged.mdup.bam.bai,/omics/odcf/project/public_data/seqc2/sequencing/whole_genome_sequencing/view-by-pid/SEQC2_IL2/control01/paired/merged-alignment/control01_SEQC2_IL2_merged.mdup.bam,/omics/odcf/project/public_data/seqc2/sequencing/whole_genome_sequencing/view-by-pid/SEQC2_IL2/control01/paired/merged-alignment/control01_SEQC2_IL2_merged.mdup.bam.bai
122 changes: 69 additions & 53 deletions bin/confidenceAnnotation_SNVs.py
@@ -37,6 +37,56 @@ def validate_refgenome(refname):
if refname not in valid_refgenome:
raise ValueError('Reference name (--refgenome) is not valid: %s. Valid reference genome names are %s' % (refname, ', '.join(valid_refgenome)))

## Helper functions
def is_hg37(args):
return args.refgenome[0] == "hs37d5"

def is_hg38(args):
return args.refgenome[0] == "GRCh38"

def get_fixed_headers(args):
fixed_headers = ["^INFO$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER",
"^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
"^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
"^seqBiasPresent_2$", "^seqingBiasPresent_2$"]

hs37d5_headers = ["DAC_BLACKLIST", "DUKE_EXCLUDED", "HISEQDEPTH", "SELFCHAIN"]
if is_hg37(args):
fixed_headers += hs37d5_headers

if not args.no_control:
fixed_headers += ["^INFO_control", "^ANNOTATION_control$"]

return fixed_headers

def get_variable_headers(args):
variable_headers = {
"ANNOVAR_SEGDUP_COL": "^SEGDUP$",
"KGENOMES_COL": "^1K_GENOMES$",
"DBSNP_COL": "^DBSNP$"
}

if args.no_control or is_hg38(args) or args.skipREMAP:
variable_headers.update({
"GNOMAD_EXOMES_COL": "^GNOMAD_EXOMES$",
"GNOMAD_GENOMES_COL": "^GNOMAD_GENOMES$",
"LOCALCONTROL_WGS_COL": "^LocalControlAF_WGS$",
"LOCALCONTROL_WES_COL": "^LocalControlAF_WES$"
})

return variable_headers

def check_max_maf(headers, help, args, column_name, max_maf_attribute):
column_valid_key = "{0}_VALID".format(column_name)
in_column = False

if help[column_valid_key]:
maf_values = map(float, extract_info(help[column_name], "AF").split(','))
if any(af > max_maf_attribute for af in maf_values):
in_column = True

return in_column

def main(args):

validate_refgenome(args.refgenome[0])
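The helper functions added above replace inline header lists and four near-identical allele-frequency checks that previously lived inside main() (see the hunks further down). The snippet below restates the core of check_max_maf in isolation; it is a simplified sketch, and _extract_af is only a stand-in for the script's extract_info(), which is not shown in this diff.

# Simplified, self-contained restatement of the check_max_maf logic; not part of this commit.
def _extract_af(annotation, key="AF"):
    # Stand-in for extract_info(): pull a key's value out of an INFO-like string.
    for pair in annotation.split(";"):
        if pair.startswith(key + "="):
            return pair.split("=", 1)[1]
    return None

def exceeds_max_maf(annotation_column, max_maf):
    # True if any allele frequency in the column is above the threshold.
    af = _extract_af(annotation_column)
    if af is None:
        return False
    return any(float(value) > max_maf for value in af.split(","))

# A gnomAD-style annotation with two ALT alleles: one common, one rare.
print(exceeds_max_maf("AC=12;AF=0.0300,0.0001;AN=4000", max_maf=0.001))   # True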
@@ -53,6 +103,7 @@ def main(args):
header += '##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation">\n' \
'##INFO=<ID=GERMLINE,Number=0,Type=Flag,Description="Indicates if record is a germline mutation">\n' \
'##INFO=<ID=UNCLEAR,Number=0,Type=Flag,Description="Indicates if the somatic status of a mutation is unclear">\n' \
'##INFO=<ID=MAFCommon,Number=0,Type=Flag,Description="Indicates if the variant is present in the gnomAD or local control database">\n' \
'##INFO=<ID=VT,Number=1,Type=String,Description="Variant type, can be SNP, INS or DEL">\n' \
'##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency in primary data, for each ALT allele, in the same order as listed">\n' \
'##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership">\n' \
@@ -83,7 +134,6 @@ def main(args):
'##FILTER=<ID=YALT,Description="Variant on Y chromosome with low allele frequency">\n' \
'##FILTER=<ID=VAF,Description="Variant allele frequency in tumor < ' + str(args.newpun) + ' times allele frequency in control">\n' \
'##FILTER=<ID=BI,Description="Bias towards a PCR strand or sequencing strand">\n' \
'##FILTER=<ID=FREQ,Description="High frequency in GnomAD(>0.1%) or in local control database (>0.05%)">\n' \
'##SAMPLE=<ID=CONTROL,SampleName=control_' + args.pid + ',Individual=' + args.pid + ',Description="Control">\n' \
'##SAMPLE=<ID=TUMOR,SampleName=tumor_'+args.pid+',Individual='+args.pid+',Description="Tumor">\n'\
'#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
@@ -107,26 +157,9 @@ def main(args):

if line[0] == "#":
headers = list(line[1:].rstrip().split('\t'))
fixed_headers = ["^INFO$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER",
"^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
"^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
"^seqBiasPresent_2$", "^seqingBiasPresent_2$"]

if args.refgenome[0] == "hs37d5":
fixed_headers = ["^INFO$", "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", "REPEAT_MASKER", "DUKE_EXCLUDED",
"DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", "^RECLASSIFICATION$", "^PENALTIES$",
"^seqBiasPresent$", "^seqingBiasPresent$", "^seqBiasPresent_1$", "^seqingBiasPresent_1$",
"^seqBiasPresent_2$", "^seqingBiasPresent_2$"]

variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", }

if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP:
variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$"
variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$"
variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$"
variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$"
if not args.no_control:
fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ]
fixed_headers = get_fixed_headers(args)
variable_headers = get_variable_headers(args)

header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers)

@@ -206,7 +239,7 @@ def main(args):
# for potential re-classification (e.g. low coverage in control and in dbSNP => probably germline)
classification = help["ANNOTATION_control"] # start with original classification

if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP:
if args.no_control or is_hg38(args) or args.skipREMAP:
inGnomAD_WES = False
inGnomAD_WGS = False
inLocalControl_WES = False
@@ -228,7 +261,7 @@ def main(args):
in1KG = True
if args.no_control:
af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF")
if af is not None and any(af > args.kgenome_maxMAF for af in map(float, af.split(','))):
if af is not None and any(float(af) > args.kgenome_maxMAF for af in af.split(',')):
in1KG_AF = True
infofield["1000G"] = "1000G"
# dbSNP
@@ -247,27 +280,11 @@ def main(args):
if "COMMON=1" in help["DBSNP_COL"]:
is_commonSNP = True

if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP:
if indbSNP and is_commonSNP and not is_clinic:
#reasons += "dbSNP(NoControl)"
pass
if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))):
inGnomAD_WES = True
infofield["gnomAD_Exomes"] = "gnomAD_Exomes"
#reasons += "gnomAD_Exomes(NoControl)"
if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))):
inGnomAD_WGS = True
infofield["gnomAD_Genomes"] = "gnomAD_Genomes"
#reasons += "gnomAD_Genomes(NoControl)"

if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))):
inLocalControl_WGS = True
infofield["LocalControl_WGS"] = "LocalControl_WGS"
#reasons += "LocalControl_WGS(NoControl)"
if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))):
inLocalControl_WES = True
infofield["LocalControl_WES"] = "LocalControl_WES"
#reasons += "LocalControl_WES(NoControl)"
if args.no_control or is_hg38(args) or args.skipREMAP:
inGnomAD_WES = check_max_maf(headers, help, args, "GNOMAD_EXOMES_COL", args.gnomAD_WES_maxMAF)
inGnomAD_WGS = check_max_maf(headers, help, args, "GNOMAD_GENOMES_COL", args.gnomAD_WGS_maxMAF)
inLocalControl_WGS = check_max_maf(headers, help, args, "LOCALCONTROL_WGS_COL", args.localControl_WGS_maxMAF)
inLocalControl_WES = check_max_maf(headers, help, args, "LOCALCONTROL_WES_COL", args.localControl_WES_maxMAF)


# Punish for biases round 1
@@ -316,7 +333,7 @@ def main(args):
filterfield["BI"] = 1

# Only for hg19 reference genome
if args.refgenome[0] == "hs37d5" and not args.skipREMAP:
if is_hg37(args) and not args.skipREMAP:
# 2) annotations of regions that cause problems: some classes of repeats from RepeatMasker track,
# segmental duplications, (cf. Reumers et al. 2012, Nature Biotech 30:61), external blacklists, mapability
# simple repeats and low complexity (not the same as homopolymer, but similar enough);
@@ -405,13 +422,13 @@ def main(args):
# the confidence are reduced and thrown out. This is implemented for hg38 and could be
# used with skipREMAP option for hg19.
# inLocalControl_WES: Needs to be generated from a new hg38 dataset
filterfield["FREQ"] = 0
if(args.refgenome[0] == 'GRCh38' or args.skipREMAP):
common_tag = ""
if(is_hg38(args) or args.skipREMAP):
if(inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS):
#reasons += 'commonSNP_or_technicalArtifact(-3)'
#classification = "SNP_support_germline"
#confidence -= 3
filterfield["FREQ"] = 1
common_tag = ";MAFCommon"
# Add the common tag to the INFO field
if common_tag not in entries[7]:
entries[7] += common_tag

# if others have found the SNP already, it may be interesting despite low score
# - but only if it's not a weird region.
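Net effect of this hunk: a variant that is common in gnomAD or the local controls is no longer flagged with FREQ in the FILTER column (that header line is removed above) but instead gets a MAFCommon flag appended to its INFO column, matching the ##INFO=<ID=MAFCommon,...> header added earlier. A toy illustration with made-up values, not pipeline output:

# entries is the tab-split VCF record; entries[7] is the INFO column.
entries = ["chr1", "12345", ".", "C", "T", ".", "PASS",
           "SOMATIC;VT=SNP;AF=0.42", "GT:DP", "0/1:60"]

common = True                                   # outcome of the gnomAD / local-control checks
common_tag = ";MAFCommon" if common else ""

# Mirrors the new code above: an empty tag is always "in" the string,
# so nothing is appended for variants that are not common.
if common_tag not in entries[7]:
    entries[7] += common_tag

print(entries[7])                               # SOMATIC;VT=SNP;AF=0.42;MAFCommon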
@@ -605,8 +622,7 @@ def main(args):
reasons += "Germline_ALT<0.3(-2)"
filterfield["FRC"] = 1
if in1KG or (indbSNP and not (is_precious or is_clinic)): # but this supports again - number of reads may be low!
if filterfield['FREQ'] == 0:
classification += "_SNP_support"
classification += "_SNP_support"

if depthC <= 10: # very probably germline due to skewed distribution at low coverage
classification += "_lowCov" # => can end up as "germline_SNP_support_lowCov"
@@ -701,7 +717,7 @@ def main(args):
filters_line = [] if entries[6] == "" else entries[6].split(';')
if args.pancanout is not None:
filters_pancan = []
for filter in ("RE","BL","DP","SB","TAC","dbSNP","DB","HSDEPTH","MAP","SBAF","FRQ","TAR","UNCLEAR","DPHIGH","DPLOWC","1PS","ALTC","ALTCFR","FRC","YALT","VAF","BI", "FREQ"):
for filter in ("RE","BL","DP","SB","TAC","dbSNP","DB","HSDEPTH","MAP","SBAF","FRQ","TAR","UNCLEAR","DPHIGH","DPLOWC","1PS","ALTC","ALTCFR","FRC","YALT","VAF","BI"):
if filterfield.get(filter, 0) == 1:
if args.pancanout is not None:
filters_pancan.append(filter)
23 changes: 17 additions & 6 deletions bin/convertToStdVCF.py
@@ -66,10 +66,21 @@ def convert_str_to_dict(str_to_convert, pair_separator, key_val_separator):
:param key_val_separator: String or character that separates keys from values
:return: dictionary that was created from the string
"""
pairs = str_to_convert.split(pair_separator)
valid_pairs = filter(lambda pair: len(pair.split(key_val_separator)) == 2, pairs)
return dict(pair.split(key_val_separator) for pair in valid_pairs)

valid_pairs = {}

pairs = str_to_convert.split(pair_separator)
for pair in pairs:
key_val = pair.split(key_val_separator)
# convert the key-value pairs
if len(key_val) == 2:
key, value = key_val
valid_pairs[key] = value
# convert the flags to pairs with value "true"
elif len(key_val) == 1:
key = key_val[0]
valid_pairs[key] = "true"

return valid_pairs

def convert_dict_to_str(dict_to_convert, pair_separator, key_val_separator):
"""
@@ -192,7 +203,7 @@ def update_keys_used(line, col_indices, keys_used):
line_as_list = line.rstrip().split("\t")

new_line = check_line(line_as_list[col_indices["INFO"]])
print(new_line)
#print(new_line)
INFO_old_keys = convert_str_to_dict(new_line, ';', '=').keys()
#INFO_old_keys = convert_str_to_dict(line_as_list[col_indices["INFO"]], ';', '=').keys()
keys_used["INFO"].update(INFO_old_keys)
@@ -288,7 +299,7 @@ def extract_desired_INFO_fields_from_all(raw_INFO_field, used_and_desired_keys,
:return: Dictionary with contents from the INFO or INFO_control field
"""
old_INFO_dict = convert_str_to_dict(raw_INFO_field, ";", "=")
print(old_INFO_dict)
#print(old_INFO_dict)

if rename_control:
for key in old_INFO_dict.keys():
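The rewritten convert_str_to_dict above no longer drops INFO entries that have no value: bare flags such as SOMATIC or MAFCommon are now kept with the placeholder value "true", whereas the old filter-based one-liner silently discarded them. A quick stand-alone demonstration (the function body is copied from the diff above):

def convert_str_to_dict(str_to_convert, pair_separator, key_val_separator):
    # Stand-alone copy for demonstration only.
    valid_pairs = {}
    pairs = str_to_convert.split(pair_separator)
    for pair in pairs:
        key_val = pair.split(key_val_separator)
        # convert the key-value pairs
        if len(key_val) == 2:
            key, value = key_val
            valid_pairs[key] = value
        # convert the flags to pairs with value "true"
        elif len(key_val) == 1:
            key = key_val[0]
            valid_pairs[key] = "true"
    return valid_pairs

print(convert_str_to_dict("SOMATIC;AF=0.42;MAFCommon", ";", "="))
# {'SOMATIC': 'true', 'AF': '0.42', 'MAFCommon': 'true'}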