ccmbioinfo · Madelinehazel · Feb 18, 2021 · Feb 18, 2021 · Mar 11, 2021
diff --git a/SVRecords/SVGrouper.py b/SVRecords/SVGrouper.py
@@ -56,7 +56,7 @@ def split_Ensembl_ids(id_list):
         ann_dfs = []
 
         index_fields = list(self.index_cols.keys()) #CHR POS STOP needs to be first 3 columns for creation of BedTool instance
-        sample_sv_fields = index_fields + ['calldata/GT', 'variants/ANN_Gene_ID', 'samples']
+        sample_sv_fields = index_fields + ['variants/ALT'] + ['calldata/GT', 'variants/ANN_Gene_ID', 'samples']
         parse_fields = list(set(sample_sv_fields + ann_fields))
 
         for vcf_path in vcf_paths:
@@ -69,8 +69,8 @@ def split_Ensembl_ids(id_list):
             # if 'chr' in CHROM field, remove
             vcf_dict['variants/CHROM'] = [chrom.strip('chr') for chrom in vcf_dict['variants/CHROM']]
 
-            # if 'chr' in CHROM field, remove
-            vcf_dict['variants/CHROM'] = [chrom.strip('chr') for chrom in vcf_dict['variants/CHROM']]
+            # grab alt allele
+            vcf_dict['variants/ALT'] = [alt[0] for alt in vcf_dict['variants/ALT']]
 
             # drop un-needed fields from vcf, cannot pass in parse_fields to read_vcf() because ANN_gene_id is unknown until ANNTransformer runs
             for key in list(vcf_dict.keys()):
@@ -95,6 +95,13 @@ def split_Ensembl_ids(id_list):
             df['variants/END'] = df['variants/END'].astype(int)
             df = df.drop_duplicates()
 
+            # for BND, make POS=END
+            bnd = df['variants/SVTYPE'] == 'BND'
+            df.loc[bnd, ['variants/POS']] = df.loc[bnd, ['variants/END']].values
+            df['variants/END'] = df['variants/END'].astype(int)
+            df['variants/POS'] = df['variants/POS'].astype(int)
+            df = df.drop_duplicates()
+
             intervals.extend(df[sample_sv_fields].itertuples(index=False))
 
             if ann_fields:
@@ -103,8 +110,8 @@ def split_Ensembl_ids(id_list):
         ann_df = pd.concat(ann_dfs).astype(str).rename(columns=self.index_cols).set_index(list(self.index_cols.values())) if ann_fields else pd.DataFrame()
         ann_df = ann_df[~ann_df.index.duplicated(keep='first')] #annotations for the same SV in a vcf can have slighly differing fields (ex. SVSCORE_MEAN)
 
-        for i in intervals:
-            print(i)
+        # for i in intervals:
+        #     print(i)
 
         return BedTool(intervals), ann_df, sample_names
 
@@ -113,11 +120,11 @@ def _group_sv(self, bedtool, reciprocal_overlap=0.5):
 
         for l in bedtool.intersect(bedtool, wa=True, wb=True, F=reciprocal_overlap, f=reciprocal_overlap):
 
-            ref_chr, ref_start, ref_end, ref_svtype, ref_gt, ref_genes, ref_name, \
-            samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_genes, samp_name = l
+            ref_chr, ref_start, ref_end, ref_svtype, ref_alt, ref_gt, ref_genes, ref_name, \
+            samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_genes, samp_name = l
 
-            ref_interval = (ref_chr, ref_start, ref_end, ref_svtype)
-            samp_interval = (samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_name)
+            ref_interval = (ref_chr, ref_start, ref_end, ref_svtype, ref_alt)
+            samp_interval = (samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_name)
 
             if (samp_interval not in already_grouped_intervals) and (ref_svtype == samp_svtype):
                 self._add_interval(ref_interval, ref_genes, samp_interval)
@@ -126,13 +133,13 @@ def _group_sv(self, bedtool, reciprocal_overlap=0.5):
         self.df.sort_index(inplace=True)
 
     def _add_interval(self, ref_interval, ref_genes, samp_interval):
-        samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_name = samp_interval
+        samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_name = samp_interval
 
         #Get reference to row
-        if ref_interval not in self.df.index:
+        if ref_interval[:-1] not in self.df.index:
             #make new row
-            self.df.loc[ref_interval, :] = np.nan
-            row = self.df.loc[ref_interval, :]
+            self.df.loc[ref_interval[:-1],  :] = np.nan
+            row = self.df.loc[ref_interval[:-1], :]
             row['N_SAMPLES'] = 0
             row['Ensembl Gene ID'] = ref_genes
 
@@ -141,14 +148,17 @@ def _add_interval(self, ref_interval, ref_genes, samp_interval):
                 row["%s_SV_DETAILS" % name] = []
                 row["%s_GENOTYPE" % name] = []
         else:
-            row = self.df.loc[ref_interval, :]
+            row = self.df.loc[ref_interval[:-1], :]
 
         #Set values for row
         if row[samp_name] == 0:
             row['N_SAMPLES'] += 1
             row[samp_name] = 1
 
-        row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype))
+        if samp_svtype == 'BND':
+            row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype, samp_alt))
+        else:
+            row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype))
         row["%s_GENOTYPE" % samp_name].append(samp_gt)
 
     def write(self, outfile_name):

diff --git a/crg.intersect_sv_vcfs.py b/crg.intersect_sv_vcfs.py
@@ -15,7 +15,10 @@ def main(protein_coding_genes, exon_bed, hgmd_db, hpo, exac, omim, biomart, gnom
     protein_coding_ENSG = make_exon_gene_set(protein_coding_genes)
 
     print("Grouping like structural variants ...")
-    sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols + [MetaSV_col])
+    try:
+        sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols + [MetaSV_col])
+    except KeyError:
+        sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols)
     sample_cols = [ col for col in sv_records.df.columns if col != MetaSV_col ]
     sample_genotype_cols = [col for col in sample_cols if col.endswith('_GENOTYPE')]
 
@@ -42,6 +45,8 @@ def main(protein_coding_genes, exon_bed, hgmd_db, hpo, exac, omim, biomart, gnom
             sv_records.df[col] = "na"
 
     # format and rearrange the columns
+    if MetaSV_col not in sv_records.df.columns:
+        sv_records.df['variants/NUM_SVTOOLS'] = '1'
     sv_records.df = sv_records.df[ [col for col in sample_cols if col not in set(SVScore_cols + sample_genotype_cols + ['N_SAMPLES', 'Ensembl Gene ID']) ] + \
     [ 'variants/SVLEN', ] + \
     [ MetaSV_col ] + \

diff --git a/crg.intersect_sv_vcfs.sh b/crg.intersect_sv_vcfs.sh
@@ -52,23 +52,27 @@ do
 	tabix $FILE
 done
 
-#make filtered report
-echo "${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}"
-${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}
+MANTA_BND=`ls ${FAMILY}_*/${FAMILY}/final/${FAMILY}*/*manta.BND.vcf.gz.snpeff.vcf.svscore.vcf | tr '\n' ' '`
 
-${HOME}/crg/cap_report.py ${FAMILY}.wgs.sv.${TODAY}.tsv
-
-if [[ "$OSTYPE" == *"darwin"* ]]; then
-	open -a 'Microsoft Excel' ${FAMILY}.wgs.sv.${TODAY}.tsv
-fi
-
-
-#make unfiltered report
-echo "${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}"
-${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}
-
-${HOME}/crg/cap_report.py ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv
+REPORT_CMD=`echo ${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS}`
+for type in filtered bnd
+do
+	if [ $type = "filtered" ]; then
+		echo $REPORT_CMD "-o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}"
+		$REPORT_CMD -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}
+	elif [ $type = "unfiltered" ]; then
+		echo $REPORT_CMD "-o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}"
+		$REPORT_CMD -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}
+	else
+		echo $REPORT_CMD "-o=${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv -i ${MANTA_BND}"
+		$REPORT_CMD -o=${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv -i ${MANTA_BND}
+	fi
+done
 
-if [[ "$OSTYPE" == *"darwin"* ]]; then
-	open -a 'Microsoft Excel' ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv
-fi
+for report in ${FAMILY}.wgs.sv.${TODAY}.tsv ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv ${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv
+do
+	${HOME}/crg/cap_report.py $report
+	if [[ "$OSTYPE" == *"darwin"* ]]; then
+	open -a 'Microsoft Excel' $report
+	fi	
+done
diff --git a/crg.merge.annotate.sv.sh b/crg.merge.annotate.sv.sh
@@ -12,6 +12,30 @@ if [ ! -f $logfile ]; then
 	touch $logfile;
 fi;
 
+#get manta BNDs supported by at least 5 split reads or read pairs
+for f in $FAMILY_*/$FAMILY/final/$FAMILY*
+do
+	bcftools view -i 'INFO/SVTYPE="BND" && (FORMAT/PR[0:1] >= 5 || FORMAT/SR[0:1] >= 5)' -O z $f/${FAMILY}-manta.vcf.gz > $f/${FAMILY}-manta.BND.vcf.gz
+done
+
+
+#run snpeff on manta vcf for each sample
+for f in $FAMILY_*/$FAMILY/final/$FAMILY*
+do
+	snpeff_manta_jobs+=($(qsub ~/crg/crg.snpeff.sh -F $f/${FAMILY}-manta.BND.vcf.gz))
+done
+snpeff_manta_string=$( IFS=$':'; echo "${snpeff_manta_jobs[*]}" )
+
+
+#run svscore on manta vcf for each sample 
+for f in $FAMILY_*/$FAMILY/final/$FAMILY*
+do
+	SAMPLE="$(echo $f | cut -d'/' -f1)"
+	svscore_manta_jobs+=($(qsub ~/crg/crg.svscore.sh -F $f/${FAMILY}-manta.BND.vcf.gz.snpeff.vcf -W depend=afterany:"${snpeff_manta_string}"))
+done
+svscore_manta_string=$( IFS=$':'; echo "${svscore_manta_jobs[*]}" )
+
+
 #run metasv on each sample
 for f in $FAMILY_*/$FAMILY/final/$FAMILY* 
 do 
@@ -22,7 +46,7 @@ do
 done
 metasv_string=$( IFS=$':'; echo "${metasv_jobs[*]}" )
 
-#run snpeff on each sample
+#run snpeff on metasv vcf for each sample
 for f in $FAMILY_*/$FAMILY/final/$FAMILY*
 do
 	SAMPLE="$(echo $f | cut -d'/' -f1)"
@@ -44,7 +68,8 @@ echo "snpeff=${snpeff_string}" >> ${logfile}
 echo "svscore=${svscore_string}" >> ${logfile}
 echo "Merging SVs with metaSV, annotating with snpeff, scoring with svscore, and creating report..." 
 
-jobid=$(qsub ~/crg/crg.intersect_sv_vcfs.sh -F $FAMILY  -W depend=afterany:"${svscore_string}")
+depend_string=`echo ${svscore_string}:${svscore_manta_string}`
+jobid=$(qsub ~/crg/crg.intersect_sv_vcfs.sh -F $FAMILY  -W depend=afterany:"${depend_string}")
 
 echo "intersect=$jobid">> ${logfile}