diff --git a/SVRecords/SVGrouper.py b/SVRecords/SVGrouper.py index 860b539..3fb91be 100644 --- a/SVRecords/SVGrouper.py +++ b/SVRecords/SVGrouper.py @@ -56,7 +56,7 @@ def split_Ensembl_ids(id_list): ann_dfs = [] index_fields = list(self.index_cols.keys()) #CHR POS STOP needs to be first 3 columns for creation of BedTool instance - sample_sv_fields = index_fields + ['calldata/GT', 'variants/ANN_Gene_ID', 'samples'] + sample_sv_fields = index_fields + ['variants/ALT'] + ['calldata/GT', 'variants/ANN_Gene_ID', 'samples'] parse_fields = list(set(sample_sv_fields + ann_fields)) for vcf_path in vcf_paths: @@ -69,8 +69,8 @@ def split_Ensembl_ids(id_list): # if 'chr' in CHROM field, remove vcf_dict['variants/CHROM'] = [chrom.strip('chr') for chrom in vcf_dict['variants/CHROM']] - # if 'chr' in CHROM field, remove - vcf_dict['variants/CHROM'] = [chrom.strip('chr') for chrom in vcf_dict['variants/CHROM']] + # grab alt allele + vcf_dict['variants/ALT'] = [alt[0] for alt in vcf_dict['variants/ALT']] # drop un-needed fields from vcf, cannot pass in parse_fields to read_vcf() because ANN_gene_id is unknown until ANNTransformer runs for key in list(vcf_dict.keys()): @@ -95,6 +95,13 @@ def split_Ensembl_ids(id_list): df['variants/END'] = df['variants/END'].astype(int) df = df.drop_duplicates() + # for BND, make POS=END + bnd = df['variants/SVTYPE'] == 'BND' + df.loc[bnd, ['variants/POS']] = df.loc[bnd, ['variants/END']].values + df['variants/END'] = df['variants/END'].astype(int) + df['variants/POS'] = df['variants/POS'].astype(int) + df = df.drop_duplicates() + intervals.extend(df[sample_sv_fields].itertuples(index=False)) if ann_fields: @@ -103,8 +110,8 @@ def split_Ensembl_ids(id_list): ann_df = pd.concat(ann_dfs).astype(str).rename(columns=self.index_cols).set_index(list(self.index_cols.values())) if ann_fields else pd.DataFrame() ann_df = ann_df[~ann_df.index.duplicated(keep='first')] #annotations for the same SV in a vcf can have slighly differing fields (ex. SVSCORE_MEAN) - for i in intervals: - print(i) + # for i in intervals: + # print(i) return BedTool(intervals), ann_df, sample_names @@ -113,11 +120,11 @@ def _group_sv(self, bedtool, reciprocal_overlap=0.5): for l in bedtool.intersect(bedtool, wa=True, wb=True, F=reciprocal_overlap, f=reciprocal_overlap): - ref_chr, ref_start, ref_end, ref_svtype, ref_gt, ref_genes, ref_name, \ - samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_genes, samp_name = l + ref_chr, ref_start, ref_end, ref_svtype, ref_alt, ref_gt, ref_genes, ref_name, \ + samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_genes, samp_name = l - ref_interval = (ref_chr, ref_start, ref_end, ref_svtype) - samp_interval = (samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_name) + ref_interval = (ref_chr, ref_start, ref_end, ref_svtype, ref_alt) + samp_interval = (samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_name) if (samp_interval not in already_grouped_intervals) and (ref_svtype == samp_svtype): self._add_interval(ref_interval, ref_genes, samp_interval) @@ -126,13 +133,13 @@ def _group_sv(self, bedtool, reciprocal_overlap=0.5): self.df.sort_index(inplace=True) def _add_interval(self, ref_interval, ref_genes, samp_interval): - samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_name = samp_interval + samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_name = samp_interval #Get reference to row - if ref_interval not in self.df.index: + if ref_interval[:-1] not in self.df.index: #make new row - self.df.loc[ref_interval, :] = np.nan - row = self.df.loc[ref_interval, :] + self.df.loc[ref_interval[:-1], :] = np.nan + row = self.df.loc[ref_interval[:-1], :] row['N_SAMPLES'] = 0 row['Ensembl Gene ID'] = ref_genes @@ -141,14 +148,17 @@ def _add_interval(self, ref_interval, ref_genes, samp_interval): row["%s_SV_DETAILS" % name] = [] row["%s_GENOTYPE" % name] = [] else: - row = self.df.loc[ref_interval, :] + row = self.df.loc[ref_interval[:-1], :] #Set values for row if row[samp_name] == 0: row['N_SAMPLES'] += 1 row[samp_name] = 1 - row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype)) + if samp_svtype == 'BND': + row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype, samp_alt)) + else: + row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype)) row["%s_GENOTYPE" % samp_name].append(samp_gt) def write(self, outfile_name): diff --git a/crg.intersect_sv_vcfs.py b/crg.intersect_sv_vcfs.py old mode 100644 new mode 100755 index 00651e8..c30b38b --- a/crg.intersect_sv_vcfs.py +++ b/crg.intersect_sv_vcfs.py @@ -15,7 +15,10 @@ def main(protein_coding_genes, exon_bed, hgmd_db, hpo, exac, omim, biomart, gnom protein_coding_ENSG = make_exon_gene_set(protein_coding_genes) print("Grouping like structural variants ...") - sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols + [MetaSV_col]) + try: + sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols + [MetaSV_col]) + except KeyError: + sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols) sample_cols = [ col for col in sv_records.df.columns if col != MetaSV_col ] sample_genotype_cols = [col for col in sample_cols if col.endswith('_GENOTYPE')] @@ -42,6 +45,8 @@ def main(protein_coding_genes, exon_bed, hgmd_db, hpo, exac, omim, biomart, gnom sv_records.df[col] = "na" # format and rearrange the columns + if MetaSV_col not in sv_records.df.columns: + sv_records.df['variants/NUM_SVTOOLS'] = '1' sv_records.df = sv_records.df[ [col for col in sample_cols if col not in set(SVScore_cols + sample_genotype_cols + ['N_SAMPLES', 'Ensembl Gene ID']) ] + \ [ 'variants/SVLEN', ] + \ [ MetaSV_col ] + \ diff --git a/crg.intersect_sv_vcfs.sh b/crg.intersect_sv_vcfs.sh index 8cb0b01..1b3e5c0 100755 --- a/crg.intersect_sv_vcfs.sh +++ b/crg.intersect_sv_vcfs.sh @@ -52,23 +52,27 @@ do tabix $FILE done -#make filtered report -echo "${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}" -${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES} +MANTA_BND=`ls ${FAMILY}_*/${FAMILY}/final/${FAMILY}*/*manta.BND.vcf.gz.snpeff.vcf.svscore.vcf | tr '\n' ' '` -${HOME}/crg/cap_report.py ${FAMILY}.wgs.sv.${TODAY}.tsv - -if [[ "$OSTYPE" == *"darwin"* ]]; then - open -a 'Microsoft Excel' ${FAMILY}.wgs.sv.${TODAY}.tsv -fi - - -#make unfiltered report -echo "${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}" -${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES} - -${HOME}/crg/cap_report.py ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv +REPORT_CMD=`echo ${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS}` +for type in filtered bnd +do + if [ $type = "filtered" ]; then + echo $REPORT_CMD "-o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}" + $REPORT_CMD -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES} + elif [ $type = "unfiltered" ]; then + echo $REPORT_CMD "-o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}" + $REPORT_CMD -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES} + else + echo $REPORT_CMD "-o=${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv -i ${MANTA_BND}" + $REPORT_CMD -o=${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv -i ${MANTA_BND} + fi +done -if [[ "$OSTYPE" == *"darwin"* ]]; then - open -a 'Microsoft Excel' ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -fi +for report in ${FAMILY}.wgs.sv.${TODAY}.tsv ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv ${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv +do + ${HOME}/crg/cap_report.py $report + if [[ "$OSTYPE" == *"darwin"* ]]; then + open -a 'Microsoft Excel' $report + fi +done diff --git a/crg.merge.annotate.sv.sh b/crg.merge.annotate.sv.sh index 4302a5d..e2ceab0 100644 --- a/crg.merge.annotate.sv.sh +++ b/crg.merge.annotate.sv.sh @@ -12,6 +12,30 @@ if [ ! -f $logfile ]; then touch $logfile; fi; +#get manta BNDs supported by at least 5 split reads or read pairs +for f in $FAMILY_*/$FAMILY/final/$FAMILY* +do + bcftools view -i 'INFO/SVTYPE="BND" && (FORMAT/PR[0:1] >= 5 || FORMAT/SR[0:1] >= 5)' -O z $f/${FAMILY}-manta.vcf.gz > $f/${FAMILY}-manta.BND.vcf.gz +done + + +#run snpeff on manta vcf for each sample +for f in $FAMILY_*/$FAMILY/final/$FAMILY* +do + snpeff_manta_jobs+=($(qsub ~/crg/crg.snpeff.sh -F $f/${FAMILY}-manta.BND.vcf.gz)) +done +snpeff_manta_string=$( IFS=$':'; echo "${snpeff_manta_jobs[*]}" ) + + +#run svscore on manta vcf for each sample +for f in $FAMILY_*/$FAMILY/final/$FAMILY* +do + SAMPLE="$(echo $f | cut -d'/' -f1)" + svscore_manta_jobs+=($(qsub ~/crg/crg.svscore.sh -F $f/${FAMILY}-manta.BND.vcf.gz.snpeff.vcf -W depend=afterany:"${snpeff_manta_string}")) +done +svscore_manta_string=$( IFS=$':'; echo "${svscore_manta_jobs[*]}" ) + + #run metasv on each sample for f in $FAMILY_*/$FAMILY/final/$FAMILY* do @@ -22,7 +46,7 @@ do done metasv_string=$( IFS=$':'; echo "${metasv_jobs[*]}" ) -#run snpeff on each sample +#run snpeff on metasv vcf for each sample for f in $FAMILY_*/$FAMILY/final/$FAMILY* do SAMPLE="$(echo $f | cut -d'/' -f1)" @@ -44,7 +68,8 @@ echo "snpeff=${snpeff_string}" >> ${logfile} echo "svscore=${svscore_string}" >> ${logfile} echo "Merging SVs with metaSV, annotating with snpeff, scoring with svscore, and creating report..." -jobid=$(qsub ~/crg/crg.intersect_sv_vcfs.sh -F $FAMILY -W depend=afterany:"${svscore_string}") +depend_string=`echo ${svscore_string}:${svscore_manta_string}` +jobid=$(qsub ~/crg/crg.intersect_sv_vcfs.sh -F $FAMILY -W depend=afterany:"${depend_string}") echo "intersect=$jobid">> ${logfile}