Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create report from Manta BNDs #12

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 25 additions & 15 deletions SVRecords/SVGrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def split_Ensembl_ids(id_list):
ann_dfs = []

index_fields = list(self.index_cols.keys()) #CHR POS STOP needs to be first 3 columns for creation of BedTool instance
sample_sv_fields = index_fields + ['calldata/GT', 'variants/ANN_Gene_ID', 'samples']
sample_sv_fields = index_fields + ['variants/ALT'] + ['calldata/GT', 'variants/ANN_Gene_ID', 'samples']
parse_fields = list(set(sample_sv_fields + ann_fields))

for vcf_path in vcf_paths:
Expand All @@ -69,8 +69,8 @@ def split_Ensembl_ids(id_list):
# if 'chr' in CHROM field, remove
vcf_dict['variants/CHROM'] = [chrom.strip('chr') for chrom in vcf_dict['variants/CHROM']]

# if 'chr' in CHROM field, remove
vcf_dict['variants/CHROM'] = [chrom.strip('chr') for chrom in vcf_dict['variants/CHROM']]
# grab alt allele
vcf_dict['variants/ALT'] = [alt[0] for alt in vcf_dict['variants/ALT']]

# drop un-needed fields from vcf, cannot pass in parse_fields to read_vcf() because ANN_gene_id is unknown until ANNTransformer runs
for key in list(vcf_dict.keys()):
Expand All @@ -95,6 +95,13 @@ def split_Ensembl_ids(id_list):
df['variants/END'] = df['variants/END'].astype(int)
df = df.drop_duplicates()

# for BND, make POS=END
bnd = df['variants/SVTYPE'] == 'BND'
df.loc[bnd, ['variants/POS']] = df.loc[bnd, ['variants/END']].values
df['variants/END'] = df['variants/END'].astype(int)
df['variants/POS'] = df['variants/POS'].astype(int)
df = df.drop_duplicates()

intervals.extend(df[sample_sv_fields].itertuples(index=False))

if ann_fields:
Expand All @@ -103,8 +110,8 @@ def split_Ensembl_ids(id_list):
ann_df = pd.concat(ann_dfs).astype(str).rename(columns=self.index_cols).set_index(list(self.index_cols.values())) if ann_fields else pd.DataFrame()
ann_df = ann_df[~ann_df.index.duplicated(keep='first')] #annotations for the same SV in a vcf can have slighly differing fields (ex. SVSCORE_MEAN)

for i in intervals:
print(i)
# for i in intervals:
# print(i)

return BedTool(intervals), ann_df, sample_names

Expand All @@ -113,11 +120,11 @@ def _group_sv(self, bedtool, reciprocal_overlap=0.5):

for l in bedtool.intersect(bedtool, wa=True, wb=True, F=reciprocal_overlap, f=reciprocal_overlap):

ref_chr, ref_start, ref_end, ref_svtype, ref_gt, ref_genes, ref_name, \
samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_genes, samp_name = l
ref_chr, ref_start, ref_end, ref_svtype, ref_alt, ref_gt, ref_genes, ref_name, \
samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_genes, samp_name = l

ref_interval = (ref_chr, ref_start, ref_end, ref_svtype)
samp_interval = (samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_name)
ref_interval = (ref_chr, ref_start, ref_end, ref_svtype, ref_alt)
samp_interval = (samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_name)

if (samp_interval not in already_grouped_intervals) and (ref_svtype == samp_svtype):
self._add_interval(ref_interval, ref_genes, samp_interval)
Expand All @@ -126,13 +133,13 @@ def _group_sv(self, bedtool, reciprocal_overlap=0.5):
self.df.sort_index(inplace=True)

def _add_interval(self, ref_interval, ref_genes, samp_interval):
samp_chr, samp_start, samp_end, samp_svtype, samp_gt, samp_name = samp_interval
samp_chr, samp_start, samp_end, samp_svtype, samp_alt, samp_gt, samp_name = samp_interval

#Get reference to row
if ref_interval not in self.df.index:
if ref_interval[:-1] not in self.df.index:
#make new row
self.df.loc[ref_interval, :] = np.nan
row = self.df.loc[ref_interval, :]
self.df.loc[ref_interval[:-1], :] = np.nan
row = self.df.loc[ref_interval[:-1], :]
row['N_SAMPLES'] = 0
row['Ensembl Gene ID'] = ref_genes

Expand All @@ -141,14 +148,17 @@ def _add_interval(self, ref_interval, ref_genes, samp_interval):
row["%s_SV_DETAILS" % name] = []
row["%s_GENOTYPE" % name] = []
else:
row = self.df.loc[ref_interval, :]
row = self.df.loc[ref_interval[:-1], :]

#Set values for row
if row[samp_name] == 0:
row['N_SAMPLES'] += 1
row[samp_name] = 1

row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype))
if samp_svtype == 'BND':
row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype, samp_alt))
else:
row["%s_SV_DETAILS" % samp_name].append('{}:{}-{}:{}'.format(samp_chr, samp_start, samp_end, samp_svtype))
row["%s_GENOTYPE" % samp_name].append(samp_gt)

def write(self, outfile_name):
Expand Down
7 changes: 6 additions & 1 deletion crg.intersect_sv_vcfs.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ def main(protein_coding_genes, exon_bed, hgmd_db, hpo, exac, omim, biomart, gnom
protein_coding_ENSG = make_exon_gene_set(protein_coding_genes)

print("Grouping like structural variants ...")
sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols + [MetaSV_col])
try:
sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols + [MetaSV_col])
except KeyError:
sv_records = SVGrouper(vcfs, ann_fields=SVScore_cols)
sample_cols = [ col for col in sv_records.df.columns if col != MetaSV_col ]
sample_genotype_cols = [col for col in sample_cols if col.endswith('_GENOTYPE')]

Expand All @@ -42,6 +45,8 @@ def main(protein_coding_genes, exon_bed, hgmd_db, hpo, exac, omim, biomart, gnom
sv_records.df[col] = "na"

# format and rearrange the columns
if MetaSV_col not in sv_records.df.columns:
sv_records.df['variants/NUM_SVTOOLS'] = '1'
sv_records.df = sv_records.df[ [col for col in sample_cols if col not in set(SVScore_cols + sample_genotype_cols + ['N_SAMPLES', 'Ensembl Gene ID']) ] + \
[ 'variants/SVLEN', ] + \
[ MetaSV_col ] + \
Expand Down
40 changes: 22 additions & 18 deletions crg.intersect_sv_vcfs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,27 @@ do
tabix $FILE
done

#make filtered report
echo "${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}"
${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}
MANTA_BND=`ls ${FAMILY}_*/${FAMILY}/final/${FAMILY}*/*manta.BND.vcf.gz.snpeff.vcf.svscore.vcf | tr '\n' ' '`

${HOME}/crg/cap_report.py ${FAMILY}.wgs.sv.${TODAY}.tsv

if [[ "$OSTYPE" == *"darwin"* ]]; then
open -a 'Microsoft Excel' ${FAMILY}.wgs.sv.${TODAY}.tsv
fi


#make unfiltered report
echo "${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}"
${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS} -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}

${HOME}/crg/cap_report.py ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv
REPORT_CMD=`echo ${PY} ${HOME}/crg/crg.intersect_sv_vcfs.py -protein_coding_genes=${PROTEIN_CODING_GENES} -exon_bed=${EXON_BED} -hgmd=${HGMD} -hpo=${HPO} -exac=${EXAC} -omim=${OMIM} -biomart=${BIOMART} -gnomad=${GNOMAD} -sv_counts ${MSSNG_MANTA_COUNTS} ${MSSNG_LUMPY_COUNTS}`
for type in filtered bnd
do
if [ $type = "filtered" ]; then
echo $REPORT_CMD "-o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}"
$REPORT_CMD -o=${FAMILY}.wgs.sv.${TODAY}.tsv -i ${FILTERED_FILES}
elif [ $type = "unfiltered" ]; then
echo $REPORT_CMD "-o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}"
$REPORT_CMD -o=${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv -i ${IN_FILES}
else
echo $REPORT_CMD "-o=${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv -i ${MANTA_BND}"
$REPORT_CMD -o=${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv -i ${MANTA_BND}
fi
done

if [[ "$OSTYPE" == *"darwin"* ]]; then
open -a 'Microsoft Excel' ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv
fi
for report in ${FAMILY}.wgs.sv.${TODAY}.tsv ${FAMILY}.unfiltered.wgs.sv.${TODAY}.tsv ${FAMILY}.manta.bnd.wgs.sv.${TODAY}.tsv
do
${HOME}/crg/cap_report.py $report
if [[ "$OSTYPE" == *"darwin"* ]]; then
open -a 'Microsoft Excel' $report
fi
done
29 changes: 27 additions & 2 deletions crg.merge.annotate.sv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,30 @@ if [ ! -f $logfile ]; then
touch $logfile;
fi;

#get manta BNDs supported by at least 5 split reads or read pairs
for f in $FAMILY_*/$FAMILY/final/$FAMILY*
do
bcftools view -i 'INFO/SVTYPE="BND" && (FORMAT/PR[0:1] >= 5 || FORMAT/SR[0:1] >= 5)' -O z $f/${FAMILY}-manta.vcf.gz > $f/${FAMILY}-manta.BND.vcf.gz
done


#run snpeff on manta vcf for each sample
for f in $FAMILY_*/$FAMILY/final/$FAMILY*
do
snpeff_manta_jobs+=($(qsub ~/crg/crg.snpeff.sh -F $f/${FAMILY}-manta.BND.vcf.gz))
done
snpeff_manta_string=$( IFS=$':'; echo "${snpeff_manta_jobs[*]}" )


#run svscore on manta vcf for each sample
for f in $FAMILY_*/$FAMILY/final/$FAMILY*
do
SAMPLE="$(echo $f | cut -d'/' -f1)"
svscore_manta_jobs+=($(qsub ~/crg/crg.svscore.sh -F $f/${FAMILY}-manta.BND.vcf.gz.snpeff.vcf -W depend=afterany:"${snpeff_manta_string}"))
done
svscore_manta_string=$( IFS=$':'; echo "${svscore_manta_jobs[*]}" )


#run metasv on each sample
for f in $FAMILY_*/$FAMILY/final/$FAMILY*
do
Expand All @@ -22,7 +46,7 @@ do
done
metasv_string=$( IFS=$':'; echo "${metasv_jobs[*]}" )

#run snpeff on each sample
#run snpeff on metasv vcf for each sample
for f in $FAMILY_*/$FAMILY/final/$FAMILY*
do
SAMPLE="$(echo $f | cut -d'/' -f1)"
Expand All @@ -44,7 +68,8 @@ echo "snpeff=${snpeff_string}" >> ${logfile}
echo "svscore=${svscore_string}" >> ${logfile}
echo "Merging SVs with metaSV, annotating with snpeff, scoring with svscore, and creating report..."

jobid=$(qsub ~/crg/crg.intersect_sv_vcfs.sh -F $FAMILY -W depend=afterany:"${svscore_string}")
depend_string=`echo ${svscore_string}:${svscore_manta_string}`
jobid=$(qsub ~/crg/crg.intersect_sv_vcfs.sh -F $FAMILY -W depend=afterany:"${depend_string}")

echo "intersect=$jobid">> ${logfile}

Expand Down