Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mity v1.1.0 (FINAL) #243

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config_hpf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ tools:
crg: "~/crg"
crg2: "~/crg2"
ehdn: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0"
mity: "/hpf/largeprojects/ccmbio/ajain/mity/mity_package/bin"
mity: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/mity_v1.1.0/bin"
melt: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/MELT/MELTv2.2.2/MELT.jar"
orad: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/orad_2_6_1/orad"
annovar: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/annovar"
Expand Down
11 changes: 11 additions & 0 deletions envs/mt_report.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
channels:
- bioconda
- conda-forge
dependencies:
- python > 3.5.4
- pandas==1.0.1
- pysam==0.15.3
- python-dateutil==2.8.0
- pyvcf==0.6.8
- xlsxwriter==1.1.8
- openpyxl==3.0.3
47 changes: 22 additions & 25 deletions rules/mito_variants.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@ rule mity_call:
input:
bam=[expand("recal/{family}_{sample}.bam".format(family=config["run"]["project"], sample=s)) for s in samples.index]
output:
protected("mitochondrial_variants/{family}.pre-normalised.mity.vcf.gz")
protected("mitochondrial_variants/{family}.mity.call.vcf.gz")
params:
outdir="mitochondrial_variants/",
prefix="{family}.pre-normalised",
tool=config["tools"]["mity"]
log:
"logs/mity/mity_call/{family}.mity_call.log"
Expand All @@ -15,42 +14,40 @@ rule mity_call:

rule mity_normalise:
input:
"mitochondrial_variants/{family}.pre-normalised.mity.vcf.gz"
"mitochondrial_variants/{family}.mity.call.vcf.gz"
output:
protected("mitochondrial_variants/{family}.normalised.mity.vcf.gz")
protected("mitochondrial_variants/{family}.mity.normalise.decompose.vcf.gz")
params:
outdir="mitochondrial_variants/",
tool=config["tools"]["mity"]
log:
"logs/mity/mity_normalise/{family}.mity_normalise.log"
wrapper:
get_wrapper_path("mity/normalise")

rule mito_vcfanno:
input:
"mitochondrial_variants/{family}.normalised.mity.vcf.gz"
output:
"mitochondrial_variants/vcfanno/{family}.normalised.mity.vcfanno.vcf"
log:
"logs/mity/vcfanno/{family}.mity.vcfanno.log"
threads: 10
resources:
mem_mb = 20000
params:
conf = config["annotation"]["mt.vcfanno"]["conf"],
base_path = config["annotation"]["mt.vcfanno"]["base_path"]
wrapper:
get_wrapper_path("vcfanno")

rule mity_report:
input:
"mitochondrial_variants/vcfanno/{family}.normalised.mity.vcfanno.vcf"
"mitochondrial_variants/{family}.mity.normalise.decompose.vcf.gz"
output:
"report/mitochondrial/{family}.mitochondrial.report.csv"
"mitochondrial_variants/{family}.mity.annotated.vcf",
"mitochondrial_variants/{family}.annotated_variants.xlsx"
params:
outdir="report/mitochondrial/",
prefix="{family}_mito",
outdir="mitochondrial_variants/",
tool=config["tools"]["mity"]
log:
"logs/mity/mity_report/{family}.mity_report.log"
wrapper:
get_wrapper_path("mity/report")
get_wrapper_path("mity/report")

rule generate_mt_report:
input:
vcf="mitochondrial_variants/{family}.mity.annotated.vcf.gz",
report="mitochondrial_variants/{family}.annotated_variants.xlsx"
output:
"report/mitochondrial/{family}.mitochondrial.report.csv"
log:
"logs/report/mitochondrial/{family}.mitochondrial.report.log"
conda:
"../envs/mt_report.yaml"
script:
"../scripts/mt_report.py"
161 changes: 74 additions & 87 deletions wrappers/mity/report/process_report.py → scripts/mt_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import io
import gzip
from datetime import datetime


def log_message(*message):
Expand Down Expand Up @@ -49,23 +50,6 @@ def get_INFO_annot(df):

return annotations_df


def correct_mitotip_interpretations(df):
"""Change the MitoTip_interpretation column to correct values based on values of MitoTip_Score"""

# Change datatype of column
df["MitoTip_score"] = pd.to_numeric(df["MitoTip_score"], errors="coerce")

df.loc[df["MitoTip_score"] > 8.44, "MitoTip_interpretation"] = "possibly benign"
df.loc[
df["MitoTip_score"] > 12.66, "MitoTip_interpretation"
] = "possibly pathogenic"
df.loc[df["MitoTip_score"] > 16.25, "MitoTip_interpretation"] = "likely pathogenic"
df["MitoTip_score"] = df["MitoTip_score"].replace(np.nan, ".")
log_message("Corrected the MitoTip_interpretation column in report.")
return df


def change_annot_9155(df):
"""Change status_mitomap column for m.9155A>G from . to Confirmed"""
variant_entry = df.loc[df.HGVS == "m.9155A>G"]
Expand All @@ -80,25 +64,23 @@ def change_annot_9155(df):
)
return df


def concat_df(df1, df2):
"""Concatenate two dataframes along axis 1 (column)"""

concatenated_df = pd.concat([df1, df2], axis=1)
log_message("Successfully joined the two dataframes")
return concatenated_df


def remove_cols(df):
"""Remove unwanted columns from the report dataframe"""

# List of column to be removed from the file
remove_cols = [
"tier",
"ref_depth",
"total_locus_depth",
"variant_quality",
"locus_mitomap",
"TIER",
"REF DEPTH",
"TOTAL LOCUS DEPTH",
"VARIANT QUALITY",
"LOCUS MITOMAP",
"QUAL",
# "FILTER",
"MQM_INFO",
Expand Down Expand Up @@ -135,15 +117,15 @@ def split_cols_by_sample(grouped_df):
samples_info = {}
for variant, row in grouped_df.iterrows():
sample = str.split(row.SAMPLE, ";")
alt_depth = str.split(row.alt_depth, ";")
sample_depth = str.split(row.total_sample_depth, ";")
variant_heteroplasmy = str.split(row.variant_heteroplasmy, ";")
ALT_DEPTH = str.split(row["ALT DEPTH"], ";")
SAMPLE_DEPTH = str.split(row["TOTAL SAMPLE DEPTH"], ";")
VARIANT_HETEROPLASMY = str.split(row["VARIANT HETEROPLASMY"], ";")

x = {}
for i in range(0, len(sample)):
x[f"{sample[i]}.variant_heteroplasmy"] = variant_heteroplasmy[i]
x[f"{sample[i]}.alt_depth"] = alt_depth[i]
x[f"{sample[i]}.total_sample_depth"] = sample_depth[i]
x[f"{sample[i]}.VARIANT HETEROPLASMY"] = VARIANT_HETEROPLASMY[i]
x[f"{sample[i]}.ALT DEPTH"] = ALT_DEPTH[i]
x[f"{sample[i]}.TOTAL SAMPLE DEPTH"] = SAMPLE_DEPTH[i]
samples_info[variant] = x
df = pd.DataFrame(samples_info).transpose().sort_index(axis=1).fillna("-")
df["HGVS"] = df.index
Expand All @@ -157,26 +139,26 @@ def sort_by_sample(df):
[
"HGVS",
"SAMPLE",
"alt_depth",
"total_sample_depth",
"variant_heteroplasmy",
"ALT DEPTH",
"TOTAL SAMPLE DEPTH",
"VARIANT HETEROPLASMY",
]
]
grouped_df = subset_df.groupby("HGVS").agg(
{
"SAMPLE": lambda x: ";".join(str(i) for i in x),
"alt_depth": lambda x: ";".join(str(i) for i in x),
"total_sample_depth": lambda x: ";".join(str(i) for i in x),
"variant_heteroplasmy": lambda x: ";".join(str(i) for i in x),
"ALT DEPTH": lambda x: ";".join(str(i) for i in x),
"TOTAL SAMPLE DEPTH": lambda x: ";".join(str(i) for i in x),
"VARIANT HETEROPLASMY": lambda x: ";".join(str(i) for i in x),
}
)

df = df.drop(
[
"SAMPLE",
"alt_depth",
"total_sample_depth",
"variant_heteroplasmy",
"ALT DEPTH",
"TOTAL SAMPLE DEPTH",
"VARIANT HETEROPLASMY",
],
1,
)
Expand All @@ -200,13 +182,13 @@ def get_vcf_info(vcf,report,samples):
#If pos, ref and alt match with the respective columns in the VCF then get the sample depth for that sample
depth=list(vcf[(vcf["POS"]==pos) & (vcf["REF"]==ref) & (vcf["ALT"]==alt)][i])[0].split(":")[1]
vaf=list(vcf[(vcf["POS"]==pos) & (vcf["REF"]==ref) & (vcf["ALT"]==alt)][i])[0].split(":")[9]
AD=list(vcf[(vcf["POS"]==pos) & (vcf["REF"]==ref) & (vcf["ALT"]==alt)][i])[0].split(":")[6]
AD=list(vcf[(vcf["POS"]==pos) & (vcf["REF"]==ref) & (vcf["ALT"]==alt)][i])[0].split(":")[5]
sample_depths.append(depth)
vafs.append(vaf)
alt_depths.append(AD)
report[f"{i}.total_sample_depth"]=sample_depths
report[f"{i}.variant_heteroplasmy"]=vafs
report[f"{i}.alt_depth"]=alt_depths
report[f"{i}.TOTAL SAMPLE DEPTH"]=sample_depths
report[f"{i}.VARIANT HETEROPLASMY"]=vafs
report[f"{i}.ALT DEPTH"]=alt_depths

return report

Expand All @@ -230,9 +212,9 @@ def reorder_cols(df):

colnames = df.columns

variant_heteroplasmy = [x for x in colnames if x.endswith("variant_heteroplasmy")]
alt_depth = [x for x in colnames if x.endswith("alt_depth")]
total_sample_depth = [x for x in colnames if x.endswith("total_sample_depth")]
variant_heteroplasmy = [x for x in colnames if x.endswith("VARIANT HETEROPLASMY")]
alt_depth = [x for x in colnames if x.endswith("ALT DEPTH")]
total_sample_depth = [x for x in colnames if x.endswith("TOTAL SAMPLE DEPTH")]

df=remove_blacklist_pos(df)

Expand All @@ -242,43 +224,43 @@ def reorder_cols(df):
"REF",
"ALT",
"HGVS",
"gene/locus",
"gene/locus description",
"COHORT_COUNT",
"GENE/LOCUS",
"GENE/LOCUS DESCRIPTION",
"COHORT COUNT",
"SAMPLE",
]

col_list2 = [
"disease_mitomap",
"status_mitomap",
"disease_amino_acid_change_mitomap",
"allele_frequency_mitomap",
"GenBank_frequency_mitomap",
"DISEASE MITOMAP",
"STATUS MITOMAP",
"DISEASE AMINO ACID CHANGE MITOMAP",
"ALLELE FREQUENCY MITOMAP",
"GENBANK FREQUENCY MITOMAP",
"clinvar_pathogenic",
"gnomAD_AC_hom",
"gnomAD_AC_het",
"gnomAD_AF_hom",
"gnomAD_AF_het",
"gnomAD_max_hl",
"homoplasmy_mitomap",
"heteroplasmy_mitomap",
"num_references_mitomap",
"variant_amino_acid_change_mitomap",
"codon_position_mitomap",
"codon_number_mitomap",
"num_disease_references_mitomap",
"RNA_mitomap",
"commercial_panels",
"phylotree_haplotype",
"MitoTip_score",
"MitoTip_interpretation",
"MitoTip_percentile",
"anticodon",
"MGRB_frequency",
"MGRB_FILTER",
"MGRB_AC",
"MGRB_AN",
"phylotree_mut",
"HOMOPLASMY MITOMAP",
"HETEROPLASMY MITOMAP",
"NUMBER OF REFERENCES MITOMAP",
"VARIANT AMINO ACID CHANGE MITOMAP",
"CODON POSITION MITOMAP",
"CODON NUMBER MITOMAP",
"NUM DISEASE REFERENCES MITOMAP",
"RNA MITOMAP",
"COMMERCIAL PANELS",
"PHYLOTREE HAPLOTYPE",
"MITOTIP SCORE",
"MITOTIP INTERPRETATION",
"MITOTIP PERCENTILE",
"ANTICODON",
"MGRB FREQUENCY",
"MGRB FILTER",
"MGRB AC",
"MGRB AN",
"PHYLOTREE MUT",
]
final_col_list = (
col_list + variant_heteroplasmy + alt_depth + total_sample_depth + col_list2
Expand All @@ -288,20 +270,20 @@ def reorder_cols(df):

# replace '.'/'-' with '0' for some columns
replace_col_values = [
"allele_frequency_mitomap",
"GenBank_frequency_mitomap",
"ALLELE FREQUENCY MITOMAP",
"GENBANK FREQUENCY MITOMAP",
"gnomAD_AC_hom",
"gnomAD_AC_het",
"gnomAD_AF_hom",
"gnomAD_AF_het",
"gnomAD_max_hl",
"MGRB_frequency",
"MGRB_FILTER",
"MGRB_AC",
"MGRB_AN",
"MGRB FREQUENCY",
#"MGRB FILTER",
"MGRB AC",
"MGRB AN",
]

reordered_df[replace_col_values] = reordered_df[replace_col_values].replace(".", 0)
for col in replace_col_values:
reordered_df[col] = reordered_df[col].replace(".", 0)

log_message(
"Replaced . and - with 0 for frequency columns and rearanged the columns in the dataframe"
Expand All @@ -323,7 +305,7 @@ def read_vcf(vcf):
return vcf_df

def main(vcf, report, family):
logfile = f"logs/mity/mity_report/{family}.mity_report.log"
logfile = f"logs/report/mitochondrial/{family}.mitochondrial.report.log"
logging.basicConfig(
filename=logfile,
filemode="w",
Expand All @@ -332,28 +314,33 @@ def main(vcf, report, family):
datefmt="%Y-%m-%d %H:%M",
)

report_df = pd.read_csv(report)
report_df = pd.read_excel(report,engine="openpyxl")
vcf_df=read_vcf(vcf)

CV_gnomad_annots_df = get_INFO_annot(report_df)
merged_report = concat_df(report_df, CV_gnomad_annots_df)
final_report = remove_cols(merged_report)
final_report = check_sort(vcf_df,final_report)
final_report = reorder_cols(final_report)
final_report = correct_mitotip_interpretations(final_report)
final_report = change_annot_9155(final_report)

final_report.to_csv(
f"report/mitochondrial/{family}.mitochondrial.report.csv", index=False
)

current_date=datetime.now().strftime("%Y-%m-%d")

final_report.to_csv(
f"report/mitochondrial/{family}.mitochondrial.report.{current_date}.csv", index=False
)

log_message(
"Final formatted report containing annotated list of mitochondrial variants created!"
)


if __name__ == "__main__":
family = snakemake.wildcards.family
vcf= snakemake.input
report = f"report/mitochondrial/{family}_mito.annotated_variants.csv"
vcf= snakemake.input.vcf
report=snakemake.input.report
main(vcf,report,family)
Loading