Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update column names for gnomad v4 update #63

Open
wants to merge 6 commits into
base: hg38
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions cre.gemini.variant_impacts.vcf2db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
# database schema: https://gemini.readthedocs.io/en/latest/content/database_schema.html#the-variants-table
# by default bcbio writes PASS only variants to the database

#PBS -l walltime=1:00:00,nodes=1:ppn=1
#PBS -joe .
#PBS -d .
#PBS -l vmem=10g,mem=10g
#SBATCH --time=1:00:00
#SBATCH --ntasks-per-node=1
#SBATCH --mem=10G

if [ -z $file ]
then
Expand Down Expand Up @@ -81,7 +80,7 @@ fi
initialQuery=$sQuery" from variants v,variant_impacts i" #store field selection


sQuery=$initialQuery" where "$severity_filter" v.gnomad_af_popmax <= "$max_af" and \
sQuery=$initialQuery" where "$severity_filter" v.gnomad_fafmax_faf95_max <= "$max_af" and \
v.variant_id=i.variant_id "$caller_filter""

s_gt_filter=''
Expand Down Expand Up @@ -111,13 +110,13 @@ else
# grab the clinvar variants
cQuery=$initialQuery
# everything that has a clinvar_sig value
cQuery=$cQuery" where v.gnomad_af_popmax <= ${max_af} and v.variant_id=i.variant_id and v.clinvar_status <> '' "$caller_filter" "
#cQuery=$cQuery" where gnomad_af_popmax <= ${max_af} and v.variant_id=i.variant_id and clinvar_sig <> ''"
cQuery=$cQuery" where v.gnomad_fafmax_faf95_max <= ${max_af} and v.variant_id=i.variant_id and v.clinvar_status <> '' "$caller_filter" "
#cQuery=$cQuery" where gnomad_af_grpax <= ${max_af} and v.variant_id=i.variant_id and clinvar_sig <> ''"
# only get variants where AD >= 1 (any sample with an alternate read)
s_gt_filter="(gt_alt_depths).(*).(>=1).(any) or (gt_alt_depths).(*).(==-1).(all)"
gemini query -q "$cQuery" --gt-filter "$s_gt_filter" $file
# add variants where gnomad freq is > 1%, Clinvar is pathogenic, likely pathogenic or conflicting and any status except no assertion
cQuery=$initialQuery
cQuery=$cQuery" where v.gnomad_af_popmax > ${max_af} and v.variant_id=i.variant_id and v.clinvar_status != 'no_assertion_criteria_provided' and Clinvar in ('Pathogenic', 'Likely_pathogenic', 'Conflicting_interpretations_of_pathogenicity') "$caller_filter""
cQuery=$cQuery" where v.gnomad_fafmax_faf95_max > ${max_af} and v.variant_id=i.variant_id and v.clinvar_status != 'no_assertion_criteria_provided' and Clinvar in ('Pathogenic', 'Likely_pathogenic', 'Conflicting_interpretations_of_pathogenicity') "$caller_filter""
gemini query -q "$cQuery" $file
fi
19 changes: 10 additions & 9 deletions cre.gemini2txt.vcf2db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
# when using vcfanno/vcfdb loader some fields are different
# for some reason \n in the query string does not work here

#PBS -l walltime=1:00:00,nodes=1:ppn=1
#PBS -joe .
#PBS -d .
#PBS -l vmem=10g,mem=10g
#SBATCH --time=1:00:00
#SBATCH --ntasks-per-node=1
#SBATCH --mem=10G

if [ -z $file ]
then
Expand Down Expand Up @@ -79,10 +78,12 @@ sQuery="select \
domains as Protein_domains,\
rs_ids as rsIDs,\
gnomad_af as Gnomad_af,\
gnomad_af_popmax as Gnomad_af_popmax,\
gnomad_af_grpmax as Gnomad_af_grpmax,\
gnomad_ac as Gnomad_ac,\
gnomad_hom as Gnomad_hom,\
gnomad_male_ac as Gnomad_male_ac, \
gnomad_male_ac as Gnomad_male_ac, \
gnomad_fafmax_faf95_max as Gnomad_fafmax_faf95_max, \
gnomad_filter as Gnomad_filter, \
sift_score as Sift_score,\
polyphen_score as Polyphen_score,\
cadd_phred as Cadd_score,\
Expand Down Expand Up @@ -131,7 +132,7 @@ initialQuery=$sQuery # keep the field selection part for later use

#max_aaf_all frequency is from gemini.conf and does not include gnomad WGS frequencing, gnomad WES only
#gnomad_af includes gnomad WGS
sQuery=$sQuery" where gnomad_af_popmax <= "${max_af}" "$caller_filter""${severity_filter}""
sQuery=$sQuery" where gnomad_fafmax_faf95_max <= "${max_af}" "$caller_filter""${severity_filter}""

s_gt_filter=''
# denovo 0/1 is exported in cre.sh
Expand Down Expand Up @@ -163,14 +164,14 @@ else

# also get the clinvar variants (duplicates will be removed later)
cQuery=$initialQuery
cQuery=$cQuery" where gnomad_af_popmax <= ${max_af} "$caller_filter" and Clinvar <> ''"
cQuery=$cQuery" where gnomad_fafmax_faf95_max <= ${max_af} "$caller_filter" and Clinvar <> ''"
# only get variants where AD >= 1 (any sample with an alternate read)
c_gt_filter="(gt_alt_depths).(*).(>=1).(any) or (gt_alt_depths).(*).(==-1).(all)"
gemini query -q "$cQuery" --gt-filter "$c_gt_filter" $file

# if allele frequency is > 1% and Clinvar is pathogenic, likely pathogenic or conflicting and any status except for no assertion
cQuery=$initialQuery
cQuery=$cQuery" where gnomad_af_popmax > ${max_af} "$caller_filter" and Clinvar_status != 'no_assertion_criteria_provided' and Clinvar in ('Pathogenic', 'Likely_pathogenic', 'Conflicting_interpretations_of_pathogenicity')"
cQuery=$cQuery" where gnomad_fafmax_faf95_max > ${max_af} "$caller_filter" and Clinvar_status != 'no_assertion_criteria_provided' and Clinvar in ('Pathogenic', 'Likely_pathogenic', 'Conflicting_interpretations_of_pathogenicity')"
gemini query -q "$cQuery" $file

fi
65 changes: 65 additions & 0 deletions cre.gnomad_scores_v4.1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import pandas as pd

"""
This script processes the gnomAD v4.1 constraint scores present here /hpf/largeprojects/ccmbio/ajain/crg2_hg38/gnomad_v4.1/constraint_metrics/gnomad.v4.1.constraint_metrics.tsv
to retain only Ensemble mane select genes.

Final processed file is located in ~/cre/data/gnomad_scores_v4.1.csv

Script written by Anjali Jain
Date: Aug 27, 2024
"""

# Read in the constraints file that was downloaded from gnomAD: https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/constraint/gnomad.v4.1.constraint_metrics.tsv
constraints = pd.read_csv(
"/hpf/largeprojects/ccmbio/ajain/crg2_hg38/gnomad_v4.1/constraint_metrics/gnomad.v4.1.constraint_metrics.tsv",
sep="\t",
)

# Filtering for the Ensembl canonical transcript
constraints = constraints[
constraints["gene_id"].str.startswith("ENSG") & constraints["mane_select"] == True
]

# Check if there are any duplicates
if len(constraints["gene_id"]) == len(set(constraints["gene_id"])):
print(
f"There are no duplicates in this dataframe. There are {len(constraints['gene_id'])} unique ensembl mane select genes"
)
else:
print("Duplicates found!")

# Keeping columns that are relevant
constraints = constraints[
[
"gene_id",
"transcript",
"lof.oe",
"lof.oe_ci.lower",
"lof.oe_ci.upper",
"lof.pLI",
"lof.pNull",
"lof.pRec",
"mis.oe",
"mis.z_score",
]
]

# renaming the columns
constraints = constraints.rename(
columns={
"gene_id": "Ensembl_gene_id",
"transcript": "Ensembl_transcript_id",
"lof.oe": "Gnomad_oe_lof_score",
"lof.oe_ci.lower": "Gnomad_oe_ci_lower",
"lof.oe_ci.upper": "Gnomad_oe_ci_upper",
"lof.pLI": "Gnomad_pLI_score",
"lof.pNull": "Gnomad_pnull_score",
"lof.pRec": "Gnomad_prec_score",
"mis.oe": "Gnomad_oe_mis_score",
"mis.z_score": "Gnomad_mis_z_score",
}
)

# Write the constraints score to ~/cre/data/gnomad_scores_v4.1.csv
constraints.to_csv("~/cre/data/gnomad_scores_v4.1.csv", index=False)
33 changes: 20 additions & 13 deletions cre.vcf2db.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ create_report <- function(family, samples, type){
# Column3 = GNOMAD_Link
variants$GNOMAD_POS <- with(variants, paste(Chrom,Pos,Ref,Alt, sep='-'))
sGNOMAD1 <- "=HYPERLINK(\"http://gnomad.broadinstitute.org/variant/"
sGNOMAD2 <- "?dataset=gnomad_r3"
sGNOMAD2 <- "?dataset=gnomad_r4"
sGNOMAD3 <- "\",\"GNOMAD_link\")"
variants$GNOMAD_Link <- with(variants, paste(sGNOMAD1, GNOMAD_POS, sGNOMAD2, sGNOMAD3, sep = ''))

Expand Down Expand Up @@ -270,12 +270,19 @@ create_report <- function(family, samples, type){

# population frequencies
# Column34 = Gnomad_af
# Column35 = Gnomad_af_popmax
# Column35 = gnomad_af_grpmax

# Gnomad gene constraint scores
# Column36 = Gnomad_oe_lof_score
# Column37 = Gnomad_oe_mis_score
gnomad_scores_file <- paste0(default_tables_path, "/gnomad_scores.csv")
# Column = Ensembl_transcript_id
# Column = Gnomad_oe_ci_lower
# Column = Gnomad_oe_ci_upper
# Column = Gnomad_pLI_score
# Column = Gnomad_pnull_score
# Column = Gnomad_prec_score
# Column = Gnomad_mis_z_score
gnomad_scores_file <- paste0(default_tables_path, "/gnomad_scores_v4.1.csv")
gnomad_scores <- read.csv(gnomad_scores_file, stringsAsFactors = F)
variants <- merge(variants, gnomad_scores, all.x = T, all.y = F)

Expand Down Expand Up @@ -411,7 +418,7 @@ create_report <- function(family, samples, type){
# Column 57: ENH_cellline_tissue

# replace -1 with 0
for (field in c("Trio_coverage", "Gnomad_af", "Gnomad_af_popmax")){
for (field in c("Trio_coverage", "Gnomad_af", "Gnomad_af_grpmax")){
variants[,field] <- with(variants, gsub("-1", "0", get(field), fixed = T))
variants[,field] <- with(variants, gsub("None", "0", get(field), fixed = T))
}
Expand Down Expand Up @@ -453,13 +460,13 @@ select_and_write2 <- function(variants, samples, prefix, type)
"C4R_WES_counts", "C4R_WES_samples"),
wgs_counts,
c("HGMD_id", "HGMD_gene", "HGMD_tag", "HGMD_ref",
"Gnomad_af_popmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom",
"Gnomad_af_grpmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom","Gnomad_male_ac","Gnomad_fafmax_faf95_max", "Gnomad_filter",
"Ensembl_transcript_id", "AA_position", "Exon", "Protein_domains", "rsIDs",
"Gnomad_oe_lof_score", "Gnomad_oe_mis_score", "Exac_pli_score", "Exac_prec_score", "Exac_pnull_score",
"Gnomad_oe_lof_score", "Gnomad_oe_ci_lower","Gnomad_oe_ci_upper","Gnomad_oe_mis_score", "Gnomad_mis_z_score","Gnomad_pLI_score","Gnomad_pnull_score","Gnomad_prec_score",
"Conserved_in_30_mammals", "SpliceAI_impact", "SpliceAI_score", "Sift_score", "Polyphen_score", "Cadd_score", "Vest4_score", "Revel_score", "Gerp_score", "AlphaMissense"),
noncoding_scores,
c("Imprinting_status", "Imprinting_expressed_allele", "Pseudoautosomal", "Gnomad_male_ac",
"Number_of_callers", "Old_multiallelic", "UCE_100bp", "UCE_200bp"), noncoding_cols)]
c("Imprinting_status", "Imprinting_expressed_allele", "Pseudoautosomal",
"Number_of_callers", "Old_multiallelic", "UCE_100bp", "UCE_200bp"), noncoding_cols)]

variants <- variants[order(variants$Position),]

Expand Down Expand Up @@ -828,15 +835,15 @@ clinical_report <- function(project,samples,type){
# for clinical, only keep variants where one of the alt depths was >= 20
full_report <- dplyr::filter_at(full_report, paste0("Alt_depths.",samples), any_vars(as.integer(.) >= 20))
filtered_report <- subset(full_report,
Quality > 1000 & Gnomad_af_popmax < 0.005 & C4R_WES_counts < 6,
Quality > 1000 & Gnomad_af_grpmax < 0.005 & C4R_WES_counts < 6,
select = c("Position", "GNOMAD_Link", "Ref", "Alt", "Gene", paste0("Zygosity.", samples),
paste0("Burden.",samples),
paste0("Alt_depths.",samples),
"Variation", "Info", "Refseq_change", "omim_phenotype", "omim_inheritance",
"Orphanet", "Clinvar", "C4R_WES_counts",
"Gnomad_af_popmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom",
"Gnomad_af_grpmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom","Gnomad_male_ac","Gnomad_fafmax_faf95_max","Gnomad_filter",
"Sift_score", "Polyphen_score", "Cadd_score", "Vest4_score", "Revel_score",
"Imprinting_status", "Pseudoautosomal", "Gnomad_male_ac", "UCE_100bp","UCE_200bp")
"Imprinting_status", "Pseudoautosomal", "UCE_100bp","UCE_200bp")
)

# recalculate burden using the filtered report
Expand All @@ -860,9 +867,9 @@ clinical_report <- function(project,samples,type){
paste0("Burden.", samples),
"Variation", "Info", "Refseq_change", "omim_phenotype", "omim_inheritance",
"Orphanet", "Clinvar", "C4R_WES_counts",
"Gnomad_af_popmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom",
"Gnomad_af_grpmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom", "Gnomad_male_ac","Gnomad_fafmax_faf95_max","Gnomad_filter",
"Sift_score", "Polyphen_score", "Cadd_score", "Vest4_score", "Revel_score",
"Imprinting_status", "Pseudoautosomal", "Gnomad_male_ac", "UCE_100bp", "UCE_200bp")]
"Imprinting_status", "Pseudoautosomal", "UCE_100bp", "UCE_200bp")]

write.csv(filtered_report, paste0(project, ".clinical.", type, ".", datetime, ".csv"), row.names = F)
}
Expand Down
Loading