ccmbioinfo · anjalijain22 · Aug 13, 2024 · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/cre.gemini.variant_impacts.vcf2db.sh b/cre.gemini.variant_impacts.vcf2db.sh
@@ -3,10 +3,9 @@
 #   database schema: https://gemini.readthedocs.io/en/latest/content/database_schema.html#the-variants-table
 #   by default bcbio writes PASS only variants to the database
 
-#PBS -l walltime=1:00:00,nodes=1:ppn=1
-#PBS -joe .
-#PBS -d .
-#PBS -l vmem=10g,mem=10g
+#SBATCH --time=1:00:00
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=10G
 
 if [ -z $file ]
 then
@@ -81,7 +80,7 @@ fi
 initialQuery=$sQuery" from variants v,variant_impacts i" #store field selection
 
 
-sQuery=$initialQuery" where "$severity_filter" v.gnomad_af_popmax <= "$max_af" and \
+sQuery=$initialQuery" where "$severity_filter" v.gnomad_fafmax_faf95_max <= "$max_af" and \
 v.variant_id=i.variant_id "$caller_filter""
 
 s_gt_filter=''
@@ -111,13 +110,13 @@ else
     # grab the clinvar variants
     cQuery=$initialQuery 
     # everything that has a clinvar_sig value
-	cQuery=$cQuery" where v.gnomad_af_popmax <= ${max_af} and v.variant_id=i.variant_id and v.clinvar_status <> '' "$caller_filter" "
-    #cQuery=$cQuery" where gnomad_af_popmax <= ${max_af} and v.variant_id=i.variant_id and clinvar_sig <> ''"
+	cQuery=$cQuery" where v.gnomad_fafmax_faf95_max <= ${max_af} and v.variant_id=i.variant_id and v.clinvar_status <> '' "$caller_filter" "
+    #cQuery=$cQuery" where gnomad_af_grpax <= ${max_af} and v.variant_id=i.variant_id and clinvar_sig <> ''"
     # only get variants where AD >= 1 (any sample with an alternate read)
     s_gt_filter="(gt_alt_depths).(*).(>=1).(any) or (gt_alt_depths).(*).(==-1).(all)"
     gemini query -q "$cQuery" --gt-filter "$s_gt_filter" $file
     # add variants where gnomad freq is > 1%, Clinvar is pathogenic, likely pathogenic or conflicting and any status except no assertion 
     cQuery=$initialQuery
-    cQuery=$cQuery" where v.gnomad_af_popmax > ${max_af} and v.variant_id=i.variant_id and v.clinvar_status != 'no_assertion_criteria_provided' and Clinvar in ('Pathogenic', 'Likely_pathogenic', 'Conflicting_interpretations_of_pathogenicity') "$caller_filter""
+    cQuery=$cQuery" where v.gnomad_fafmax_faf95_max > ${max_af} and v.variant_id=i.variant_id and v.clinvar_status != 'no_assertion_criteria_provided' and Clinvar in ('Pathogenic', 'Likely_pathogenic', 'Conflicting_interpretations_of_pathogenicity') "$caller_filter""
     gemini query -q "$cQuery" $file
 fi
diff --git a/cre.gemini2txt.vcf2db.sh b/cre.gemini2txt.vcf2db.sh
@@ -8,10 +8,9 @@
 #  when using vcfanno/vcfdb loader some fields are different
 #  for some reason \n in the query string does not work here
 
-#PBS -l walltime=1:00:00,nodes=1:ppn=1
-#PBS -joe .
-#PBS -d .
-#PBS -l vmem=10g,mem=10g
+#SBATCH --time=1:00:00
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=10G
 
 if [ -z $file ]
 then
@@ -79,10 +78,12 @@ sQuery="select \
         domains as Protein_domains,\
         rs_ids as rsIDs,\
         gnomad_af as Gnomad_af,\
-        gnomad_af_popmax as Gnomad_af_popmax,\
+        gnomad_af_grpmax as Gnomad_af_grpmax,\
         gnomad_ac as Gnomad_ac,\
         gnomad_hom as Gnomad_hom,\
-	      gnomad_male_ac as Gnomad_male_ac, \
+	    gnomad_male_ac as Gnomad_male_ac, \
+        gnomad_fafmax_faf95_max as Gnomad_fafmax_faf95_max, \
+        gnomad_filter as Gnomad_filter, \
         sift_score as Sift_score,\
         polyphen_score as Polyphen_score,\
         cadd_phred as Cadd_score,\
@@ -131,7 +132,7 @@ initialQuery=$sQuery # keep the field selection part for later use
 
 #max_aaf_all frequency is from gemini.conf and does not include gnomad WGS frequencing, gnomad WES only
 #gnomad_af includes gnomad WGS
-sQuery=$sQuery" where gnomad_af_popmax <= "${max_af}" "$caller_filter""${severity_filter}""
+sQuery=$sQuery" where gnomad_fafmax_faf95_max <= "${max_af}" "$caller_filter""${severity_filter}""
 
 s_gt_filter=''
 # denovo 0/1 is exported in cre.sh
@@ -163,14 +164,14 @@ else
 
     # also get the clinvar variants (duplicates will be removed later)
     cQuery=$initialQuery
-    cQuery=$cQuery" where gnomad_af_popmax <= ${max_af} "$caller_filter" and Clinvar <> ''"
+    cQuery=$cQuery" where gnomad_fafmax_faf95_max <= ${max_af} "$caller_filter" and Clinvar <> ''"
     # only get variants where AD >= 1 (any sample with an alternate read)
     c_gt_filter="(gt_alt_depths).(*).(>=1).(any) or (gt_alt_depths).(*).(==-1).(all)"
     gemini query -q "$cQuery" --gt-filter "$c_gt_filter" $file
 
     # if allele frequency is > 1% and Clinvar is pathogenic, likely pathogenic or conflicting and any status except for no assertion
     cQuery=$initialQuery
-    cQuery=$cQuery" where gnomad_af_popmax > ${max_af} "$caller_filter" and Clinvar_status != 'no_assertion_criteria_provided' and Clinvar in ('Pathogenic', 'Likely_pathogenic', 'Conflicting_interpretations_of_pathogenicity')"
+    cQuery=$cQuery" where gnomad_fafmax_faf95_max > ${max_af} "$caller_filter" and Clinvar_status != 'no_assertion_criteria_provided' and Clinvar in ('Pathogenic', 'Likely_pathogenic', 'Conflicting_interpretations_of_pathogenicity')"
     gemini query -q "$cQuery" $file
 
 fi
diff --git a/cre.gnomad_scores_v4.1.py b/cre.gnomad_scores_v4.1.py
@@ -0,0 +1,65 @@
+import pandas as pd
+
+"""
+This script processes the gnomAD v4.1 constraint scores present here /hpf/largeprojects/ccmbio/ajain/crg2_hg38/gnomad_v4.1/constraint_metrics/gnomad.v4.1.constraint_metrics.tsv
+to retain only Ensemble mane select genes.
+
+Final processed file is located in ~/cre/data/gnomad_scores_v4.1.csv
+
+Script written by Anjali Jain
+Date: Aug 27, 2024
+"""
+
+# Read in the constraints file that was downloaded from gnomAD: https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/constraint/gnomad.v4.1.constraint_metrics.tsv
+constraints = pd.read_csv(
+    "/hpf/largeprojects/ccmbio/ajain/crg2_hg38/gnomad_v4.1/constraint_metrics/gnomad.v4.1.constraint_metrics.tsv",
+    sep="\t",
+)
+
+# Filtering for the Ensembl canonical transcript
+constraints = constraints[
+    constraints["gene_id"].str.startswith("ENSG") & constraints["mane_select"] == True
+]
+
+# Check if there are any duplicates
+if len(constraints["gene_id"]) == len(set(constraints["gene_id"])):
+    print(
+        f"There are no duplicates in this dataframe. There are {len(constraints['gene_id'])} unique ensembl mane select genes"
+    )
+else:
+    print("Duplicates found!")
+
+# Keeping columns that are relevant
+constraints = constraints[
+    [
+        "gene_id",
+        "transcript",
+        "lof.oe",
+        "lof.oe_ci.lower",
+        "lof.oe_ci.upper",
+        "lof.pLI",
+        "lof.pNull",
+        "lof.pRec",
+        "mis.oe",
+        "mis.z_score",
+    ]
+]
+
+# renaming the columns
+constraints = constraints.rename(
+    columns={
+        "gene_id": "Ensembl_gene_id",
+        "transcript": "Ensembl_transcript_id",
+        "lof.oe": "Gnomad_oe_lof_score",
+        "lof.oe_ci.lower": "Gnomad_oe_ci_lower",
+        "lof.oe_ci.upper": "Gnomad_oe_ci_upper",
+        "lof.pLI": "Gnomad_pLI_score",
+        "lof.pNull": "Gnomad_pnull_score",
+        "lof.pRec": "Gnomad_prec_score",
+        "mis.oe": "Gnomad_oe_mis_score",
+        "mis.z_score": "Gnomad_mis_z_score",
+    }
+)
+
+# Write the constraints score to ~/cre/data/gnomad_scores_v4.1.csv
+constraints.to_csv("~/cre/data/gnomad_scores_v4.1.csv", index=False)
diff --git a/cre.vcf2db.R b/cre.vcf2db.R
@@ -87,7 +87,7 @@ create_report <- function(family, samples, type){
     # Column3 = GNOMAD_Link
     variants$GNOMAD_POS <- with(variants, paste(Chrom,Pos,Ref,Alt, sep='-'))
     sGNOMAD1 <- "=HYPERLINK(\"http://gnomad.broadinstitute.org/variant/"
-    sGNOMAD2 <- "?dataset=gnomad_r3"
+    sGNOMAD2 <- "?dataset=gnomad_r4"
     sGNOMAD3 <- "\",\"GNOMAD_link\")"
     variants$GNOMAD_Link <- with(variants, paste(sGNOMAD1, GNOMAD_POS, sGNOMAD2, sGNOMAD3, sep = ''))
 
@@ -270,12 +270,19 @@ create_report <- function(family, samples, type){
 
     # population frequencies
     # Column34 = Gnomad_af
-    # Column35 = Gnomad_af_popmax
+    # Column35 = gnomad_af_grpmax
 
     # Gnomad gene constraint scores
     # Column36 = Gnomad_oe_lof_score
     # Column37 = Gnomad_oe_mis_score
-    gnomad_scores_file <- paste0(default_tables_path, "/gnomad_scores.csv")
+    # Column = Ensembl_transcript_id
+    # Column = Gnomad_oe_ci_lower
+    # Column = Gnomad_oe_ci_upper
+    # Column = Gnomad_pLI_score
+    # Column = Gnomad_pnull_score
+    # Column = Gnomad_prec_score
+    # Column = Gnomad_mis_z_score 
+    gnomad_scores_file <- paste0(default_tables_path, "/gnomad_scores_v4.1.csv")
     gnomad_scores <- read.csv(gnomad_scores_file, stringsAsFactors = F)
     variants <- merge(variants, gnomad_scores, all.x = T, all.y = F)
 
@@ -411,7 +418,7 @@ create_report <- function(family, samples, type){
     # Column 57: ENH_cellline_tissue
 
     # replace -1 with 0
-    for (field in c("Trio_coverage", "Gnomad_af", "Gnomad_af_popmax")){
+    for (field in c("Trio_coverage", "Gnomad_af", "Gnomad_af_grpmax")){
         variants[,field] <- with(variants, gsub("-1", "0", get(field), fixed = T))
         variants[,field] <- with(variants, gsub("None", "0", get(field), fixed = T))
     }
@@ -453,13 +460,13 @@ select_and_write2 <- function(variants, samples, prefix, type)
                             "C4R_WES_counts", "C4R_WES_samples"), 
                            wgs_counts, 
                            c("HGMD_id", "HGMD_gene", "HGMD_tag", "HGMD_ref",
-                            "Gnomad_af_popmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom",
+                            "Gnomad_af_grpmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom","Gnomad_male_ac","Gnomad_fafmax_faf95_max", "Gnomad_filter",
                             "Ensembl_transcript_id", "AA_position", "Exon", "Protein_domains", "rsIDs",
-                            "Gnomad_oe_lof_score", "Gnomad_oe_mis_score", "Exac_pli_score", "Exac_prec_score", "Exac_pnull_score",
+                            "Gnomad_oe_lof_score", "Gnomad_oe_ci_lower","Gnomad_oe_ci_upper","Gnomad_oe_mis_score", "Gnomad_mis_z_score","Gnomad_pLI_score","Gnomad_pnull_score","Gnomad_prec_score",
                             "Conserved_in_30_mammals", "SpliceAI_impact", "SpliceAI_score", "Sift_score", "Polyphen_score", "Cadd_score", "Vest4_score", "Revel_score", "Gerp_score", "AlphaMissense"),
                             noncoding_scores,
-                            c("Imprinting_status", "Imprinting_expressed_allele", "Pseudoautosomal", "Gnomad_male_ac",
-                            "Number_of_callers", "Old_multiallelic", "UCE_100bp", "UCE_200bp"), noncoding_cols)]
+                            c("Imprinting_status", "Imprinting_expressed_allele", "Pseudoautosomal",
+                            "Number_of_callers", "Old_multiallelic", "UCE_100bp", "UCE_200bp"), noncoding_cols)] 
 
     variants <- variants[order(variants$Position),]
 
@@ -828,15 +835,15 @@ clinical_report <- function(project,samples,type){
     # for clinical, only keep variants where one of the alt depths was >= 20
     full_report <- dplyr::filter_at(full_report, paste0("Alt_depths.",samples), any_vars(as.integer(.) >= 20))    
     filtered_report <- subset(full_report, 
-               Quality > 1000 & Gnomad_af_popmax < 0.005 & C4R_WES_counts < 6,
+               Quality > 1000 & Gnomad_af_grpmax < 0.005 & C4R_WES_counts < 6,
                select = c("Position", "GNOMAD_Link", "Ref", "Alt", "Gene", paste0("Zygosity.", samples), 
                           paste0("Burden.",samples),
                           paste0("Alt_depths.",samples),
                         "Variation", "Info", "Refseq_change", "omim_phenotype", "omim_inheritance",
                         "Orphanet", "Clinvar", "C4R_WES_counts",
-                        "Gnomad_af_popmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom",
+                        "Gnomad_af_grpmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom","Gnomad_male_ac","Gnomad_fafmax_faf95_max","Gnomad_filter",
                         "Sift_score", "Polyphen_score", "Cadd_score", "Vest4_score", "Revel_score",
-                        "Imprinting_status", "Pseudoautosomal", "Gnomad_male_ac", "UCE_100bp","UCE_200bp")
+                        "Imprinting_status", "Pseudoautosomal", "UCE_100bp","UCE_200bp")
                )
 
     # recalculate burden using the filtered report
@@ -860,9 +867,9 @@ clinical_report <- function(project,samples,type){
       paste0("Burden.", samples),
       "Variation", "Info", "Refseq_change", "omim_phenotype", "omim_inheritance",
       "Orphanet", "Clinvar", "C4R_WES_counts",
-      "Gnomad_af_popmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom",
+      "Gnomad_af_grpmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom", "Gnomad_male_ac","Gnomad_fafmax_faf95_max","Gnomad_filter",
       "Sift_score", "Polyphen_score", "Cadd_score", "Vest4_score", "Revel_score",
-      "Imprinting_status", "Pseudoautosomal", "Gnomad_male_ac", "UCE_100bp", "UCE_200bp")]
+      "Imprinting_status", "Pseudoautosomal", "UCE_100bp", "UCE_200bp")]
 
     write.csv(filtered_report, paste0(project, ".clinical.", type, ".", datetime, ".csv"), row.names = F)
 }