Skip to content

Commit

Permalink
Merge pull request #261 from Ferlab-Ste-Justine/feat/clin-3562
Browse files Browse the repository at this point in the history
feat: CLIN-3562 add gnomad v4 genomes to variants
  • Loading branch information
meek0 authored Jan 16, 2025
2 parents 2f04ec4 + a08583f commit ef051f6
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import java.time.LocalDateTime

/**
* This ETL creates an aggregated table on occurrences of SNV variants. Occurrences are aggregated by calculating the frequencies specified in parameter frequencies.
* The table is enriched with information from other datasets such as genes, dbsnp, clinvar, 1000 genomes, topmed_bravo, gnomad_genomes_v2, gnomad_exomes_v2, gnomad_genomes_v3.
* The table is enriched with information from other datasets such as genes, dbsnp, clinvar, 1000 genomes, topmed_bravo, gnomad_genomes_v2, gnomad_exomes_v2, gnomad_genomes_v3, gnomad_genomes_v4.
*
* @param participantId column used to distinguish participants in order to calculate the total number of participants (pn) and the total allele number (an)
* @param affectedStatus column used to calculate frequencies for affected / unaffected participants
Expand All @@ -42,6 +42,7 @@ case class Variants(rc: RuntimeETLContext, participantId: Column = col("particip
protected val gnomad_genomes_v2: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v2_1_1")
protected val gnomad_exomes_v2: DatasetConf = conf.getDataset("normalized_gnomad_exomes_v2_1_1")
protected val gnomad_genomes_v3: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v3")
protected val gnomad_genomes_v4: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v4")
protected val dbsnp: DatasetConf = conf.getDataset("normalized_dbsnp")
protected val clinvar: DatasetConf = conf.getDataset("normalized_clinvar")
protected val genes: DatasetConf = conf.getDataset("enriched_genes")
Expand Down Expand Up @@ -88,7 +89,7 @@ case class Variants(rc: RuntimeETLContext, participantId: Column = col("particip

variantsCheckpoint
.withFrequencies(participantId, affectedStatus, snv, splits, checkpoint)
.withPopulations(data(thousand_genomes.id), data(topmed_bravo.id), data(gnomad_genomes_v2.id), data(gnomad_exomes_v2.id), data(gnomad_genomes_v3.id))
.withPopulations(data(thousand_genomes.id), data(topmed_bravo.id), data(gnomad_genomes_v2.id), data(gnomad_exomes_v2.id), data(gnomad_genomes_v3.id), data(gnomad_genomes_v4.id))
.withDbSNP(data(dbsnp.id))
.withClinvar(data(clinvar.id))
.withGenes(data(genes.id))
Expand Down Expand Up @@ -131,6 +132,7 @@ object Variants {
val conditionValueMap: List[(Column, String)] = List(
$"clinvar".isNotNull -> "Clinvar",
$"cmc".isNotNull -> "Cosmic",
$"external_frequencies.gnomad_genomes_4".isNotNull -> "gnomADv4",
)
val dfWithVariantExternalReference = conditionValueMap.foldLeft {
df.withColumn(outputColumn, when($"rsnumber".isNotNull, array(lit("DBSNP"))).otherwise(array()))
Expand All @@ -151,7 +153,8 @@ object Variants {
topmed: DataFrame,
gnomadGenomesV2: DataFrame,
gnomadExomesV2: DataFrame,
gnomadGenomesV3: DataFrame)(implicit spark: SparkSession): DataFrame = {
gnomadGenomesV3: DataFrame,
gnomadGenomesV4: DataFrame)(implicit spark: SparkSession): DataFrame = {
import spark.implicits._
val shapedThousandGenomes = thousandGenomes
.selectLocus($"ac".cast("long"), $"af", $"an".cast("long"))
Expand All @@ -166,20 +169,23 @@ object Variants {
val shapedGnomadGenomesV2 = gnomadGenomesV2.selectLocus($"ac".cast("long"), $"af", $"an".cast("long"), $"hom".cast("long"))
val shapedGnomadExomesV2 = gnomadExomesV2.selectLocus($"ac".cast("long"), $"af", $"an".cast("long"), $"hom".cast("long"))
val shapedGnomadGenomesV3 = gnomadGenomesV3.selectLocus($"ac".cast("long"), $"af", $"an".cast("long"), $"nhomalt".cast("long") as "hom")
val shapedGnomadGenomesV4 = gnomadGenomesV4.selectLocus($"ac", $"af", $"an", $"hom")

df
.joinAndMerge(shapedThousandGenomes, "thousand_genomes", "left")
.joinAndMerge(shapedTopmed, "topmed_bravo", "left")
.joinAndMerge(shapedGnomadGenomesV2, "gnomad_genomes_2_1_1", "left")
.joinAndMerge(shapedGnomadExomesV2, "gnomad_exomes_2_1_1", "left")
.joinAndMerge(shapedGnomadGenomesV3, "gnomad_genomes_3", "left")
.joinAndMerge(shapedGnomadGenomesV4, "gnomad_genomes_4", "left")
.select(df("*"),
struct(
col("thousand_genomes"),
col("topmed_bravo"),
col("gnomad_genomes_2_1_1"),
col("gnomad_exomes_2_1_1"),
col("gnomad_genomes_3")) as "external_frequencies")
col("gnomad_genomes_3"),
col("gnomad_genomes_4")) as "external_frequencies")
}

def withDbSNP(dbsnp: DataFrame): DataFrame = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class EnrichedVariantsSpec extends SparkSpec with WithTestConfig {
val gnomad_genomes_v2_1_1: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v2_1_1")
val gnomad_exomes_v2_1_1: DatasetConf = conf.getDataset("normalized_gnomad_exomes_v2_1_1")
val gnomad_genomes_v3: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v3")
val gnomad_genomes_v4: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v4")
val dbsnp: DatasetConf = conf.getDataset("normalized_dbsnp")
val clinvar: DatasetConf = conf.getDataset("normalized_clinvar")
val genes: DatasetConf = conf.getDataset("enriched_genes")
Expand All @@ -40,6 +41,7 @@ class EnrichedVariantsSpec extends SparkSpec with WithTestConfig {
val gnomad_genomes_2_1_1Df: DataFrame = Seq(NormalizedGnomadGenomes211()).toDF
val gnomad_exomes_2_1_1Df: DataFrame = Seq(NormalizedGnomadExomes211()).toDF
val gnomad_genomes_3Df: DataFrame = Seq(NormalizedGnomadGenomes3()).toDF
val gnomad_genomes_4Df: DataFrame = Seq(NormalizedGnomadGenomes4()).toDF
val dbsnpDf: DataFrame = Seq(NormalizedDbsnp()).toDF
val clinvarDf: DataFrame = Seq(NormalizedClinvar(chromosome = "1", start = 69897, reference = "T", alternate = "C")).toDF
val genesDf: DataFrame = Seq(EnrichedGenes()).toDF()
Expand All @@ -56,6 +58,7 @@ class EnrichedVariantsSpec extends SparkSpec with WithTestConfig {
gnomad_genomes_v2_1_1.id -> gnomad_genomes_2_1_1Df,
gnomad_exomes_v2_1_1.id -> gnomad_exomes_2_1_1Df,
gnomad_genomes_v3.id -> gnomad_genomes_3Df,
gnomad_genomes_v4.id -> gnomad_genomes_4Df,
dbsnp.id -> dbsnpDf,
clinvar.id -> clinvarDf,
genes.id -> genesDf,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ case class EnrichedVariant(chromosome: String = "1",
dna_change: String = "T>C",
genes: List[GENES] = List(GENES()),
cmc: CMC = CMC(),
variant_external_reference: List[String] = List("DBSNP", "Clinvar", "Cosmic"),
variant_external_reference: List[String] = List("DBSNP", "Clinvar", "Cosmic", "gnomADv4"),
gene_external_reference: List[String] = List("HPO", "Orphanet", "OMIM", "DDD", "Cosmic", "gnomAD", "SpliceAI"),
)

Expand All @@ -38,7 +38,8 @@ object EnrichedVariant {
topmed_bravo: TopmedFreq = TopmedFreq(2, 125568, 0.0000159276, 0, 2),
gnomad_genomes_2_1_1: GnomadFreqOutput = GnomadFreqOutput(1, 26342, 0.000037962189659099535, 0),
gnomad_exomes_2_1_1: GnomadFreqOutput = GnomadFreqOutput(0, 2, 0.0, 0),
gnomad_genomes_3: GnomadFreqOutput = GnomadFreqOutput(10, 20, 0.5, 10))
gnomad_genomes_3: GnomadFreqOutput = GnomadFreqOutput(10, 20, 0.5, 10),
gnomad_genomes_4: GnomadFreqOutput = GnomadFreqOutput(2, 20, 2.0, 10))


case class ThousandGenomesFreq(ac: Long = 10,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package bio.ferlab.datalake.testutils.models.normalized

/**
 * Test model for one row of the normalized gnomAD genomes v4 dataset.
 *
 * Every field carries a default, so `NormalizedGnomadGenomes4()` yields a
 * ready-made row and individual fields can be overridden by name.
 *
 * NOTE(review): the default `af` (2.0) is not `ac / an` (2 / 20) and lies
 * outside the 0..1 range of a frequency; likewise `hom` (10) exceeds `ac` (2).
 * Presumably these are arbitrary, easily recognisable placeholders — confirm
 * before treating them as realistic values.
 *
 * @param chromosome chromosome name of the locus
 * @param start      start position of the locus
 * @param end        end position of the locus
 * @param reference  reference allele
 * @param alternate  alternate allele
 * @param qual       `qual` column of the row
 * @param name       `name` column of the row
 * @param ac         `ac` column (presumably allele count)
 * @param af         `af` column (presumably allele frequency)
 * @param an         `an` column (presumably allele number)
 * @param hom        `hom` column (presumably homozygote count)
 */
case class NormalizedGnomadGenomes4(chromosome: String = "1",
                                    start: Long = 69897,
                                    end: Long = 69899,
                                    reference: String = "T",
                                    alternate: String = "C",
                                    qual: Double = 0.0,
                                    name: String = "BRAF",
                                    ac: Long = 2,
                                    af: Double = 2.0,
                                    an: Long = 20,
                                    hom: Long = 10)
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ case class PreparedVariantCentric(`chromosome`: String = "1",
noGene(Seq(CONSEQUENCES(`ensembl_transcript_id` = "transcript2")))
),
`cmc`: CMC = CMC(),
`variant_external_reference`: Seq[String] = Seq("DBSNP", "Clinvar", "Cosmic"),
`variant_external_reference`: Seq[String] = Seq("DBSNP", "Clinvar", "Cosmic", "gnomADv4"),
`gene_external_reference`: Seq[String] = Seq("HPO", "Orphanet", "OMIM", "DDD", "Cosmic", "gnomAD", "SpliceAI"))

object PreparedVariantCentric {
Expand All @@ -43,7 +43,8 @@ object PreparedVariantCentric {
`topmed_bravo`: TOPMED_BRAVO = TOPMED_BRAVO(),
`gnomad_genomes_2_1_1`: GNOMAD_GENOMES_2_1_1 = GNOMAD_GENOMES_2_1_1(),
`gnomad_exomes_2_1_1`: GNOMAD_EXOMES_2_1_1 = GNOMAD_EXOMES_2_1_1(),
`gnomad_genomes_3`: GNOMAD_GENOMES_3 = GNOMAD_GENOMES_3())
`gnomad_genomes_3`: GNOMAD_GENOMES_3 = GNOMAD_GENOMES_3(),
`gnomad_genomes_4`: GNOMAD_GENOMES_4 = GNOMAD_GENOMES_4())

/** Single-field model holding a `disease_name`; defaults to "OCULOAURICULAR SYNDROME". */
case class DDD(
  `disease_name`: String = "OCULOAURICULAR SYNDROME"
)

Expand Down Expand Up @@ -119,6 +120,8 @@ object PreparedVariantCentric {
`af`: Double = 0.5,
`hom`: Long = 10)

/**
 * Shape of the `gnomad_genomes_4` frequency struct; every field has a default.
 *
 * NOTE(review): the default `af` (2.0) is not `ac / an` (2 / 20) — presumably an
 * arbitrary placeholder shared with the input fixture; confirm.
 */
case class GNOMAD_GENOMES_4(`ac`: Long = 2,
                            `an`: Long = 20,
                            `af`: Double = 2.0,
                            `hom`: Long = 10)

/** Single-field model holding `tumour_types_germline`, a sequence of strings with three default entries. */
case class COSMIC(
  `tumour_types_germline`: Seq[String] = Seq(
    "breast",
    "colon",
    "endometrial cancer under age 50"
  )
)

case class EXON(`rank`: String = "1",
Expand Down

0 comments on commit ef051f6

Please sign in to comment.