Skip to content

Commit

Permalink
Merge branch 'develop' into bugfix/KBDEV-994-graphkb-signature-catego…
Browse files Browse the repository at this point in the history
…ryvariant-displaying-reference-version
  • Loading branch information
mathieulemieux authored Sep 20, 2024
2 parents 93ae50d + cf2606e commit 8565cc1
Show file tree
Hide file tree
Showing 58 changed files with 8,998 additions and 3,055 deletions.
157 changes: 95 additions & 62 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from textwrap import dedent

CONTAINER = 'docker://bcgsc/pori-graphkb-loader:v6.2.0'
CONTAINER = 'docker://bcgsc/pori-graphkb-loader:v6.4.0'
DATA_DIR = 'snakemake_data'
LOGS_DIR = 'snakemake_logs'

Expand All @@ -26,21 +26,21 @@ COSMIC_EMAIL = config.get('cosmic_email')
COSMIC_PASSWORD = config.get('cosmic_password')
USE_COSMIC = COSMIC_EMAIL or COSMIC_PASSWORD
BACKFILL_TRIALS = config.get('trials')
USE_FDA_UNII = config.get('fda') # due to the non-scriptable download, making FDA optional
GITHUB_DATA = 'https://raw.githubusercontent.com/bcgsc/pori_graphkb_loader/develop/data'


rule all:
input: f'{DATA_DIR}/civic.COMPLETE',
f'{DATA_DIR}/cgi.COMPLETE',
f'{DATA_DIR}/docm.COMPLETE',
f'{DATA_DIR}/dgidb.COMPLETE',
f'{DATA_DIR}/PMC4468049.COMPLETE',
f'{DATA_DIR}/PMC4232638.COMPLETE',
f'{DATA_DIR}/uberon.COMPLETE',
f'{DATA_DIR}/fdaApprovals.COMPLETE',
f'{DATA_DIR}/cancerhotspots.COMPLETE',
f'{DATA_DIR}/moa.COMPLETE',
*([f'{DATA_DIR}/ncitFdaXref.COMPLETE'] if USE_FDA_UNII else []),
f'{DATA_DIR}/ncitFdaXref.COMPLETE',
*([f'{DATA_DIR}/clinicaltrialsgov.COMPLETE'] if BACKFILL_TRIALS else []),
*([f'{DATA_DIR}/cosmic_resistance.COMPLETE', f'{DATA_DIR}/cosmic_fusions.COMPLETE'] if USE_COSMIC else [])

Expand All @@ -55,34 +55,32 @@ rule download_ncit:
rm -rf __MACOSX''')


if USE_FDA_UNII:
rule download_ncit_fda:
output: f'{DATA_DIR}/ncit/FDA-UNII_NCIt_Subsets.txt'
shell: dedent(f'''\
cd {DATA_DIR}/ncit
wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt''')
rule download_ncit_fda:
output: f'{DATA_DIR}/ncit/FDA-UNII_NCIt_Subsets.txt'
shell: dedent(f'''\
cd {DATA_DIR}/ncit
wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt''')


rule download_ensembl:
output: f'{DATA_DIR}/ensembl/biomart_export.tsv'
shell: dedent(f'''\
cd {DATA_DIR}/ensembl
query_string='<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "transcript_biotype" value = "protein_coding"/><Attribute name = "ensembl_gene_id" /><Attribute name = "ensembl_gene_id_version" /><Attribute name = "ensembl_transcript_id" /><Attribute name = "ensembl_transcript_id_version" /><Attribute name = "hgnc_id" /><Attribute name = "refseq_mrna" /><Attribute name = "description" /><Attribute name = "external_gene_name" /><Attribute name = "external_gene_source" /></Dataset></Query>'
query_string='<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "transcript_biotype" value = "protein_coding"/><Attribute name = "ensembl_gene_id" /><Attribute name = "ensembl_gene_id_version" /><Attribute name = "ensembl_transcript_id" /><Attribute name = "ensembl_transcript_id_version" /><Attribute name = "ensembl_peptide_id" /><Attribute name = "ensembl_peptide_id_version" /><Attribute name = "hgnc_id" /><Attribute name = "refseq_mrna" /><Attribute name = "description" /><Attribute name = "external_gene_name" /><Attribute name = "external_gene_source" /></Dataset></Query>'
wget -O biomart_export.tsv "http://www.ensembl.org/biomart/martservice?query=$query_string"
''')


if USE_FDA_UNII:
rule download_fda_srs:
output: f'{DATA_DIR}/fda/UNII_Records.txt'
shell: dedent(f'''\
cd {DATA_DIR}/fda
wget https://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip
unzip UNII_Data.zip
rm UNII_Data.zip
rule download_fda_srs:
output: f'{DATA_DIR}/fda/UNII_Records.txt'
shell: dedent(f'''\
cd {DATA_DIR}/fda
wget https://precision.fda.gov/uniisearch/archive/latest/UNII_Data.zip
unzip UNII_Data.zip
rm UNII_Data.zip
mv UNII*.txt UNII_Records.txt
''')
mv UNII*.txt UNII_Records.txt
''')


rule download_refseq:
Expand Down Expand Up @@ -146,7 +144,7 @@ rule download_cgi:
output: f'{DATA_DIR}/cgi/cgi_biomarkers_per_variant.tsv'
shell: dedent(f'''\
cd {DATA_DIR}/cgi
wget https://www.cancergenomeinterpreter.org/data/cgi_biomarkers_20180117.zip
wget https://www.cancergenomeinterpreter.org/data/biomarkers/cgi_biomarkers_20180117.zip
unzip cgi_biomarkers_20180117.zip
''')

Expand All @@ -162,19 +160,13 @@ rule download_local_data:
rule download_cancerhotspots:
output: f'{DATA_DIR}/cancerhotspots/cancerhotspots.v2.maf'
shell: dedent(f'''\
mkdir -p {DATA_DIR}/cancerhotspots
cd {DATA_DIR}/cancerhotspots
wget http://download.cbioportal.org/cancerhotspots/cancerhotspots.v2.maf.gz
wget https://cbioportal-download.s3.amazonaws.com/cancerhotspots.v2.maf.gz
gunzip cancerhotspots.v2.maf.gz
''')


rule download_clinicaltrialsgov:
output: directory(f'{DATA_DIR}/clinicaltrialsgov')
shell: dedent(f'''\
cd {DATA_DIR}/clinicaltrialsgov
wget https://clinicaltrials.gov/AllPublicXML.zip
unzip AllPublicXML.zip''')


rule download_cosmic_resistance:
output: f'{DATA_DIR}/cosmic/CosmicResistanceMutations.tsv'
Expand Down Expand Up @@ -228,24 +220,24 @@ rule load_ncit:
shell: LOADER_COMMAND + ' file ncit {input.data} &> {log}; cp {log} {output}'


if USE_FDA_UNII:
rule load_fda_srs:
input: expand(rules.load_local.output, local=['vocab']),
data=f'{DATA_DIR}/fda/UNII_Records.txt'
container: CONTAINER
log: f'{LOGS_DIR}/fdaSrs.logs.txt'
output: f'{DATA_DIR}/fdaSrs.COMPLETE'
shell: LOADER_COMMAND + ' file fdaSrs {input.data} &> {log}; cp {log} {output}'
rule load_fda_srs:
input: expand(rules.load_local.output, local=['vocab']),
f'{DATA_DIR}/ncit.COMPLETE',
data=f'{DATA_DIR}/fda/UNII_Records.txt'
container: CONTAINER
log: f'{LOGS_DIR}/fdaSrs.logs.txt'
output: f'{DATA_DIR}/fdaSrs.COMPLETE'
shell: LOADER_COMMAND + ' file fdaSrs {input.data} &> {log}; cp {log} {output}'


rule load_ncit_fda:
input: rules.load_ncit.output,
rules.load_fda_srs.output,
data=rules.download_ncit_fda.output
container: CONTAINER
log: f'{LOGS_DIR}/ncitFdaXref.logs.txt'
output: f'{DATA_DIR}/ncitFdaXref.COMPLETE'
shell: LOADER_COMMAND + ' file ncitFdaXref {input.data} &> {log}; cp {log} {output}'
rule load_ncit_fda:
input: rules.load_ncit.output,
rules.load_fda_srs.output,
data=rules.download_ncit_fda.output
container: CONTAINER
log: f'{LOGS_DIR}/ncitFdaXref.logs.txt'
output: f'{DATA_DIR}/ncitFdaXref.COMPLETE'
shell: LOADER_COMMAND + ' file ncitFdaXref {input.data} &> {log}; cp {log} {output}'


rule load_refseq:
Expand Down Expand Up @@ -285,7 +277,7 @@ rule load_uberon:


rule load_drugbank:
input: rules.load_fda_srs.output if USE_FDA_UNII else [],
input: rules.load_fda_srs.output,
data=rules.download_drugbank.output
container: CONTAINER
log: f'{LOGS_DIR}/drugbank.logs.txt'
Expand All @@ -301,18 +293,9 @@ rule load_oncotree:
shell: LOADER_COMMAND + ' api oncotree &> {log}; cp {log} {output}'


rule load_dgidb:
input: rules.load_local.output
container: CONTAINER
log: f'{LOGS_DIR}/dgidb.logs.txt'
output: f'{DATA_DIR}/dgidb.COMPLETE'
shell: LOADER_COMMAND + ' api dgidb &> {log}; cp {log} {output}'


def get_drug_inputs(wildcards):
inputs = [*rules.load_ncit.output]
if USE_FDA_UNII:
inputs.extend(rules.load_fda_srs.output)
inputs.extend(rules.load_fda_srs.output)
container: CONTAINER
if USE_DRUGBANK:
inputs.append(*rules.load_drugbank.output)
Expand All @@ -322,7 +305,7 @@ def get_drug_inputs(wildcards):
rule all_drugs:
input: lambda wildcards: get_drug_inputs(wildcards)
container: CONTAINER
output: f'{LOGS_DIR}/all_drugs.COMPLETE'
output: f'{DATA_DIR}/all_drugs.COMPLETE'
shell: 'touch {output}'


Expand All @@ -331,10 +314,26 @@ rule all_diseases:
rules.load_ncit.output,
rules.load_oncotree.output
container: CONTAINER
output: f'{LOGS_DIR}/all_diseases.COMPLETE'
output: f'{DATA_DIR}/all_diseases.COMPLETE'
shell: 'touch {output}'


# Aggregate target for the locally bundled data sets: it requires the
# `load_local` output for each `local` value listed below and then touches a
# marker file. A log path is declared, but the shell command itself does not
# write to it.
rule all_local:
    input:
        expand(
            rules.load_local.output,
            local=['vocab', 'signatures', 'chromosomes', 'evidenceLevels', 'aacr', 'asco']
        ),
    output:
        f'{DATA_DIR}/all_local.COMPLETE'
    log:
        f'{LOGS_DIR}/all_local.logs.txt'
    container:
        CONTAINER
    shell:
        'touch {output}'


# Runs the loader's `api dgidb` command once the `all_local` marker exists.
# Both stdout and stderr of the loader go to the log file; the log is then
# copied to the .COMPLETE marker that downstream rules depend on.
rule load_dgidb:
    input:
        rules.all_local.output
    output:
        f'{DATA_DIR}/dgidb.COMPLETE'
    log:
        f'{LOGS_DIR}/dgidb.logs.txt'
    container:
        CONTAINER
    shell:
        LOADER_COMMAND + ' api dgidb &> {log}; cp {log} {output}'


rule load_cancerhotspots:
input: expand(rules.load_local.output, local=['vocab', 'signatures', 'chromosomes']),
rules.load_oncotree.output,
Expand Down Expand Up @@ -372,7 +371,7 @@ rule load_civic:
container: CONTAINER
log: f'{LOGS_DIR}/civic.logs.txt'
output: f'{DATA_DIR}/civic.COMPLETE'
shell: LOADER_COMMAND + ' api civic &> {log}; cp {log} {output}'
shell: LOADER_COMMAND + ' civic &> {log}; cp {log} {output}'


rule load_cgi:
Expand All @@ -397,6 +396,7 @@ rule load_docm:


rule load_approvals:
input:
container: CONTAINER
log: f'{LOGS_DIR}/fdaApprovals.logs.txt'
output: f'{DATA_DIR}/fdaApprovals.COMPLETE'
Expand All @@ -406,12 +406,11 @@ rule load_approvals:
rule load_clinicaltrialsgov:
input: expand(rules.load_local.output, local=['vocab']),
rules.all_diseases.output,
rules.all_drugs.output,
data=rules.download_clinicaltrialsgov.output
rules.all_drugs.output
container: CONTAINER
log: f'{LOGS_DIR}/clinicaltrialsgov.logs.txt'
output: f'{DATA_DIR}/clinicaltrialsgov.COMPLETE'
shell: LOADER_COMMAND + ' api clinicaltrialsgov &> {log}; cp {log} {output}'
shell: LOADER_COMMAND + ' clinicaltrialsgov &> {log}; cp {log} {output}'


rule load_cosmic_resistance:
Expand Down Expand Up @@ -443,3 +442,37 @@ rule load_moa:
log: f'{LOGS_DIR}/load_moa.logs.txt'
output: f'{DATA_DIR}/moa.COMPLETE'
shell: LOADER_COMMAND + ' api moa &> {log}; cp {log} {output}'


# The `sources` loader is invoked as a file-type loader, so it must be handed a
# file path even though the input isn't actually needed; this rule just creates
# an empty placeholder file for that purpose.
rule download_sources:
    output:
        f'{DATA_DIR}/local/sources.json'
    shell:
        dedent(f'''\
            cd {DATA_DIR}/local
            touch sources.json
            ''')

# Runs the loader's `file sources` command against the placeholder
# sources.json. Loader output (stdout and stderr) is captured in the log, and
# the log is then copied to the .COMPLETE marker.
rule load_sources:
    input:
        f'{DATA_DIR}/local/sources.json'
    output:
        f'{DATA_DIR}/sources.COMPLETE'
    log:
        f'{LOGS_DIR}/sources.logs.txt'
    container:
        CONTAINER
    shell:
        LOADER_COMMAND + ' file sources {input} &> {log}; cp {log} {output}'


# Aggregate target: requires the outputs of the local, ontology, drug, disease
# and cross-reference load rules listed below, then touches a single marker
# file. The command does not read any of the inputs; they only impose ordering.
rule all_ontologies:
    input:
        expand(
            rules.load_local.output,
            local=['vocab', 'signatures', 'chromosomes', 'evidenceLevels', 'aacr', 'asco']
        ),
        rules.load_oncotree.output,
        rules.load_ensembl.output,
        rules.all_drugs.output,
        rules.all_diseases.output,
        rules.load_uberon.output,
        rules.load_approvals.output,
        rules.load_ncit.output,
        rules.load_sources.output,
        rules.load_fda_srs.output,
        rules.load_ncit_fda.output,
        rules.load_dgidb.output
    output:
        f'{DATA_DIR}/all_ontologies.COMPLETE'
    container:
        CONTAINER
    shell:
        'touch {output}'
Loading

0 comments on commit 8565cc1

Please sign in to comment.