Snakemake data loader maintenance #166

Open · wants to merge 1 commit into base: feat/external-deployment
10 changes: 8 additions & 2 deletions Dockerfile.snakemake
@@ -1,7 +1,9 @@
-FROM node:16 as node_stage
+FROM node:17 AS node_stage
WORKDIR /usr/src/app
# Bundle app source
COPY package*.json ./
RUN rm -rf node_modules
-RUN npm install
+RUN npm ci --only=production
# COPY everything not in dockerignore file
COPY . .
@@ -33,6 +35,10 @@ RUN chmod 777 /usr/src/app/snakemake_logs
# Copy the Snakefile to the working directory (assuming it's already in the build context)
COPY . .

+# Still getting NODE version errors, but a rebuild is quick and helps.
+RUN npm rebuild

# Run the Snakefile using Snakemake
-CMD [ "sh", "-c", "snakemake --debug -j 1 --config gkb_url=$GKB_URL gkb_user=$GKB_USER gkb_pass=$GKB_PASS --until $GKB_LOADER"]
+#CMD [ "sh", "-c", "snakemake --debug -j 1 --config gkb_url=$GKB_URL gkb_user=$GKB_USER gkb_pass=$GKB_PASS --until $GKB_LOADER"]
+CMD [ "sh", "-c", "snakemake --debug -j 1 --config gkb_url=$GKB_URL gkb_user=$GKB_USER gkb_pass=$GKB_PASS"]
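Note on the Dockerfile changes above: unlike `npm install`, `npm ci` does a clean, lockfile-exact install and deletes any existing node_modules itself, so the earlier `RUN rm -rf node_modules` is likely redundant now; on npm 8+ the preferred spelling of `--only=production` is `--omit=dev`. The `npm rebuild` step recompiles native addons (node-expat, libxmljs) against the Node ABI inside the image, which is the usual fix for the NODE_MODULE_VERSION mismatch errors mentioned in the comment. The new CMD also drops `--until $GKB_LOADER`, so the container always runs the full pipeline; the selective form survives only as the commented-out line. A sketch of the install step on newer npm (an assumption, not part of this commit):

RUN npm ci --omit=dev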

106 changes: 44 additions & 62 deletions Snakefile
@@ -1,7 +1,6 @@
import os
-from textwrap import dedent

-CONTAINER = 'docker://bcgsc/pori-graphkb-loader:v6.4.0'
+CONTAINER = 'bcgsc/pori-graphkb-loader'
DATA_DIR = 'snakemake_data'
LOGS_DIR = 'snakemake_logs'
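Note on CONTAINER: the old value used the `docker://` scheme that Snakemake's singularity integration expects and pinned the image to v6.4.0; the new bare `bcgsc/pori-graphkb-loader` is an unpinned Docker Hub reference, so runs will track whatever tag the runtime resolves (typically latest). Worth confirming the unpinning is intentional.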

@@ -57,72 +56,64 @@ rule all:

rule download_ncit:
output: f'{DATA_DIR}/ncit/Thesaurus.txt',
-shell: dedent(f'''\
-cd {DATA_DIR}/ncit
-wget https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus.FLAT.zip
-unzip Thesaurus.FLAT.zip
-rm Thesaurus.FLAT.zip
-rm -rf __MACOSX''')
+shell: f'''
+mkdir -p {DATA_DIR}/ncit
+curl https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus.FLAT.zip | zcat > {DATA_DIR}/ncit/Thesaurus.txt
+rm -rf {DATA_DIR}/ncit/__MACOSX'''
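Note on download_ncit: gzip's `zcat` only inflates the first member of a .zip archive, so the pipe works only if Thesaurus.FLAT.zip ships the flat file as its first (ideally only) member; and since nothing is extracted to disk any more, the `rm -rf .../__MACOSX` line is likely a leftover no-op. A member-safe alternative (a sketch, assuming the member is named Thesaurus.txt):

curl -sL https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus.FLAT.zip -o /tmp/Thesaurus.FLAT.zip
unzip -p /tmp/Thesaurus.FLAT.zip Thesaurus.txt > {DATA_DIR}/ncit/Thesaurus.txt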


rule download_ncit_fda:
output: f'{DATA_DIR}/ncit/FDA-UNII_NCIt_Subsets.txt'
-shell: dedent(f'''\
+shell: f'''
cd {DATA_DIR}/ncit
-wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt''')
+wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt'''
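Most of the remaining Snakefile hunks are this same mechanical change: `shell: dedent(f'''\ ... ''')` becomes a bare `shell: f''' ... '''`, which is why `from textwrap import dedent` is removed at the top of the file. Snakemake hands the block to bash either way; the leading whitespace that dedent() was stripping is harmless.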


rule download_ensembl:
output: f'{DATA_DIR}/ensembl/biomart_export.tsv'
-shell: dedent(f'''\
+shell: f'''
cd {DATA_DIR}/ensembl
query_string='<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "transcript_biotype" value = "protein_coding"/><Attribute name = "ensembl_gene_id" /><Attribute name = "ensembl_gene_id_version" /><Attribute name = "ensembl_transcript_id" /><Attribute name = "ensembl_transcript_id_version" /><Attribute name = "ensembl_peptide_id" /><Attribute name = "ensembl_peptide_id_version" /><Attribute name = "hgnc_id" /><Attribute name = "refseq_mrna" /><Attribute name = "description" /><Attribute name = "external_gene_name" /><Attribute name = "external_gene_source" /></Dataset></Query>'
wget -O biomart_export.tsv "http://www.ensembl.org/biomart/martservice?query=$query_string"
-''')
+'''


rule download_fda_srs:
output: f'{DATA_DIR}/fda/UNII_Records.txt'
-shell: dedent(f'''\
-cd {DATA_DIR}/fda
-wget https://precision.fda.gov/uniisearch/archive/latest/UNII_Data.zip
-unzip UNII_Data.zip
-rm UNII_Data.zip
-
-mv UNII*.txt UNII_Records.txt
-''')
+shell: f'''
+curl -L --create-dirs -o {DATA_DIR}/fda/UNII_Data.zip https://precision.fda.gov/uniisearch/archive/latest/UNII_Data.zip
+unzip -o -d {DATA_DIR}/fda {DATA_DIR}/fda/UNII_Data.zip
+rm {DATA_DIR}/fda/UNII_Data.zip
+mv {DATA_DIR}/fda/UNII*.txt {DATA_DIR}/fda/UNII_Records.txt
+'''
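Note on the rewritten download rules: the pattern `curl -L --create-dirs -o <dest> <url>` creates the destination directory itself, so a rule no longer depends on `{DATA_DIR}/<source>` existing before a `cd`, and `unzip -o` overwrites stale extractions, keeping re-runs idempotent.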


rule download_refseq:
output: f'{DATA_DIR}/refseq/LRG_RefSeqGene.tab'
-shell: dedent(f'''\
+shell: f'''
cd {DATA_DIR}/refseq
wget -O LRG_RefSeqGene.tab ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
-''')
+'''


rule download_uberon:
output: f'{DATA_DIR}/uberon/uberon.owl'
-shell: dedent(f'''\
-cd {DATA_DIR}/uberon
-wget http://purl.obolibrary.org/obo/uberon.owl
-''')
+shell: f'''
+curl -L --create-dirs -o {DATA_DIR}/uberon/uberon.owl https://github.com/obophenotype/uberon/releases/latest/download/uberon.owl
+'''


rule download_do:
output: f'{DATA_DIR}/do/doid.json'
-shell: dedent(f'''\
-cd {DATA_DIR}/do;
-REPO=https://github.com/DiseaseOntology/HumanDiseaseOntology.git;
-LATEST=$(git ls-remote $REPO --tags v\\* | cut -f 2 | sed 's/refs\\/tags\///' | grep '\\bv[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\\b' | sort -d | tail -n 1)
-echo $LATEST
-wget https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/$LATEST/src/ontology/doid.json
-''')
+shell: f'''
+curl --create-dirs -o {DATA_DIR}/do/doid.json https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/refs/heads/main/src/ontology/doid.json
+'''
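Note on download_do: the old rule resolved the latest vYYYY-MM-DD release tag via `git ls-remote` before downloading; the new one pins the main-branch copy of doid.json, which is simpler but means the file can drift between runs. If a reproducible snapshot matters, a tag can stay in the raw URL (a sketch; the tag name is a placeholder):

curl --create-dirs -o {DATA_DIR}/do/doid.json https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/vYYYY-MM-DD/src/ontology/doid.json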



rule download_drugbank:
output: f'{DATA_DIR}/drugbank/full_database.xml'
-shell: dedent(f'''\
+shell: f'''
cd {DATA_DIR}/drugbank
wget https://www.drugbank.ca/releases
latest=$(grep 'href="/releases/[^"]*"' -o releases | cut -f 3 -d/ | sed 's/"//' | sort -V | tail -n 2 | head -n 1)
@@ -131,86 +122,77 @@ rule download_drugbank:

curl -Lfv -o ${{filename}}.zip -u {DRUGBANK_EMAIL}:{DRUGBANK_PASSWORD} https://go.drugbank.com/releases/5-1-8/downloads/all-full-database
unzip ${{filename}}.zip
-mv full\ database.xml full_database.xml''')
+mv full\ database.xml full_database.xml'''
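Note on download_drugbank: the rule still scrapes the releases page to compute `latest`, but the visible curl call hardcodes release 5-1-8 in the URL, so the computed value appears unused (part of the rule is collapsed above, so this may be handled there). Wiring it back in would look something like (a sketch):

curl -Lfv -o ${{filename}}.zip -u {DRUGBANK_EMAIL}:{DRUGBANK_PASSWORD} https://go.drugbank.com/releases/${{latest}}/downloads/all-full-database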

rule download_PMC4468049:
output: f'{DATA_DIR}/PMC4468049/NIHMS632238-supplement-2.xlsx'
-shell: dedent(f'''\
-cd {DATA_DIR}/PMC4468049
-wget https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4468049/bin/NIHMS632238-supplement-2.xlsx
-''')
+shell: f''' curl --create-dirs -o {DATA_DIR}/PMC4468049/NIHMS632238-supplement-2.xlsx https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4468049/bin/NIHMS632238-supplement-2.xlsx'''


rule download_PMC4232638:
output: f'{DATA_DIR}/PMC4232638/13059_2014_484_MOESM2_ESM.xlsx'
-shell: dedent(f'''\
-cd {DATA_DIR}/PMC4232638
-wget https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4232638/bin/13059_2014_484_MOESM2_ESM.xlsx
-''')
+shell: f''' curl --create-dirs -o {DATA_DIR}/PMC4232638/13059_2014_484_MOESM2_ESM.xlsx https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4232638/bin/13059_2014_484_MOESM2_ESM.xlsx'''

rule download_cgi:
output: f'{DATA_DIR}/cgi/cgi_biomarkers_per_variant.tsv'
-shell: dedent(f'''\
-cd {DATA_DIR}/cgi
-wget https://www.cancergenomeinterpreter.org/data/biomarkers/cgi_biomarkers_20180117.zip
-unzip cgi_biomarkers_20180117.zip
-''')
+shell: f'''
+curl --create-dirs -o {DATA_DIR}/cgi/cgi_biomarkers.zip https://www.cancergenomeinterpreter.org/data/biomarkers/cgi_biomarkers_20180117.zip
+unzip -d {DATA_DIR}/cgi {DATA_DIR}/cgi/cgi_biomarkers.zip
+'''


rule download_local_data:
output: f'{DATA_DIR}/local/{{local}}.json'
-shell: dedent(f'''\
+shell: f'''
cd {DATA_DIR}/local
wget {GITHUB_DATA}/{{wildcards.local}}.json
-''')
+'''


rule download_cancerhotspots:
output: f'{DATA_DIR}/cancerhotspots/cancerhotspots.v2.maf'
-shell: dedent(f'''\
+shell: f'''
mkdir -p {DATA_DIR}/cancerhotspots
cd {DATA_DIR}/cancerhotspots
wget https://cbioportal-download.s3.amazonaws.com/cancerhotspots.v2.maf.gz
gunzip cancerhotspots.v2.maf.gz
-''')
+'''



rule download_cosmic_resistance:
output: f'{DATA_DIR}/cosmic/CosmicResistanceMutations.tsv'
-shell: dedent(f'''
+shell: f'''
cd {DATA_DIR}/cosmic
AUTH=$( echo "{COSMIC_EMAIL}:{COSMIC_PASSWORD}" | base64 )
resp=$( curl -H "Authorization: Basic $AUTH" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v92/CosmicResistanceMutations.tsv.gz );
url=$( node -e "var resp = $resp; console.log(resp.url);" );
curl "$url" -o CosmicResistanceMutations.tsv.gz
gunzip CosmicResistanceMutations.tsv.gz
-''')
+'''
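This rule and the two COSMIC rules below share a two-step download: an authenticated request returns a small JSON body whose `url` field is a presigned link, and the `node -e` one-liner just extracts that field before the second curl. The same flow in Node, using the node-fetch dependency already in package.json (a sketch; the function name is hypothetical):

const fetch = require('node-fetch');

// Resolve the presigned URL for a COSMIC file path such as
// 'GRCh38/cosmic/v92/CosmicResistanceMutations.tsv.gz'.
const cosmicDownloadUrl = async (path, email, password) => {
    const auth = Buffer.from(`${email}:${password}`).toString('base64');
    const resp = await fetch(
        `https://cancer.sanger.ac.uk/cosmic/file_download/${path}`,
        { headers: { Authorization: `Basic ${auth}` } },
    );
    const { url } = await resp.json(); // body has the shape {"url": "https://..."}
    return url;
};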


rule download_cosmic_diseases:
output: f'{DATA_DIR}/cosmic/classification.csv'
-shell: dedent(f'''
+shell: f'''
cd {DATA_DIR}/cosmic
AUTH=$( echo "{COSMIC_EMAIL}:{COSMIC_PASSWORD}" | base64 )
resp=$( curl -H "Authorization: Basic $AUTH" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v92/classification.csv );
url=$( node -e "var resp = $resp; console.log(resp.url);" );
curl "$url" -o classification.csv
-''')
+'''


rule download_cosmic_fusions:
output: f'{DATA_DIR}/cosmic/CosmicFusionExport.tsv'
-shell: dedent(f'''
+shell: f'''
cd {DATA_DIR}/cosmic
AUTH=$( echo "{COSMIC_EMAIL}:{COSMIC_PASSWORD}" | base64 )
resp=$( curl -H "Authorization: Basic $AUTH" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v92/CosmicFusionExport.tsv.gz );
url=$( node -e "var resp = $resp; console.log(resp.url);" );
curl "$url" -o CosmicFusionExport.tsv.gz
gunzip CosmicFusionExport.tsv.gz
-''')
+'''


rule load_local:
@@ -457,10 +439,10 @@ rule load_moa:
# input isn't actually needed but it is a file-type loader, so a dummy file must be supplied
rule download_sources:
output: f'{DATA_DIR}/local/sources.json'
-shell: dedent(f'''\
+shell: f'''
cd {DATA_DIR}/local
touch sources.json
-''')
+'''

rule load_sources:
input: f'{DATA_DIR}/local/sources.json'
9 changes: 6 additions & 3 deletions package.json
@@ -12,21 +12,24 @@
"private": true,
"license": "GPL-3",
"dependencies": {
"@bcgsc-pori/graphkb-parser": "^1.1.1",
"@bcgsc-pori/graphkb-schema": "^3.14.3",
"@bcgsc-pori/graphkb-parser": "^2.0.0",
"@bcgsc-pori/graphkb-schema": "^3.16.0",
"ajv": "^6.10.0",
"argparse": "^2.0.1",
"csv-parse": "^4.6.5",
"expat": "^1.0.0",
"fast-csv": "^4.3.6",
"html-to-text": "^5.1.1",
"http-status-codes": "^1.3.2",
"json-cycle": "^1.3.0",
"json-stable-stringify": "^1.0.1",
"jsonpath": "^1.1.1",
"jsonwebtoken": "^8.5.1",
"libxmljs": "^1.0.11",
"lodash": "^4.17.21",
"node-expat": "^2.3.18",
"node-expat": "^2.4.1",
"node-fetch": "^2.6.7",
"node-gyp": "^10.2.0",
"p-limit": "^3.1.0",
"parse5": "^5.1.1",
"rdflib": "^2.2.15",
4 changes: 2 additions & 2 deletions src/PMC4232638/index.js
@@ -1,5 +1,5 @@
const readXlsxFile = require('read-excel-file/node');
-const kbParser = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');

const { logger } = require('../logging');
const { rid } = require('../graphkb');
@@ -134,7 +134,7 @@ const uploadFile = async ({ conn, filename }) => {
logger.info(`loading: ${row.Gene}:${row['Amino acid change']}`);

try {
-const parsed = kbParser.variant.parse(`p.${row['Amino acid change']}`, false).toJSON();
+const parsed = jsonifyVariant(parseVariant(`p.${row['Amino acid change']}`, false) );
const [gene] = await _entrezGene.fetchAndLoadBySymbol(conn, row.Gene);
const relevance = await conn.getVocabularyTerm(row.relevance);
const evidence = await _pubmed.fetchAndLoadByIds(conn, row.evidence);
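This call-site change, and the matching import changes in the files below, follow from the graphkb-parser bump to ^2.0.0 in package.json: the 1.x namespaced API is replaced by top-level functions. A minimal before/after sketch of the pattern as used in this diff (the input string is an arbitrary example):

const { parseVariant, jsonifyVariant } = require('@bcgsc-pori/graphkb-parser');

// 1.x: const parsed = kbParser.variant.parse('p.G12D', false).toJSON();
// 2.x: parse first, then serialize the variant object to plain JSON:
const parsed = jsonifyVariant(parseVariant('p.G12D', false));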
4 changes: 2 additions & 2 deletions src/cancerhotspots/index.js
@@ -5,8 +5,8 @@ const fs = require('fs');

const csv = require('fast-csv');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');
const {
convertRowFields,
hashRecordToId,
4 changes: 2 additions & 2 deletions src/cgl/index.js
@@ -1,7 +1,7 @@
const fs = require('fs');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');
const {
loadDelimToJson,
hashRecordToId,
4 changes: 2 additions & 2 deletions src/civic/evidenceItem.js
@@ -3,8 +3,8 @@ const path = require('path');

const _ = require('lodash');
const Ajv = require('ajv');
-const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+const { ParsingError, ErrorMixin, InputValidationError } = require('@bcgsc-pori/graphkb-parser');
const { checkSpec, request } = require('../util');
const { logger } = require('../logging');
const { civic: SOURCE_DEFN } = require('../sources');
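The error-class imports across the civic loaders change the same way: graphkb-parser 2.x exports its error classes at the top level rather than under an `error` namespace. The pattern, as these files use it to define a local error type:

// 1.x: const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
const { ErrorMixin } = require('@bcgsc-pori/graphkb-parser');

class NotImplementedError extends ErrorMixin { }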
4 changes: 2 additions & 2 deletions src/civic/profile.js
@@ -2,8 +2,8 @@
* Introducing Molecular Profiles with CIViC GraphQL API v2.2.0
* [EvidenceItem]--(many-to-one)--[MolecularProfile]--(many-to-many)--[Variant]
*/
-const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+const { ParsingError, ErrorMixin, InputValidationError } = require('@bcgsc-pori/graphkb-parser');

class NotImplementedError extends ErrorMixin { }
const MOLECULAR_PROFILE_CACHE = new Map();
4 changes: 2 additions & 2 deletions src/civic/publication.js
@@ -1,5 +1,5 @@
-const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+const { ParsingError, ErrorMixin, InputValidationError } = require('@bcgsc-pori/graphkb-parser');
const _asco = require('../asco');
const _pubmed = require('../entrez/pubmed');

3 changes: 2 additions & 1 deletion src/civic/relevance.js
@@ -1,4 +1,5 @@
-const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+//const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+const {ParsingError, ErrorMixin, InputValidationError } = require('@bcgsc-pori/graphkb-parser');

class NotImplementedError extends ErrorMixin { }

3 changes: 2 additions & 1 deletion src/civic/variant.js
@@ -4,7 +4,8 @@ const _entrezGene = require('../entrez/gene');
const _snp = require('../entrez/snp');
const { civic: SOURCE_DEFN } = require('../sources');

-const { error: { ErrorMixin, ParsingError } } = kbParser;
+//const { error: { ErrorMixin, ParsingError } } = kbParser;
+const { ParsingError, ErrorMixin, InputValidationError } = kbParser;
class NotImplementedError extends ErrorMixin { }

const VARIANT_CACHE = new Map();
4 changes: 2 additions & 2 deletions src/cosmic/resistance.js
@@ -3,8 +3,8 @@
*/
const fs = require('fs');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');
const {
loadDelimToJson,
convertRowFields,
4 changes: 2 additions & 2 deletions src/docm/index.js
@@ -6,8 +6,8 @@
const Ajv = require('ajv');
const fs = require('fs');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');
const { checkSpec, request } = require('../util');
const {
orderPreferredOntologyTerms, rid,
2 changes: 1 addition & 1 deletion src/entrez/snp.js
@@ -1,6 +1,6 @@
const Ajv = require('ajv');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');

const { checkSpec } = require('../util');
const {