diff --git a/Dockerfile.snakemake b/Dockerfile.snakemake
index d1593bbd..16c26bf9 100644
--- a/Dockerfile.snakemake
+++ b/Dockerfile.snakemake
@@ -1,7 +1,9 @@
-FROM node:16 as node_stage
+FROM node:17 AS node_stage
 WORKDIR /usr/src/app
 # Bundle app source
 COPY package*.json ./
+RUN rm -rf node_modules
+RUN npm install
 RUN npm ci --only=production
 # COPY everything not in dockerignore file
 COPY . .
@@ -33,6 +35,10 @@ RUN chmod 777 /usr/src/app/snakemake_logs
 # Copy the Snakefile to the working directory (assuming it's already in the build context)
 COPY . .

+# Native addons still hit Node version (NODE_MODULE_VERSION) errors; a rebuild is quick and helps.
+RUN npm rebuild
+
 # Run the Snakefile using Snakemake
-CMD [ "sh", "-c", "snakemake --debug -j 1 --config gkb_url=$GKB_URL gkb_user=$GKB_USER gkb_pass=$GKB_PASS --until $GKB_LOADER"]
+#CMD [ "sh", "-c", "snakemake --debug -j 1 --config gkb_url=$GKB_URL gkb_user=$GKB_USER gkb_pass=$GKB_PASS --until $GKB_LOADER"]
+CMD [ "sh", "-c", "snakemake --debug -j 1 --config gkb_url=$GKB_URL gkb_user=$GKB_USER gkb_pass=$GKB_PASS"]
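The npm rebuild step exists because the node:16 to node:17 bump changes the ABI for native addons (node-expat, libxmljs), which otherwise fail at require time with NODE_MODULE_VERSION errors. A minimal smoke test for the built image, as a sketch (addon-check.js is a hypothetical helper, not part of this change):

    // addon-check.js: require each native addon; an ABI mismatch throws
    // "was compiled against a different Node.js version" right here.
    ['node-expat', 'libxmljs'].forEach((name) => {
        require(name);
        console.log(`${name} loaded OK on node ${process.version}`);
    });

Running it inside the image (docker run IMAGE node addon-check.js) confirms the rebuild took.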
diff --git a/Snakefile b/Snakefile
index bb9c5ab0..deb757f1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,7 +1,6 @@
 import os
-from textwrap import dedent

-CONTAINER = 'docker://bcgsc/pori-graphkb-loader:v6.4.0'
+CONTAINER = 'bcgsc/pori-graphkb-loader'
 DATA_DIR = 'snakemake_data'
 LOGS_DIR = 'snakemake_logs'
@@ -57,72 +56,64 @@ rule all:
 rule download_ncit:
     output: f'{DATA_DIR}/ncit/Thesaurus.txt',
-    shell: dedent(f'''\
-        cd {DATA_DIR}/ncit
-        wget https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus.FLAT.zip
-        unzip Thesaurus.FLAT.zip
-        rm Thesaurus.FLAT.zip
-        rm -rf __MACOSX''')
+    shell: f'''
+        mkdir -p {DATA_DIR}/ncit
+        curl https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus.FLAT.zip | zcat > {DATA_DIR}/ncit/Thesaurus.txt
+        rm -rf {DATA_DIR}/ncit/__MACOSX'''

 rule download_ncit_fda:
     output: f'{DATA_DIR}/ncit/FDA-UNII_NCIt_Subsets.txt'
-    shell: dedent(f'''\
+    shell: f'''
         cd {DATA_DIR}/ncit
-        wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt''')
+        wget https://evs.nci.nih.gov/ftp1/FDA/UNII/FDA-UNII_NCIt_Subsets.txt'''

 rule download_ensembl:
     output: f'{DATA_DIR}/ensembl/biomart_export.tsv'
-    shell: dedent(f'''\
+    shell: f'''
         cd {DATA_DIR}/ensembl
         query_string=''
         wget -O biomart_export.tsv "http://www.ensembl.org/biomart/martservice?query=$query_string"
-        ''')
+        '''

 rule download_fda_srs:
     output: f'{DATA_DIR}/fda/UNII_Records.txt'
-    shell: dedent(f'''\
-        cd {DATA_DIR}/fda
-        wget https://precision.fda.gov/uniisearch/archive/latest/UNII_Data.zip
-        unzip UNII_Data.zip
-        rm UNII_Data.zip
-
-        mv UNII*.txt UNII_Records.txt
-        ''')
+    shell: f'''
+        curl -L --create-dirs -o {DATA_DIR}/fda/UNII_Data.zip https://precision.fda.gov/uniisearch/archive/latest/UNII_Data.zip
+        unzip -o -d {DATA_DIR}/fda {DATA_DIR}/fda/UNII_Data.zip
+        rm {DATA_DIR}/fda/UNII_Data.zip
+        mv {DATA_DIR}/fda/UNII*.txt {DATA_DIR}/fda/UNII_Records.txt
+        '''

 rule download_refseq:
     output: f'{DATA_DIR}/refseq/LRG_RefSeqGene.tab'
-    shell: dedent(f'''\
+    shell: f'''
         cd {DATA_DIR}/refseq
         wget -O LRG_RefSeqGene.tab ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
-        ''')
+        '''

 rule download_uberon:
     output: f'{DATA_DIR}/uberon/uberon.owl'
-    shell: dedent(f'''\
-        cd {DATA_DIR}/uberon
-        wget http://purl.obolibrary.org/obo/uberon.owl
-        ''')
+    shell: f'''
+        curl -L --create-dirs -o {DATA_DIR}/uberon/uberon.owl https://github.com/obophenotype/uberon/releases/latest/download/uberon.owl
+        '''

 rule download_do:
     output: f'{DATA_DIR}/do/doid.json'
-    shell: dedent(f'''\
-        cd {DATA_DIR}/do;
-        REPO=https://github.com/DiseaseOntology/HumanDiseaseOntology.git;
-        LATEST=$(git ls-remote $REPO --tags v\\* | cut -f 2 | sed 's/refs\\/tags\///' | grep '\\bv[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\\b' | sort -d | tail -n 1)
-        echo $LATEST
-        wget https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/$LATEST/src/ontology/doid.json
-        ''')
+    shell: f'''
+        curl --create-dirs -o {DATA_DIR}/do/doid.json https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/refs/heads/main/src/ontology/doid.json
+        '''
+
 rule download_drugbank:
     output: f'{DATA_DIR}/drugbank/full_database.xml'
-    shell: dedent(f'''\
+    shell: f'''
         cd {DATA_DIR}/drugbank
         wget https://www.drugbank.ca/releases
         latest=$(grep 'href="/releases/[^"]*"' -o releases | cut -f 3 -d/ | sed 's/"//' | sort -V | tail -n 2 | head -n 1)
@@ -131,86 +122,77 @@ rule download_drugbank:
         curl -Lfv -o ${{filename}}.zip -u {DRUGBANK_EMAIL}:{DRUGBANK_PASSWORD} https://go.drugbank.com/releases/5-1-8/downloads/all-full-database
         unzip ${{filename}}.zip
-        mv full\ database.xml full_database.xml''')
-
+        mv full\ database.xml full_database.xml'''

 rule download_PMC4468049:
     output: f'{DATA_DIR}/PMC4468049/NIHMS632238-supplement-2.xlsx'
-    shell: dedent(f'''\
-        cd {DATA_DIR}/PMC4468049
-        wget https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4468049/bin/NIHMS632238-supplement-2.xlsx
-        ''')
+    shell: f''' curl --create-dirs -o {DATA_DIR}/PMC4468049/NIHMS632238-supplement-2.xlsx https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4468049/bin/NIHMS632238-supplement-2.xlsx'''

 rule download_PMC4232638:
     output: f'{DATA_DIR}/PMC4232638/13059_2014_484_MOESM2_ESM.xlsx'
-    shell: dedent(f'''\
-        cd {DATA_DIR}/PMC4232638
-        wget https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4232638/bin/13059_2014_484_MOESM2_ESM.xlsx
-        ''')
-
+    shell: f''' curl --create-dirs -o {DATA_DIR}/PMC4232638/13059_2014_484_MOESM2_ESM.xlsx https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4232638/bin/13059_2014_484_MOESM2_ESM.xlsx'''

 rule download_cgi:
     output: f'{DATA_DIR}/cgi/cgi_biomarkers_per_variant.tsv'
-    shell: dedent(f'''\
-        cd {DATA_DIR}/cgi
-        wget https://www.cancergenomeinterpreter.org/data/biomarkers/cgi_biomarkers_20180117.zip
-        unzip cgi_biomarkers_20180117.zip
-        ''')
+    shell: f'''
+        curl --create-dirs -o {DATA_DIR}/cgi/cgi_biomarkers.zip https://www.cancergenomeinterpreter.org/data/biomarkers/cgi_biomarkers_20180117.zip
+        unzip -d {DATA_DIR}/cgi {DATA_DIR}/cgi/cgi_biomarkers.zip
+        '''

 rule download_local_data:
     output: f'{DATA_DIR}/local/{{local}}.json'
-    shell: dedent(f'''\
+    shell: f'''
         cd {DATA_DIR}/local
         wget {GITHUB_DATA}/{{wildcards.local}}.json
-        ''')
+        '''

 rule download_cancerhotspots:
     output: f'{DATA_DIR}/cancerhotspots/cancerhotspots.v2.maf'
-    shell: dedent(f'''\
+    shell: f'''
         mkdir -p {DATA_DIR}/cancerhotspots
         cd {DATA_DIR}/cancerhotspots
         wget https://cbioportal-download.s3.amazonaws.com/cancerhotspots.v2.maf.gz
         gunzip cancerhotspots.v2.maf.gz
-        ''')
+        '''

 rule download_cosmic_resistance:
     output: f'{DATA_DIR}/cosmic/CosmicResistanceMutations.tsv'
-    shell: dedent(f'''
+    shell: f'''
         cd {DATA_DIR}/cosmic
         AUTH=$( echo "{COSMIC_EMAIL}:{COSMIC_PASSWORD}" | base64 )
         resp=$( curl -H "Authorization: Basic $AUTH" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v92/CosmicResistanceMutations.tsv.gz );
         url=$( node -e "var resp = $resp; console.log(resp.url);" );
         curl "$url" -o CosmicResistanceMutations.tsv.gz
         gunzip CosmicResistanceMutations.tsv.gz
-        ''')
+        '''

 rule download_cosmic_diseases:
     output: f'{DATA_DIR}/cosmic/classification.csv'
-    shell: dedent(f'''
+    shell: f'''
         cd {DATA_DIR}/cosmic
         AUTH=$( echo "{COSMIC_EMAIL}:{COSMIC_PASSWORD}" | base64 )
         resp=$( curl -H "Authorization: Basic $AUTH" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v92/classification.csv );
         url=$( node -e "var resp = $resp; console.log(resp.url);" );
         curl "$url" -o classification.csv
-        ''')
+        '''

 rule download_cosmic_fusions:
     output: f'{DATA_DIR}/cosmic/CosmicFusionExport.tsv'
-    shell: dedent(f'''
+    shell: f'''
         cd {DATA_DIR}/cosmic
         AUTH=$( echo "{COSMIC_EMAIL}:{COSMIC_PASSWORD}" | base64 )
         resp=$( curl -H "Authorization: Basic $AUTH" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v92/CosmicFusionExport.tsv.gz );
         url=$( node -e "var resp = $resp; console.log(resp.url);" );
         curl "$url" -o CosmicFusionExport.tsv.gz
         gunzip CosmicFusionExport.tsv.gz
-        ''')
+        '''

 rule load_local:
@@ -457,10 +439,10 @@ rule load_moa:
 # input isn't actually needed but it is a file-type loader, so a dummy file must be supplied
 rule download_sources:
     output: f'{DATA_DIR}/local/sources.json'
-    shell: dedent(f'''\
+    shell: f'''
         cd {DATA_DIR}/local
         touch sources.json
-        ''')
+        '''

 rule load_sources:
     input: f'{DATA_DIR}/local/sources.json'
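One caveat in the COSMIC rules: they splice the raw file_download response into node -e, which evaluates the payload as JavaScript. A safer sketch of the same URL extraction, assuming (as those rules already do) that the response is JSON with a url field; extract-url.js is a hypothetical helper:

    // extract-url.js: read the COSMIC file_download response from stdin and
    // print the signed URL; JSON.parse never executes the payload.
    let raw = '';
    process.stdin.on('data', (chunk) => { raw += chunk; });
    process.stdin.on('end', () => {
        console.log(JSON.parse(raw).url);
    });

The download line would then become url=$( curl -H "Authorization: Basic $AUTH" "$endpoint" | node extract-url.js ).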
diff --git a/package.json b/package.json
index b4b59389..44630c8a 100644
--- a/package.json
+++ b/package.json
@@ -12,11 +12,12 @@
     "private": true,
     "license": "GPL-3",
     "dependencies": {
-        "@bcgsc-pori/graphkb-parser": "^1.1.1",
-        "@bcgsc-pori/graphkb-schema": "^3.14.3",
+        "@bcgsc-pori/graphkb-parser": "^2.0.0",
+        "@bcgsc-pori/graphkb-schema": "^3.16.0",
         "ajv": "^6.10.0",
         "argparse": "^2.0.1",
         "csv-parse": "^4.6.5",
+        "expat": "^1.0.0",
         "fast-csv": "^4.3.6",
         "html-to-text": "^5.1.1",
         "http-status-codes": "^1.3.2",
@@ -24,9 +25,11 @@
         "json-stable-stringify": "^1.0.1",
         "jsonpath": "^1.1.1",
         "jsonwebtoken": "^8.5.1",
+        "libxmljs": "^1.0.11",
         "lodash": "^4.17.21",
-        "node-expat": "^2.3.18",
+        "node-expat": "^2.4.1",
         "node-fetch": "^2.6.7",
+        "node-gyp": "^10.2.0",
         "p-limit": "^3.1.0",
         "parse5": "^5.1.1",
         "rdflib": "^2.2.15",
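The graphkb-parser bump from ^1.1.1 to ^2.0.0 is what drives every src/ hunk below: v2 drops the nested variant and error namespaces in favour of named top-level exports. The call-site migration in sketch form (the p.V600E input is purely illustrative):

    const { parseVariant, jsonifyVariant } = require('@bcgsc-pori/graphkb-parser');

    // v1: const parsed = kbParser.variant.parse('p.V600E', false).toJSON();
    // v2: parsing and JSON conversion are separate named helpers
    const parsed = jsonifyVariant(parseVariant('p.V600E', false));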
diff --git a/src/PMC4232638/index.js b/src/PMC4232638/index.js
index 1f69050b..28f3de47 100644
--- a/src/PMC4232638/index.js
+++ b/src/PMC4232638/index.js
@@ -1,5 +1,5 @@
 const readXlsxFile = require('read-excel-file/node');
-const kbParser = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');

 const { logger } = require('../logging');
 const { rid } = require('../graphkb');
@@ -134,7 +134,7 @@ const uploadFile = async ({ conn, filename }) => {
         logger.info(`loading: ${row.Gene}:${row['Amino acid change']}`);

         try {
-            const parsed = kbParser.variant.parse(`p.${row['Amino acid change']}`, false).toJSON();
+            const parsed = jsonifyVariant(parseVariant(`p.${row['Amino acid change']}`, false));
             const [gene] = await _entrezGene.fetchAndLoadBySymbol(conn, row.Gene);
             const relevance = await conn.getVocabularyTerm(row.relevance);
             const evidence = await _pubmed.fetchAndLoadByIds(conn, row.evidence);
diff --git a/src/cancerhotspots/index.js b/src/cancerhotspots/index.js
index 9225c4b8..8983e752 100644
--- a/src/cancerhotspots/index.js
+++ b/src/cancerhotspots/index.js
@@ -5,8 +5,8 @@
 const fs = require('fs');

 const csv = require('fast-csv');
-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');
 const {
     convertRowFields,
     hashRecordToId,
diff --git a/src/cgl/index.js b/src/cgl/index.js
index cac1b51f..7a50bd70 100644
--- a/src/cgl/index.js
+++ b/src/cgl/index.js
@@ -1,7 +1,7 @@
 const fs = require('fs');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');
 const {
     loadDelimToJson,
     hashRecordToId,
diff --git a/src/civic/evidenceItem.js b/src/civic/evidenceItem.js
index f999f8db..2af918f9 100644
--- a/src/civic/evidenceItem.js
+++ b/src/civic/evidenceItem.js
@@ -3,8 +3,8 @@ const path = require('path');
 const _ = require('lodash');
 const Ajv = require('ajv');

-const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+const { ParsingError, ErrorMixin, InputValidationError } = require('@bcgsc-pori/graphkb-parser');
 const { checkSpec, request } = require('../util');
 const { logger } = require('../logging');
 const { civic: SOURCE_DEFN } = require('../sources');
diff --git a/src/civic/profile.js b/src/civic/profile.js
index 4d6845d8..521a7017 100644
--- a/src/civic/profile.js
+++ b/src/civic/profile.js
@@ -2,8 +2,8 @@
  * Introducing Molecular Profiles with CIViC GraphQL API v2.2.0
  * [EvidenceItem]--(many-to-one)--[MolecularProfile]--(many-to-many)--[Variant]
  */
-const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+const { ParsingError, ErrorMixin, InputValidationError } = require('@bcgsc-pori/graphkb-parser');
 class NotImplementedError extends ErrorMixin { }

 const MOLECULAR_PROFILE_CACHE = new Map();
diff --git a/src/civic/publication.js b/src/civic/publication.js
index 644111ee..2d421805 100644
--- a/src/civic/publication.js
+++ b/src/civic/publication.js
@@ -1,5 +1,5 @@
-const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+const { ParsingError, ErrorMixin, InputValidationError } = require('@bcgsc-pori/graphkb-parser');
 const _asco = require('../asco');
 const _pubmed = require('../entrez/pubmed');
diff --git a/src/civic/relevance.js b/src/civic/relevance.js
index 3c1bff1f..f0fe7608 100644
--- a/src/civic/relevance.js
+++ b/src/civic/relevance.js
@@ -1,4 +1,5 @@
-const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+//const { error: { ErrorMixin } } = require('@bcgsc-pori/graphkb-parser');
+const { ParsingError, ErrorMixin, InputValidationError } = require('@bcgsc-pori/graphkb-parser');

 class NotImplementedError extends ErrorMixin { }
diff --git a/src/civic/variant.js b/src/civic/variant.js
index 98ebee67..56842135 100644
--- a/src/civic/variant.js
+++ b/src/civic/variant.js
@@ -4,7 +4,8 @@ const _entrezGene = require('../entrez/gene');
 const _snp = require('../entrez/snp');
 const { civic: SOURCE_DEFN } = require('../sources');

-const { error: { ErrorMixin, ParsingError } } = kbParser;
+//const { error: { ErrorMixin, ParsingError } } = kbParser;
+const { ParsingError, ErrorMixin, InputValidationError } = kbParser;

 class NotImplementedError extends ErrorMixin { }

 const VARIANT_CACHE = new Map();
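The error classes in the civic modules follow the same pattern: ErrorMixin (along with ParsingError and InputValidationError) now comes straight off the package root, so the local subclasses stay as they were. A minimal sketch under that assumption:

    const { ErrorMixin } = require('@bcgsc-pori/graphkb-parser');

    // unchanged local subclass, as defined in each civic module above
    class NotImplementedError extends ErrorMixin { }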
diff --git a/src/cosmic/resistance.js b/src/cosmic/resistance.js
index 5ff8e29e..1dbcc911 100644
--- a/src/cosmic/resistance.js
+++ b/src/cosmic/resistance.js
@@ -3,8 +3,8 @@
  */
 const fs = require('fs');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');
 const {
     loadDelimToJson,
     convertRowFields,
diff --git a/src/docm/index.js b/src/docm/index.js
index 4b7d5f60..5a43bd9d 100644
--- a/src/docm/index.js
+++ b/src/docm/index.js
@@ -6,8 +6,8 @@ const Ajv = require('ajv');
 const fs = require('fs');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
-
+//const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');
 const { checkSpec, request } = require('../util');
 const {
     orderPreferredOntologyTerms, rid,
diff --git a/src/entrez/snp.js b/src/entrez/snp.js
index 6611913b..6d646b23 100644
--- a/src/entrez/snp.js
+++ b/src/entrez/snp.js
@@ -1,6 +1,6 @@
 const Ajv = require('ajv');

-const { variant: { parse: variantParser } } = require('@bcgsc-pori/graphkb-parser');
+const {parseVariant, stringifyVariant, jsonifyVariant} = require('@bcgsc-pori/graphkb-parser');

 const { checkSpec } = require('../util');
 const {