Merge pull request #96 from leexgh/update-hgnc-202410

Update grch37 HGNC, Mutation Assessor and ClinVar
genome-nexus · Jan 21, 2025 · 76c89aa · 76c89aa
2 parents a065749 + da190ea
commit 76c89aa
Show file tree

Hide file tree

Showing 18 changed files with 87,884 additions and 87,536 deletions.
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
@@ -4,6 +4,7 @@ on:
     branches: 
       - master
       - demo-*
+  pull_request:
 # Only use GitHub Actions for testing, because the Docker image with Mutation Assessor data is too big to be built and pushed.
 # CircleCI builds and pushes the images to Docker Hub for every new release.
 jobs:

diff --git a/Dockerfile b/Dockerfile
@@ -1,31 +1,34 @@
-# This base image starts up mongo
-# This version needs to correspond with the helm chart version
 ARG MONGODBVERSION=4.0.12
-FROM bitnami/mongodb:${MONGODBVERSION} as build
 
-# Use .dockerignore file to ignore unwanted files
-# These files are used by import_mongo.sh to initialize mongo
-# Creating directories as root
-# Set user back to the one in base image
-USER root
-RUN mkdir -p /data
-COPY data/ /data/
+FROM bitnami/mongodb:${MONGODBVERSION}
 
+# Define build arguments
 ARG ARG_REF_ENSEMBL_VERSION
 ENV REF_ENSEMBL_VERSION=${ARG_REF_ENSEMBL_VERSION}
 ARG SPECIES=homo_sapiens
+# Define additional annotation resources arguments
 ARG MUTATIONASSESSOR=false
 
-# Import data into mongodb
-COPY scripts/import_mongo.sh /docker-entrypoint-initdb.d/
-RUN /setup.sh
+USER root
 
-FROM bitnami/mongodb:${MONGODBVERSION}
-COPY --from=build /bitnami/mongodb /bitnami/seed
-COPY /scripts/startup.sh /startup.sh
+# Create directories for scripts and data storage and copy data to directory inside the container
+RUN mkdir -p /scripts /data
+COPY data/ /data/
+COPY scripts/startup.sh /scripts/
 
-USER root
-RUN chown -R 1001 /bitnami/seed
+# Make all scripts in the /scripts directory executable
+RUN chmod +x /scripts/*.sh
+
+# Change ownership of the /data directory and its contents to non-root user 1001
+RUN chown -R 1001 /data
+
+# Switch to the non-root user
 USER 1001
 
-CMD [ "/startup.sh" ]
+# Copy the MongoDB initialization script into the /docker-entrypoint-initdb.d/ directory
+# This directory is automatically scanned and executed by MongoDB during the first database initialization
+COPY scripts/import_mongo.sh /docker-entrypoint-initdb.d/
+
+# Set the default command to execute the custom startup script when the container runs
+# The startup script arranges the setup and start of MongoDB
+CMD [ "/scripts/startup.sh" ]
diff --git a/README.md b/README.md
@@ -60,6 +60,28 @@ For R there is only the dependency on the biomaRt library.
 R -e "source('https://bioconductor.org/biocLite.R'); biocLite('biomaRt')"
 ```
 
+#### Updating versions
+##### data/<ref_genome_ensembl_version>/input
+Delete old input files, new files will be generated while running `Makefile`. 
+If you want to update data to a specific version, check [Ensembl Archives](https://useast.ensembl.org/info/website/archives/index.html) to find the corresponding URL and replace `host_url` in `retrieve_biomart_tables.R`. Please note that only GRCh38 supports versioned URLs; GRCh37 can only point to `https://grch37.ensembl.org/`, which always uses the latest version.
+
+##### OncoKB isoform overrides
+Delete old files; new files will be downloaded by `download_oncokb_isoform_overrides.py` while running `Makefile`. Please make sure to set `ONCOKB_VERSION=` in `Makefile`. This version also needs to be updated in `/data/common_input/version_info.txt`.
+
+##### Clinvar
+Delete old input and output files; new files can be generated by manually running:
+```
+make clinvar/input/clinvar_grch37_input.vcf.gz
+make clinvar/export/clinvar_grch37.txt.gz
+make clinvar/input/clinvar_grch38_input.vcf.gz
+make clinvar/export/clinvar_grch38.txt.gz
+
+``` 
+The latest version date number can be found on [https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/) and [https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/). Please make sure to provide the correct `CLINVAR_VERSION=` in `Makefile` (for example: `CLINVAR_VERSION=20250106`). This new version number also needs to be added to `/data/common_input/version_info.txt`.
+
+##### HGNC
+`/data/common_input/hgnc_xxxx.txt` should be kept in sync with [https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt](https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt). When updating the HGNC file, download the new file from GitHub and rename it to `hgnc_xxxx.txt` (`xxxx` indicates the version, for example `v2024.10.1`). Please also update this new file name in `Makefile` wherever the file is used, and update the version in `/data/common_input/version_info.txt`.
+
 #### Running
 Run the import pipeline using the command below. This will take a few hours to complete.
 ```bash

diff --git a/data/Makefile b/data/Makefile
@@ -159,7 +159,7 @@ $(TMP_DIR)/ensembl_transcript_info.txt: $(TMP_DIR)/$(SPECIES).gff3.gz
 	python ../scripts/transform_gff_to_tsv_for_exon_info_from_ensembl.py $^ $@
 
 # Add HGNC symbols, exons, UTRs, PFAM domains and Uniprot id to Ensembl Transcript
-$(TMP_DIR)/ensembl_biomart_transcripts.json.gz: $(TMP_DIR)/ensembl_biomart_transcripts.txt $(TMP_DIR)/ensembl_transcript_info.txt $(VERSION)/input/ensembl_biomart_pfam.txt $(VERSION)/input/ensembl_biomart_refseq.txt $(VERSION)/input/ensembl_biomart_ccds.txt uniprot/export/$(VERSION)_enst_to_uniprot_mapping_id.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/hgnc_complete_set_2023-10.txt
+$(TMP_DIR)/ensembl_biomart_transcripts.json.gz: $(TMP_DIR)/ensembl_biomart_transcripts.txt $(TMP_DIR)/ensembl_transcript_info.txt $(VERSION)/input/ensembl_biomart_pfam.txt $(VERSION)/input/ensembl_biomart_refseq.txt $(VERSION)/input/ensembl_biomart_ccds.txt uniprot/export/$(VERSION)_enst_to_uniprot_mapping_id.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/hgnc_v2024.10.1.txt
 	python ../scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py $^ $@
 
 # for mouse a specific recipe without overrides
@@ -169,9 +169,9 @@ $(TMP_DIR)/ensembl_biomart_transcripts_mouse.json.gz: $(TMP_DIR)/ensembl_biomart
 # give default/canonical geneid/transcript based on given hugo symbol takes
 # about 50m to run (TODO: this can be easily optimized)
 # isoform_overrides_genome_nexus.txt is made for genome nexus, others files are generated for vcf2maf
-# Please note: we should keep hgnc_complete_set_2023-10 in sync with https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt
+# Please note: we should keep hgnc_v2024.10.1.txt in sync with https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt
 # isoform_overrides_oncokb_grch3*.txt is a list of OncoKB transcripts and genes, it's generated by download_oncokb_isoform_overrides.py
-$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_hgnc.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/hgnc_complete_set_2023-10.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(ONCOKB_ISOFORM_OVERRIDES_FILE_NAME) common_input/ignored_genes.txt
+$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_hgnc.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/hgnc_v2024.10.1.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(ONCOKB_ISOFORM_OVERRIDES_FILE_NAME) common_input/ignored_genes.txt
 	python ../scripts/make_one_canonical_transcript_per_gene.py $^ $@
 
 # mouse version. A different script is called that set the canonicals based on Ensembl lookup.
@@ -220,7 +220,7 @@ common_input/isoform_overrides_oncokb_grch38.txt:
 
 # ClinVar version
 # The latest version date number can be found on https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/ and https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/
-CLINVAR_VERSION=20230722
+CLINVAR_VERSION=20250106
 # download GRCh37 ClinVar VCF file from NCBI
 clinvar/input/clinvar_grch37_input.vcf.gz:
 	curl "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar_${CLINVAR_VERSION}.vcf.gz" > $@