Skip to content

Commit

Permalink
Merge pull request #96 from leexgh/update-hgnc-202410
Browse files Browse the repository at this point in the history
Update grch37 HGNC, Mutation Assessor and ClinVar
  • Loading branch information
leexgh authored Jan 21, 2025
2 parents a065749 + da190ea commit 76c89aa
Show file tree
Hide file tree
Showing 18 changed files with 87,884 additions and 87,536 deletions.
1 change: 1 addition & 0 deletions .github/workflows/docker-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
branches:
- master
- demo-*
pull_request:
# Only use GitHub Actions for testing, because the Docker image with Mutation Assessor data is too big to be built and pushed.
# CircleCI builds and pushes the images to Docker Hub for every new release.
jobs:
Expand Down
41 changes: 22 additions & 19 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,31 +1,34 @@
# This base image starts up mongo
# This version needs to correspond with the helm chart version
ARG MONGODBVERSION=4.0.12
FROM bitnami/mongodb:${MONGODBVERSION} as build

# Use .dockerignore file to ignore unwanted files
# These files are used by import_mongo.sh to initialize mongo
# Creating directories as root
# Set user back to the one in base image
USER root
RUN mkdir -p /data
COPY data/ /data/
FROM bitnami/mongodb:${MONGODBVERSION}

# Define build arguments
ARG ARG_REF_ENSEMBL_VERSION
ENV REF_ENSEMBL_VERSION=${ARG_REF_ENSEMBL_VERSION}
ARG SPECIES=homo_sapiens
# Define additional annotation resources arguments
ARG MUTATIONASSESSOR=false

# Import data into mongodb
COPY scripts/import_mongo.sh /docker-entrypoint-initdb.d/
RUN /setup.sh
USER root

FROM bitnami/mongodb:${MONGODBVERSION}
COPY --from=build /bitnami/mongodb /bitnami/seed
COPY /scripts/startup.sh /startup.sh
# Create directories for scripts and data storage and copy data to directory inside the container
RUN mkdir -p /scripts /data
COPY data/ /data/
COPY scripts/startup.sh /scripts/

USER root
RUN chown -R 1001 /bitnami/seed
# Make all scripts in the /scripts directory executable
RUN chmod +x /scripts/*.sh

# Change ownership of the /data directory and its contents to non-root user 1001
RUN chown -R 1001 /data

# Switch to the non-root user
USER 1001

CMD [ "/startup.sh" ]
# Copy the MongoDB initialization script into the /docker-entrypoint-initdb.d/ directory
# This directory is automatically scanned and executed by MongoDB during the first database initialization
COPY scripts/import_mongo.sh /docker-entrypoint-initdb.d/

# Set the default command to execute the custom startup script when the container runs
# The startup script arranges the setup and start of MongoDB
CMD [ "/scripts/startup.sh" ]
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,28 @@ For R there is only the dependency on the biomaRt library.
R -e "source('https://bioconductor.org/biocLite.R'); biocLite('biomaRt')"
```

#### Updating versions
##### data/<ref_genome_ensembl_version>/input
Delete the old input files; new files will be generated when running the `Makefile`.
If you want to update the data to a specific version, check the [Ensembl Archives](https://useast.ensembl.org/info/website/archives/index.html) to find the corresponding URL and replace `host_url` in `retrieve_biomart_tables.R`. Please note that only GRCh38 supports versioned URLs; GRCh37 can only point to `https://grch37.ensembl.org/`, which always uses the latest version.

##### OncoKB isoform overrides
Delete the old files; new files will be downloaded by `download_oncokb_isoform_overrides.py` when running the `Makefile`. Please make sure to set `ONCOKB_VERSION=` in the `Makefile`. This version also needs to be updated in `/data/common_input/version_info.txt`.

##### Clinvar
Delete the old input and output files; new files can be generated by manually running:
```
make clinvar/input/clinvar_grch37_input.vcf.gz
make clinvar/export/clinvar_grch37.txt.gz
make clinvar/input/clinvar_grch38_input.vcf.gz
make clinvar/export/clinvar_grch38.txt.gz
```
The latest version date can be found at [https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/) and [https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/). Please make sure to provide the correct `CLINVAR_VERSION=` in the `Makefile` (for example: `CLINVAR_VERSION=20250106`). The new version number also needs to be added to `/data/common_input/version_info.txt`.

##### HGNC
`/data/common_input/hgnc_xxxx.txt` should be kept in sync with [https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt](https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt). When updating the HGNC file, download the new file from GitHub and rename it to `hgnc_xxxx.txt` (`xxxx` indicates the version, for example `v2024.10.1`). Please also update the file name in the `Makefile` wherever the file is used, and update the version in `/data/common_input/version_info.txt`.

#### Running
Run the import pipeline using the command below. This will take a few hours to complete.
```bash
Expand Down
8 changes: 4 additions & 4 deletions data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ $(TMP_DIR)/ensembl_transcript_info.txt: $(TMP_DIR)/$(SPECIES).gff3.gz
python ../scripts/transform_gff_to_tsv_for_exon_info_from_ensembl.py $^ $@

# Add HGNC symbols, exons, UTRs, PFAM domains and Uniprot id to Ensembl Transcript
$(TMP_DIR)/ensembl_biomart_transcripts.json.gz: $(TMP_DIR)/ensembl_biomart_transcripts.txt $(TMP_DIR)/ensembl_transcript_info.txt $(VERSION)/input/ensembl_biomart_pfam.txt $(VERSION)/input/ensembl_biomart_refseq.txt $(VERSION)/input/ensembl_biomart_ccds.txt uniprot/export/$(VERSION)_enst_to_uniprot_mapping_id.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/hgnc_complete_set_2023-10.txt
$(TMP_DIR)/ensembl_biomart_transcripts.json.gz: $(TMP_DIR)/ensembl_biomart_transcripts.txt $(TMP_DIR)/ensembl_transcript_info.txt $(VERSION)/input/ensembl_biomart_pfam.txt $(VERSION)/input/ensembl_biomart_refseq.txt $(VERSION)/input/ensembl_biomart_ccds.txt uniprot/export/$(VERSION)_enst_to_uniprot_mapping_id.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/hgnc_v2024.10.1.txt
python ../scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py $^ $@

# for mouse a specific recipe without overrides
Expand All @@ -169,9 +169,9 @@ $(TMP_DIR)/ensembl_biomart_transcripts_mouse.json.gz: $(TMP_DIR)/ensembl_biomart
# Give the default/canonical gene id/transcript for a given Hugo symbol. This
# takes about 50m to run (TODO: this can be easily optimized).
# isoform_overrides_genome_nexus.txt is made for Genome Nexus; the other files are generated for vcf2maf.
# Please note: we should keep hgnc_complete_set_2023-10 in sync with https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt
# Please note: we should keep hgnc_v2024.10.1.txt in sync with https://github.com/cBioPortal/datahub-study-curation-tools/blob/master/gene-table-update/build-input-for-importer/hgnc_complete_set.txt
# isoform_overrides_oncokb_grch3*.txt is a list of OncoKB transcripts and genes, it's generated by download_oncokb_isoform_overrides.py
$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_hgnc.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/hgnc_complete_set_2023-10.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(ONCOKB_ISOFORM_OVERRIDES_FILE_NAME) common_input/ignored_genes.txt
$(TMP_DIR)/ensembl_biomart_canonical_transcripts_per_hgnc.txt: $(TMP_DIR)/ensembl_canonical_data.txt common_input/hgnc_v2024.10.1.txt common_input/isoform_overrides_uniprot.txt common_input/$(MSKCC_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(GENOME_NEXUS_ISOFORM_OVERRIDES_FILE_NAME) common_input/$(ONCOKB_ISOFORM_OVERRIDES_FILE_NAME) common_input/ignored_genes.txt
python ../scripts/make_one_canonical_transcript_per_gene.py $^ $@

# mouse version. A different script is called that set the canonicals based on Ensembl lookup.
Expand Down Expand Up @@ -220,7 +220,7 @@ common_input/isoform_overrides_oncokb_grch38.txt:

# ClinVar version
# The latest version date number can be found on https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/ and https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/
CLINVAR_VERSION=20230722
CLINVAR_VERSION=20250106
# download GRCh37 ClinVar VCF file from NCBI
clinvar/input/clinvar_grch37_input.vcf.gz:
curl "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar_${CLINVAR_VERSION}.vcf.gz" > $@
Expand Down
Loading

0 comments on commit 76c89aa

Please sign in to comment.