Merge pull request #47 from jfnavarro/dev

Merge from dev
jfnavarro · Apr 13, 2021 · 9480e6a · 9480e6a
2 parents ee572ff + 4259fd4
commit 9480e6a
Show file tree

Hide file tree

Showing 17 changed files with 439 additions and 746 deletions.
diff --git a/INSTALL.txt b/INSTALL.txt
@@ -3,52 +3,38 @@
 # Install Anaconda if you have not done it already
 
 mkdir -p ~/shared
-cd ~/shared
 
 # Create environment with all the tools
 conda env create -f environment.yml
 
+# Activate environment 
+conda activate hla
+
 # Download mhcflurry models
 mhcflurry-downloads fetch models_class1_presentation
 
-cd ~/shared
+# Download and install the VEP cache (make sure to use the same version as the installed VEP)
+# Note that this installs the cache in the default location for VEP. You can instead place the cache folder
+# elsewhere and point to it with the --vep-cache parameter of dna_pipeline.py and rna_pipeline.py
+mkdir -p $HOME/.vep
+cd $HOME/.vep
+curl -O http://ftp.ensembl.org/pub/release-102/variation/indexed_vep_cache/homo_sapiens_vep_102_GRCh38.tar.gz
+tar xzf homo_sapiens_vep_102_GRCh38.tar.gz
+
 # Install Strelka (Conda version is only compatible with Python 2.7)
+cd ~/shared
 wget https://github.com/Illumina/strelka/releases/download/v2.9.10/strelka-2.9.10.centos6_x86_64.tar.bz2
 tar -xjf strelka-2.9.10.centos6_x86_64.tar.bz2
 mv strelka-2.9.10.centos6_x86_64 ~/shared/strelka
 
-# Install Annovar. Download link may be expired (email the developer for new link if expired)
-cd ~/shared
-wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz
-tar -xvf annovar.latest.tar.gz
-
-# Download GRCh38 references for Annovar
-cd annovar
-./annotate_variation.pl -downdb -buildver hg38 -webfrom annovar knownGene humandb/
-./annotate_variation.pl -downdb -buildver hg38 -webfrom annovar ensGene humandb/
-./annotate_variation.pl -downdb -buildver hg38 -webfrom annovar refGene humandb/
-./annotate_variation.pl -downdb -buildver hg38 -webfrom annovar gnomad211_exome humandb/
-./annotate_variation.pl -downdb -buildver hg38 -webfrom annovar avsnp150 humandb/
-./annotate_variation.pl -downdb -buildver hg38 -webfrom annovar cosmic70 humandb/
-
-# Download hg19 references for Annovar
-./annotate_variation.pl -downdb -buildver hg19 -webfrom annovar knownGene humandb/
-./annotate_variation.pl -downdb -buildver hg19 -webfrom annovar ensGene humandb/
-./annotate_variation.pl -downdb -buildver hg19 -webfrom annovar refGene humandb/
-./annotate_variation.pl -downdb -buildver hg19 -webfrom annovar gnomad211_exome humandb/
-./annotate_variation.pl -downdb -buildver hg19 -webfrom annovar avsnp150 humandb/
-./annotate_variation.pl -downdb -buildver hg19 -webfrom annovar cosmic70 humandb/
-
 # Install GATK3 from archives
 cd ~/shared
 gsutil cp gs://gatk-software/package-archive/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2 .
 tar xjf GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2
 mv GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar gatk3.8.jar
 
-# Create hg38 and hg19 dictionaries following the steps in the notebooks located in /dictionaries
-
-cd ~
 # Clone the pipeline and install it
+cd ~
 git clone https://github.com/jfnavarro/hla_pipeline.git
 cd hla_pipeline
 cp shared/* ~/shared/

diff --git a/README.md b/README.md
@@ -15,14 +15,14 @@ There are 2 pipelines and 2 tools:
 **dna_pipeline.py** processes DNA data and generates a list of unified
 filtered and annotated somatic variants. 
 The variant callers are Mutect2, Strelka2, Varscan and SomaticSniper and both indels and SNPs are
-reported. Annotation is performed using Annovar. 
+reported. Annotation is performed using VEP. 
 The pipeline uses trim-galore to trim, bwa-mem to align and follows GATK4 best practices. 
 The pipeline also performs HLA predictions with OptiType (tumor and normal).
 QC is performed with FastQC and BamQC.
 
 **rna_pipeline.py** processes RNA data and generates a list of unified
 annotated somatic variants (weak filtered) and also a list of gene counts values. 
-The variant callers used are Varscan and HaplotypeCaller. Annotation is performed with Annovar.
+The variant callers used are Varscan and HaplotypeCaller. Annotation is performed with VEP.
 The pipeline uses trim-galore to trim, STAR to align and follows GATK4 best practices. 
 The pipeline also performs HLA predictions with OptiType.
 The gene counts values are computed with featureCounts.
@@ -35,20 +35,14 @@ are created for each of the variants somatic effects. The user can define
 the values of the filters for both dna and rna variants. 
 
 **mhc_predict.py** can take the file generated with merge_results.py and the HLA files
-generated in the DNA and/or RNA pipelines and then generate a list of predicted neo-antigens.
+generated in the DNA and/or RNA pipelines and then generate a list of predicted neo-antigens
+with binding affinity scores.
 Variants are filtered by certain criteria and only the most common alleles for each HLA class 1
 are used. 
 
 Each tool/pipeline uses a command line interface with parameters which
 can be shown and described with --help.
 
-## cDNA and Peptides dictionaries
-merge_results.py requires two dictionaries, one mapping transcript ids to DNA sequences and another
-one mapping transcript ids to peptide sequences. The format is the following for both files:
-
-TRANSCRIPT_ID:SEQUENCE 
-
-To build these dictionaries you can use as reference the Jupyter Notebooks located in dictionaries
 
 ## Requirements
 We strongly recommend using Anaconda or Miniconda; otherwise you may need to create aliases
@@ -114,6 +108,12 @@ Other files:
   - protein_sequences_mu.fasta
   - protein_sequences_wt.fasta
 
+## Authors
+Jose Fernandez Navarro <[email protected]>
+
+## Contributors
+Jonatan Gonzalez <[email protected]>
+
 ## Contact
 Contact: Jose Fernandez Navarro <[email protected]>
 

diff --git a/REFERENCES.txt b/REFERENCES.txt
@@ -1,4 +1,4 @@
-# Instructions on how to install references for GRCh38 y hg19
+# Instructions on how to install references for GRCh38
 
 # Install GSUTIL to download references
 # You need to create a .boto config file by running `gsutil config`, and then disable the integrity check by setting the line: check_hashes = if_fast_else_skip
@@ -21,21 +21,3 @@ mkdir -p STARIndex
 STAR --runMode genomeGenerate --runThreadN 20 --genomeDir STARIndex \
   --genomeFastaFiles Homo_sapiens_assembly38.fasta \
   --sjdbGTFfile gencode.v34.primary_assembly.annotation.gtf
-
-cd ~/shared
-# Download references for GRCh38 and build STAR index
-mkdir -p hg19
-cd hg19
-gsutil -m cp -r gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta* .
-gsutil -m cp -r gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict* .
-gsutil -m cp -r gs://gcp-public-data--broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.vcf.gz* .
-gsutil -m cp -r gs://gcp-public-data--broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz* .
-gsutil -m cp -r gs://gcp-public-data--broad-references/hg19/v0/dbsnp_138.b37.vcf.gz* .
-gsutil -m cp -r gs://gatk-best-practices/somatic-b37/af-only-gnomad.raw.sites.vcf* .
-gsutil -m cp -r gs://gatk-best-practices/somatic-b37/Mutect2-exome-panel.vcf* .
-wget ftp://ftp.ensembl.org/pub/grch37/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz
-gunzip Homo_sapiens.GRCh37.87.gtf.gz
-mkdir -p STARIndex
-STAR --runMode genomeGenerate --runThreadN 20 --genomeDir STARIndex \
-  --genomeFastaFiles Homo_sapiens_assembly19.fasta \
-  --sjdbGTFfile Homo_sapiens.GRCh37.87.gtf
diff --git a/RUN.txt b/RUN.txt
@@ -1,6 +1,5 @@
 # You may want to add these to your .bashrc or .bash_profile
 export STRELKA_PATH="$HOME/shared/strelka"
-export ANNOVAR_PATH="~/shared/annovar"
 export GATK3_PATH="$HOME/shared/gatk3.8.jar"
 
 conda activate hla
@@ -15,28 +14,12 @@ dna_pipeline.py \
   --germline ~/shared/GRCh38/af-only-gnomad.hg38.vcf.gz \
   --pon ~/shared/GRCh38/1000g_pon.hg38.vcf.gz \
   --threads 20 \
-  --annovar-db humandb \
-  --annovar-version hg38 \
+  --vep-db GRCh38 \
+  --vep-version 102 \
   --hla-fasta ~/shared/hla_reference/hla_reference_dna.fasta \
   NormalR1.fastq.gz NormalR2.fastq.gz \
   TumorR1.fastq.gz TumorR2.fastq.gz
 
-dna_pipeline.py \
-  --genome ~/shared/GRCh38/Homo_sapiens_assembly38.fasta \
-  --sample G001 \
-  --outdir outcfdna \
-  --known1 ~/shared/GRCh38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
-  --known2 ~/shared/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz \
-  --snpsites ~/shared/GRCh38/Homo_sapiens_assembly38.dbsnp138.vcf \
-  --germline ~/shared/GRCh38/af-only-gnomad.hg38.vcf.gz \
-  --pon ~/shared/GRCh38/1000g_pon.hg38.vcf.gz \
-  --threads 20 \
-  --annovar-db humandb \
-  --annovar-version hg38 \
-  --hla-fasta ~/shared/hla_reference/hla_reference_dna.fasta \
-  NormalR1.fastq.gz NormalR2.fastq.gz \
-  cfDNAR1.fastq.gz cfDNAR2.fastq.gz
-
 rna_pipeline.py \
   --genome ~/shared/GRCh38/Homo_sapiens_assembly38.fasta \
   --genome-star ~/shared/GRCh38/STARIndex \
@@ -47,21 +30,20 @@ rna_pipeline.py \
   --known2 ~/shared/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz \
   --snpsites ~/shared/GRCh38/Homo_sapiens_assembly38.dbsnp138.vcf \
   --threads 20 \
-  --annovar-db humandb \
-  --annovar-version hg38 \
+  --vep-db GRCh38 \
+  --vep-version 102 \
   --hla-fasta ~/shared/hla_reference/hla_reference_rna.fasta \
   RNAR1.fastq.gz RNAR2.fastq.gz
 
 merge_results.py \
-  --dictcDNA ~/shared/GRCh38/hg38_cDNA_DICT.dict \
-  --dictAA ~/shared/GRCh38/hg38_AA_DICT.dict \
-  --dna outtumor/annotated.hg38_multianno.vcf outcfdna/annotated.hg38_multianno.vcf\
-  --dna-names Tumor cfDNA
+  --dna outtumor/annotated.hg38_multianno.vcf \
+  --dna-names Tumor \
   --rna outrna/annotated.hg38_multianno.vcf \
   --rna-names RNA \
-  --rna-counts outrna/gene.counts'
+  --rna-counts outrna/gene.counts \
+  --ensembl-version 102
 
 mhc_predict.py \
-  --hla outtumor/Tumor_hla_genotype.tsv outcfdna/Tumor_hla_genotype.tsv outrna/hla_genotype.tsv \
+  --hla outtumor/Tumor_hla_genotype.tsv outrna/hla_genotype.tsv \
   --variants overlap_final.txt \
   --alleles ~/shared/alleles.txt
diff --git a/diagram.png b/diagram.png
diff --git a/dictionaries/hg19_created_db.ipynb b/dictionaries/hg19_created_db.ipynb