diff --git a/CONTRIBUTORS.yaml b/CONTRIBUTORS.yaml index 38dafec147264e..b3f98b06145f46 100644 --- a/CONTRIBUTORS.yaml +++ b/CONTRIBUTORS.yaml @@ -163,6 +163,10 @@ aurelienmoumbock: twitter: FMoumbock joined: 2022-02 +avahoffman: + name: Ava Hoffman + joined: 2023-06 + avani-k: name: Avani Khadilkar email: akhadilk@ucsc.edu @@ -291,6 +295,10 @@ CameronFRWatson: email: watsocam@ohsu.edu orcid: 0000-0002-6942-2469 +cansavvy: + name: Candace Savonen + joined: 2023-06 + cat-bro: name: Catherine Bromhead matrix: 'cat-bro:matrix.org' @@ -376,6 +384,10 @@ cstritt: name: Christoph Stritt joined: 2022-03 +cutsort: + name: Frederick Tan + joined: 2023-06 + d-salgado: name: David Salgado joined: 2022-10 @@ -447,6 +459,10 @@ eancelet: joined: 2021-01 elixir_node: fr +ehumph: + name: Elizabeth Humphries + joined: 2023-06 + ElectronicBlueberry: name: Laila Los joined: 2023-04 @@ -818,6 +834,10 @@ jsaintvanne: name: Julien Saint-Vanne joined: 2020-01 +jtleek: + name: Jeffrey T. Leek + joined: 2023-06 + jxtx: name: James Taylor joined: 2018-06 @@ -826,6 +846,10 @@ jxtx: His impacts on the Galaxy community, have been incredible, and his loss is keenly felt. +katherinecox: + name: Katherine Cox + joined: 2023-06 + katrinleinweber: name: Katrin Leinweber email: katrin.leinweber@tib.eu @@ -1120,6 +1144,11 @@ nagoue: orcid: 0000-0003-2750-1473 joined: 2019-07 +nakucher: + name: Natalie Kucher + email: nkucher3@jhu.edu + joined: 2023-06 + natefoo: name: Nate Coraor matrix: 'natefoo:matrix.org' @@ -1334,6 +1363,10 @@ robertmand: joined: 2021-10 elixir_node: uk +robertmeller: + name: Robert Meller + joined: 2023-06 + reginaesinamabotsi: name: Regina Esinam Abotsi joined: 2018-06 @@ -1782,6 +1815,24 @@ elixir-converge: funding_statement: | ELIXIR CONVERGE is connecting and align ELIXIR Nodes to deliver sustainable FAIR life-science data management services. 
This project has received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement № 871075 +nhgri-gdscn: + name: National Human Genome Research Institute Genomic Data Science Community Network + github: false + joined: 2023-06 + avatar: https://www.genome.gov/themes/custom/nhgri/assets/global/NHGRI-logo.svg + url: https://www.genome.gov/ + funder: true + funding_id: 75N92022P00232 + +nhgri-anvil: + name: National Human Genome Research Institute Genomic Data Science Analysis, Visualization, and Informatics Lab-Space + github: false + joined: 2023-06 + avatar: https://www.genome.gov/themes/custom/nhgri/assets/global/NHGRI-logo.svg + url: https://www.genome.gov/Funded-Programs-Projects/Computational-Genomics-and-Data-Science-Program/Genomic-Analysis-Visualization-Informatics-Lab-space-AnVIL + funder: true + funding_id: U24HG010263 + ai4life: name: AI4Life github: false @@ -1793,4 +1844,4 @@ ai4life: funding_system: cordis funding_statement: | AI4Life has received funding from the European Union’s Horizon Europe research and innovation programme under grant agreement number 101057970. 
- + diff --git a/_plugins/jekyll-duration.rb b/_plugins/jekyll-duration.rb index 497ab5ebc9fc0a..008be2883960ad 100644 --- a/_plugins/jekyll-duration.rb +++ b/_plugins/jekyll-duration.rb @@ -29,7 +29,7 @@ def duration_to_human(duration) hour = 'hour' hours = 'hours' minutes = 'minutes' - if @context.registers[:page]&.key?('lang') + if @context.registers[:page]&.key?('lang') and @context.registers[:page]['lang'] != 'en' lang = @context.registers[:page]['lang'] hour = @context.registers[:site].data['lang'][lang]['hour'] hours = @context.registers[:site].data['lang'][lang]['hours'] diff --git a/bin/lint.rb b/bin/lint.rb index 6999d5ee5a9a99..327e9a560c1e3c 100755 --- a/bin/lint.rb +++ b/bin/lint.rb @@ -108,6 +108,7 @@ def self.message(path: '', idx: 0, match_start: 0, match_end: 1, replacement: ni # Linting functions for the GTN module GtnLinter @BAD_TOOL_LINK = /{% tool (\[[^\]]*\])\(https?.*tool_id=([^)]*)\)\s*%}/i + @BAD_TOOL_LINK2 = /{% tool (\[[^\]]*\])\(https:\/\/toolshed.g2([^)]*)\)\s*%}/i def self.find_matching_texts(contents, query) contents.map.with_index do |text, idx| @@ -150,7 +151,7 @@ def self.youtube_bad(contents) def self.link_gtn_tutorial_external(contents) find_matching_texts( contents, - %r{\((https?://(training.galaxyproject.org|galaxyproject.github.io)/training-material/(.*tutorial).html)\)} + %r{\((https?://(training.galaxyproject.org|galaxyproject.github.io)/training-material/[^)]*)\)} ) .map do |idx, _text, selected| ReviewDogEmitter.error( @@ -349,7 +350,7 @@ def self.non_existent_snippet(contents) end def self.bad_tool_links(contents) - find_matching_texts(contents, @BAD_TOOL_LINK) + find_matching_texts(contents, @BAD_TOOL_LINK) + find_matching_texts(contents, @BAD_TOOL_LINK2) .map do |idx, _text, selected| ReviewDogEmitter.error( path: @path, diff --git a/topics/sequence-analysis/faqs/quality_score.md b/topics/sequence-analysis/faqs/quality_score.md new file mode 100644 index 00000000000000..557a78ef817368 --- /dev/null +++ 
b/topics/sequence-analysis/faqs/quality_score.md @@ -0,0 +1,35 @@ +--- +title: Quality Scores +area: format +box_type: details +layout: faq +contributors: [bebatut, nakucher, hexylena] +--- + +But what does this quality score mean? + +The quality score for each sequence is a string of characters, one for each base of the nucleotide sequence, used to characterize the probability of misidentification of each base. The score is encoded using the ASCII character table (with [some historical differences](https://en.wikipedia.org/wiki/FASTQ_format#Encoding)): + +To save space, the sequencer records an [ASCII character](http://drive5.com/usearch/manual/quality_score.html) to represent scores 0-42. For example 10 corresponds to "+" and 40 corresponds to "I". FastQC knows how to translate this. This is often called "Phred" scoring. + +![Encoding of the quality score with ASCII characters for different Phred encoding. The ascii code sequence is shown at the top with symbols for 33 to 64, upper case letters, more symbols, and then lowercase letters. Sanger maps from 33 to 73 while solexa is shifted, starting at 59 and going to 104. Illumina 1.3 starts at 64 and goes to 104, Illumina 1.5 is shifted three scores to the right but still ends at 104. Illumina 1.8+ goes back to the Sanger range, except one single score wider.]({{site.baseurl}}/topics/sequence-analysis/faqs/images/fastq-quality-encoding.png) + +So there is an ASCII character associated with each nucleotide, representing its [Phred quality score](https://en.wikipedia.org/wiki/Phred_quality_score), the probability of an incorrect base call: + +Phred Quality Score | Probability of incorrect base call | Base call accuracy +--- | --- | --- +10 | 1 in 10 | 90% +20 | 1 in 100 | 99% +30 | 1 in 1000 | 99.9% +40 | 1 in 10,000 | 99.99% +50 | 1 in 100,000 | 99.999% +60 | 1 in 1,000,000 | 99.9999% + + +What does 0-42 represent? These numbers, when plugged into a formula, tell us the probability of an error for that base.
This is the formula, where Q is our quality score (0-42) and P is the probability of an error: + +``` +Q = -10 log10(P) +``` + +Using this formula, we can calculate that a quality score of 40 means only 0.00010 probability of an error! diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/00-anvil-workspaces.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/00-anvil-workspaces.png new file mode 100644 index 00000000000000..d99cd90a63e785 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/00-anvil-workspaces.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/01-sars-workspace.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/01-sars-workspace.png new file mode 100644 index 00000000000000..b69e18f72e56ca Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/01-sars-workspace.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/02-clone-button.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/02-clone-button.png new file mode 100644 index 00000000000000..b61bac63d33fad Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/02-clone-button.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/03-clone-settings.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/03-clone-settings.png new file mode 100644 index 00000000000000..50cfaa15c2ba4e Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/03-clone-settings.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/04-select-galaxy.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/04-select-galaxy.png new file mode 100644 index 00000000000000..3d7050ce67c5cf Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/04-select-galaxy.png differ diff --git 
a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/05-start-galaxy.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/05-start-galaxy.png new file mode 100644 index 00000000000000..9cc0b79985651b Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/05-start-galaxy.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/06-galaxy-next.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/06-galaxy-next.png new file mode 100644 index 00000000000000..1ee24201244d22 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/06-galaxy-next.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/07-galaxy-create.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/07-galaxy-create.png new file mode 100644 index 00000000000000..95f8acb32c88b6 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/07-galaxy-create.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/08-galaxy-provisioning.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/08-galaxy-provisioning.png new file mode 100644 index 00000000000000..8a991c56520168 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/08-galaxy-provisioning.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/09-galaxy-open.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/09-galaxy-open.png new file mode 100644 index 00000000000000..5c54c229a27b47 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/09-galaxy-open.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/10-galaxy-on-anvil.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/10-galaxy-on-anvil.png new file mode 100644 index 00000000000000..943eef5f94c57d Binary files /dev/null and 
b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/10-galaxy-on-anvil.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/11-galaxy-data-workspace.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/11-galaxy-data-workspace.png new file mode 100644 index 00000000000000..2ee5a65eee36c9 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/11-galaxy-data-workspace.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/12-fastq-view.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/12-fastq-view.png new file mode 100644 index 00000000000000..782d0f5c266959 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/12-fastq-view.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/13-fastqc-report.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/13-fastqc-report.png new file mode 100644 index 00000000000000..005a5c02091aae Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/13-fastqc-report.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/14-galaxy-jbrowse-bams.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/14-galaxy-jbrowse-bams.png new file mode 100644 index 00000000000000..681a9b7e62a2d1 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/14-galaxy-jbrowse-bams.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/15-galaxy-jbrowse-ref.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/15-galaxy-jbrowse-ref.png new file mode 100644 index 00000000000000..d55b2b009a2640 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/15-galaxy-jbrowse-ref.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/16-export-history.png 
b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/16-export-history.png new file mode 100644 index 00000000000000..f9a43e701709b0 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/16-export-history.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/17-export.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/17-export.png new file mode 100644 index 00000000000000..9b3f66a298fc4e Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/17-export.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/18-export-verify.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/18-export-verify.png new file mode 100644 index 00000000000000..726dd5f7dc1d38 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/18-export-verify.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/19-galaxy-env.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/19-galaxy-env.png new file mode 100644 index 00000000000000..c902aa022b9e65 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/19-galaxy-env.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/20-galaxy-edit.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/20-galaxy-edit.png new file mode 100644 index 00000000000000..4407e3bcb1d306 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/20-galaxy-edit.png differ diff --git a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/21-galaxy-delete.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/21-galaxy-delete.png new file mode 100644 index 00000000000000..0322e15efe7966 Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/21-galaxy-delete.png differ diff --git 
a/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/22-galaxy-delete-pd.png b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/22-galaxy-delete-pd.png new file mode 100644 index 00000000000000..0bc3d9fef263ee Binary files /dev/null and b/topics/sequence-analysis/images/sars-with-galaxy-in-anvil/22-galaxy-delete-pd.png differ diff --git a/topics/sequence-analysis/tutorials/quality-control/tutorial.md b/topics/sequence-analysis/tutorials/quality-control/tutorial.md index 060d7e82b68a6f..58be8eaf8a2868 100644 --- a/topics/sequence-analysis/tutorials/quality-control/tutorial.md +++ b/topics/sequence-analysis/tutorials/quality-control/tutorial.md @@ -112,22 +112,7 @@ GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGFGGGGGGAFFGGFGG It means that the fragment named `@M00970` corresponds to the DNA sequence `GTGCCAGCCGCCGCGGTAGTCCGACGTGGCTGTCTCTTATACACATCTCCGAGCCCACGAGACCGAAGAACATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAGAAGCAAATGACGATTCAAGAAAGAAAAAAACACAGAATACTAACAATAAGTCATAAACATCATCAACATAAAAAAGGAAATACACTTACAACACATATCAATATCTAAAATAAATGATCAGCACACAACATGACGATTACCACACATGTGTACTACAAGTCAACTA` and this sequence has been sequenced with a quality `GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGFGGGGGGAFFGGFGGGGGGGGFGGGGGGGGGGGGGGFGGG+38+35*311*6,,31=******441+++0+0++0+*1*2++2++0*+*2*02*/***1*+++0+0++38++00++++++++++0+0+2++*+*+*+*+*****+0**+0**+***+)*.***1**//*)***)/)*)))*)))*),)0(((-((((-.(4(,,))).,(())))))).)))))))-))-(`. -But what does this quality score mean? - -The quality score for each sequence is a string of characters, one for each base of the nucleic sequence, used to characterize the probability of mis-identification of each base. The score is encoded using the ASCII character table (with [some historical differences](https://en.wikipedia.org/wiki/FASTQ_format#Encoding)): - -![Encoding of the quality score with ASCII characters for different Phred encoding. 
The ascii code sequence is shown at the top with symbols for 33 to 64, upper case letters, more symbols, and then lowercase letters. Sanger maps from 33 to 73 while solexa is shifted, starting at 59 and going to 104. Illumina 1.3 starts at 54 and goes to 104, Illumina 1.5 is shifted three scores to the right but still ends at 104. Illumina 1.8+ goes back to the Sanger except one single score wider. Illumina](../../../sequence-analysis/images/fastq-quality-encoding.png) - -So there is an ASCII character associated with each nucleotide, representing its [Phred quality score](https://en.wikipedia.org/wiki/Phred_quality_score), the probability of an incorrect base call: - -Phred Quality Score | Probability of incorrect base call | Base call accuracy ---- | --- | --- -10 | 1 in 10 | 90% -20 | 1 in 100 | 99% -30 | 1 in 1000 | 99.9% -40 | 1 in 10,000 | 99.99% -50 | 1 in 100,000 | 99.999% -60 | 1 in 1,000,000 | 99.9999% +{% snippet topics/sequence-analysis/faqs/quality_score.md %} > > @@ -168,7 +153,7 @@ Rather than looking at quality scores for each individual read, FASTQE looks at ![FASTQE before](../../images/quality-control/fastqe-mean-before.png "FASTQE mean scores") -You can see the score for each emoji [here](https://github.com/fastqe/fastqe#scale). The emojis below, with Phred scores less than 20, are the ones we hope we don't see much. +You can see the score for each [emoji in fastqe's documentation](https://github.com/fastqe/fastqe#scale). The emojis below, with Phred scores less than 20, are the ones we hope we don't see much. Phred Quality Score | ASCII code | Emoji --- | --- | --- @@ -310,7 +295,7 @@ It is normal with all Illumina sequencers for the median quality score to start When the median quality is below a Phred score of ~20, we should consider trimming away bad quality bases from the sequence. We will explain that process in the Trim and filter section. 
-#### Adapter Content +### Adapter Content ![Adapter Content](../../images/quality-control/adapter_content-before.png "Adapter Content") @@ -332,13 +317,13 @@ We can run an trimming tool such as Cutadapt to remove this adapter. We will exp > Take a shortcut > > The following sections go into detail about some of the other plots generated by FastQC. Note that some plots/modules may give warnings but be normal -> for the type of data you're working with, as discussed below and [here](https://rtsf.natsci.msu.edu/genomics/tech-notes/fastqc-tutorial-and-faq/). +> for the type of data you're working with, as discussed below and [in the FASTQC FAQ](https://rtsf.natsci.msu.edu/genomics/tech-notes/fastqc-tutorial-and-faq/). > The other plots give us information to more deeply understand the quality of the data, and to see if changes could be made in the lab to get higher-quality data in the future. > These sections are **optional**, and if you would like to skip these you can: > - Jump straight to the [next section](#trim-and-filter---short-reads) to learn about trimming paired-end data {: .tip} -#### Per tile sequence quality +### Per tile sequence quality This plot enables you to look at the quality scores from each tile across all of your bases to see if there was a loss in quality associated with only one part of the flowcell. The plot shows the deviation from the average quality for each flowcell tile. The hotter colours indicate that reads in the given tile have worse qualities for that position than reads in other tiles. With this sample, you can see that certain tiles show consistently poor quality, especially from ~100bp onwards. A good plot should be blue all over. @@ -413,7 +398,7 @@ But there are also other situations in which an unusually-shaped distribution ma > {: .solution } {: .question} -#### Sequence length distribution +### Sequence length distribution This plot shows the distribution of fragment sizes in the file which was analysed. 
In many cases this will produce a simple plot showing a peak only at one size, but for variable length FASTQ files this will show the relative amounts of each different size of sequence fragment. Our plot shows variable length as we trimmed the data. The biggest peak is at 296bp but there is a second large peak at ~100bp. So even though our sequences range up to 296bp in length, a lot of the good-quality sequences are shorter. This corresponds with the drop we saw in the sequence quality at ~100bp and the red stripes starting at this position in the per tile sequence quality plot. @@ -570,7 +555,7 @@ The quality drops in the middle of these sequences. This could cause bias in dow To accomplish this task we will use [Cutadapt](https://cutadapt.readthedocs.io/en/stable/guide.html) {% cite marcel2011cutadapt %}, a tool that enhances sequence quality by automating adapter trimming as well as quality control. We will: -- Trim low-quality bases from the ends. Quality trimming is done before any adapter trimming. We will set the quality threshold as 20, a commonly used threshold, see more [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531872-Phred-scaled-quality-scores). +- Trim low-quality bases from the ends. Quality trimming is done before any adapter trimming. We will set the quality threshold as 20, a commonly used threshold, see more [in GATK's Phred Score FAQ](https://gatk.broadinstitute.org/hc/en-us/articles/360035531872-Phred-scaled-quality-scores). - Trim adapter with Cutadapt. For that we need to supply the sequence of the adapter. In this sample, Nextera is the adapter that was detected. We can find the sequence of the Nextera adapter on the [Illumina website here](https://support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html) `CTGTCTCTTATACACATCT`. We will trim that sequence from the 3' end of the reads. 
- Filter out sequences with length < 20 after trimming diff --git a/topics/sequence-analysis/tutorials/sars-with-galaxy-on-anvil/tutorial.md b/topics/sequence-analysis/tutorials/sars-with-galaxy-on-anvil/tutorial.md new file mode 100644 index 00000000000000..c2bd9a1e00d17b --- /dev/null +++ b/topics/sequence-analysis/tutorials/sars-with-galaxy-on-anvil/tutorial.md @@ -0,0 +1,401 @@ +--- +layout: tutorial_hands_on + +title: SARS-CoV-2 Viral Sample Alignment and Variant Visualization +level: Introductory +zenodo_link: 'https://doi.org/10.5281/zenodo.8115178' +questions: +- How do I check my data quality? +- How do I put together my dataset of fragmented sequences into a full sequence? +- How do I visually explore the variants in my SARS-CoV-2 sample? +objectives: +- Gather and evaluate experimental evidence, including qualitative and quantitative data +- Generate and interpret graphs displaying experimental results +- Critique large data sets and use bioinformatics to assess genetics data +- Tap into the interdisciplinary nature of science +time_estimation: '1H' +key_points: +- Use Galaxy on the AnVIL cloud computing resource to check data, perform an alignment, and visualize the results. +tags: + - covid19 +contributions: + authorship: + - nakucher + - avahoffman + - robertmeller + editing: + - katherinecox + - cutsort + - nakucher + infrastructure: + - cansavvy + - avahoffman + - katherinecox + - ehumph + - cutsort + funding: + - nhgri-gdscn + - nhgri-anvil +--- + +There is a growing need for undergraduate students to learn cutting-edge concepts in genomics data science, including performing analysis on the cloud instead of a personal computer. + +This lesson aims to introduce a mutant detection bioinformatics pipeline based on a publicly available genetic sample of SARS-CoV-2. Students will be introduced to the sequencing revolution, variants, genetic alignments, and essentials of cloud computing prior to the lab activity. 
During the lesson, students will work hands-on with the point-and-click Galaxy interface on the AnVIL cloud computing resource to check data, perform an alignment, and visualize their results. + +> +> +> In this tutorial we will deal with: +> +> 1. TOC +> {:toc} +> +{: .agenda} + + +# Background Concepts + +## What is a Variant? + +This lecture module introduces genetic variants. It provides several examples of genetic variants, background on the structure of DNA, and a review of the “Central Dogma” of molecular biology - the process of turning DNA into RNA into protein. + +Learning Objectives +- Answer “what is a genetic variant?” +- Learn about the molecular structure of a variant + +Video thumbnail image for the What is a Variant background video. + +## The Sequencing Revolution + +This lecture module introduces the history of the sequencing revolution. It highlights the enormous proliferation of genomic data that has accompanied the rapidly growing technology. It also suggests opportunities for careers in genomics, as well as an in-depth look at how some sequencing technologies actually work. + +Learning Objectives +- Learn the history of the sequencing revolution +- Introduce the sequencing workforce +- Explore the evolution of sequencing technology + +Video thumbnail image for the Sequencing Revolution background video. + +## Alignments + +This lecture module introduces the structure of genomic data and how alignments work. It touches on the “shredded book” analogy, demonstrates how short chunks of data can be compared to find variation, and reviews data files needed for alignments, including reference genomes and read data. It also reviews some diverse applications of variant detection made possible via alignment tools.
+ +Learning Objectives +- Learn about data as “reads” & shredded books +- Become familiar with reference genomes and alignments +- Explore the file structure of genomic data and quality scoring + +Video thumbnail image for the Alignments background video. + +## Cloud Computing + +This lecture module introduces cloud computing and computing architecture. It reviews the utility of cloud computing for genomics and also highlights how all modules of this activity fit together. + +Learning Objectives +- Learn about different types of computers +- Answer the question “What is cloud computing?” +- Learn about cloud computing for genomics +- Revisit the big picture, from variants to alignments + +Video thumbnail image for the Cloud Computing background video. + +# Overview + +This overview video introduces the lab activity. It briefly reviews some of the essential background for the activity, highlights key areas to focus on for activity assessment questions, and provides a detailed walk-through of the steps - from starting AnVIL and Galaxy to browsing the genome and shutting down the cloud computing instance. + +Learning Objectives: +- Review lecture content +- Review big steps in the analysis +- Cover the setup on AnVIL in detail + +Video thumbnail image for the Activity Overview background video. + +# Set Up + +In the next few steps, you will walk through how to get set up to use Galaxy on the AnVIL platform. AnVIL is centered around different “Workspaces”. Each Workspace functions almost like a mini code laboratory - it is a place where data can be examined, stored, and analyzed. The first thing we want to do is to copy or “clone” a Workspace to create a space for you to experiment. + +> +> Because AnVIL runs on a commercial cloud provider, you will need to have set up billing for yourself or through your institution to follow along with this exercise, or you will need to be added to a billing account created for a training event.
+> +> Learn more about options for creating billing accounts at this link: https://jhudatascience.org/AnVIL_Book_Getting_Started/overview-pis.html. +{: .comment} + +> Screen view +> +> * At this point, it might make things easier to open up a new window in your browser and split your screen. That way, you can follow along with this guide on one side and execute the steps on the other. +{: .tip} + +> Clone the Workspace +> +> 1. Use a web browser to go to the AnVIL website. In the browser type: `anvil.terra.bio`. Log into AnVIL. +> 2. Click "View Workspaces". +> - Select the “Public” tab. +> - In the top search bar type the activity workspace `SARS-CoV-2-Genome`. You can also go directly to the following link: https://anvil.terra.bio/#workspaces/gdscn-exercises/SARS-CoV-2-Genome. +> ![Screenshot of the AnVIL platform workspaces page highlighting the Public tab and the SARS-CoV-2-Genome Workspace search result.](../../images/sars-with-galaxy-in-anvil/01-sars-workspace.png) +> 3. Clone the workspace by clicking the teardrop button and selecting “Clone”. +> ![Screenshot showing the teardrop button. The button has been clicked revealing the "clone" option. The Clone option and the teardrop button are highlighted.](../../images/sars-with-galaxy-in-anvil/02-clone-button.png) +> - In the new window, give your Workspace clone a name by adding an underscore (“_”) and your name. +> - Next, select the Billing project provided by your instructor. +> - Leave the Description and Authorization Domain boxes as-is. +> ![Screenshot showing the "clone a workspace" popout. The Workspace name, Billing Project, and Clone Workspace button have been filled in and highlighted.](../../images/sars-with-galaxy-in-anvil/03-clone-settings.png) +> - Click “CLONE WORKSPACE”. +> +{: .hands_on} + +## Starting Galaxy + +Galaxy is a great tool for performing bioinformatics analysis without having to update software or worry too much about coding. 
In order to use Galaxy, we need to create a cloud environment. This is like quickly renting a few computers from Google as the engine to power our Galaxy analysis. + +> Internet Browser +> Google Chrome is the recommended browser for AnVIL, as Galaxy cloud environments operate as expected in it. Safari and Firefox may be used as well, though there may be some functionality that is not supported in these or other browsers. +{: .warning} + +> Start Galaxy +> +> 1. In your new Workspace, click on the “ANALYSES” tab. Next, click on “START”. You should see a popup window on the right side of the screen. +> ![Screenshot of the Workspace Notebooks tab. The notebook tab name and the plus button that starts a cloud environment for Galaxy have been highlighted.](../../images/sars-with-galaxy-in-anvil/05-start-galaxy.png) +> 2. Click on the Galaxy logo to proceed. +> - Click on “NEXT” and “CREATE” to keep all settings as-is. +> - ![The NEXT button among cloud environments has been highlighted.](../../images/sars-with-galaxy-in-anvil/06-galaxy-next.png) +> - ![The CREATE button among cloud environments has been highlighted.](../../images/sars-with-galaxy-in-anvil/07-galaxy-create.png) +> - Click on the Galaxy icon. You will see that the environment is still being set up. This will take 8-10 minutes. +> ![The Galaxy icon appears if the environment has been successfully launched.](../../images/sars-with-galaxy-in-anvil/08-galaxy-provisioning.png) +> 3. When it is done, click “Open”. You might need to refresh the page. +> +{: .hands_on} + +> Refresh +> Remember that you can refresh your browser or navigate away at any time. This is because the connection to the environment is in the cloud, not on your personal computer. +> +{: .tip} + +You can also follow along with the first ~2 minutes of [this video](https://jhudatascience.org/AnVIL_Book_Getting_Started/starting-galaxy.html) to start Galaxy on AnVIL. + +## Navigating Galaxy + +Notice the three main sections.
+ +**Tools** - These are all of the bioinformatics tool packages available for you to use. + +**The Main Dashboard** - This contains flash messages and posts when you first open Galaxy, but when we are using data this is the main interface area. + +**History** - When you start a project you will be able to see all of the documents in the project in the history. Now be aware, this can become very busy. Also the naming that Galaxy uses is not very intuitive, so you must make sure that you label your files with something that makes sense to you. + +![Screenshot of the Galaxy landing page. The Tools and History headings have been highlighted.](../../images/sars-with-galaxy-in-anvil/10-galaxy-on-anvil.png) + +The welcome page includes links to tutorials. You may try these out on your own. If you want to try a new analysis this is a good place to start. + +# Exercise One: Importing Data into Galaxy + +Luckily, we linked to the original data when we cloned our Workspace! We have three files we will need for our activity. These are (1) the reference genome for SARS-CoV-2, and both forward (2) and reverse (3) reads for our sample. Our sample has two sets of reads because the scientists who collected it used paired-end sequencing. The reference genome ends in “.fasta” because it has already been cleaned up by scientists. The sample we are looking at ends in ".fastq" because it is raw data from the sequencer. + +> Import Data from the Workspace +> +> 1. Click on Upload Data in the Tools pane. +> 2. Click on “Choose remote files” at the bottom of the popup. Double-click the top selection, which is the Workspace folder, then “Tables/” then “reference/”. Click the reference .fasta file so that it is highlighted in green and click “Ok”. +> ![Screenshot of the Galaxy Data upload page with the current AnVIL workspace highlighted.](../../images/sars-with-galaxy-in-anvil/11-galaxy-data-workspace.png) +> 3. 
Now that your reference has been added, click “Choose remote files” again to add the two sample files. Double-click the Workspace folder, then “Tables/” then “samples/”. Click the two sample `fastq` files so that they are highlighted in green and click “OK”. +> 4. Click “Start” and once complete, you can click “Close”. +> 5. Confirm your upload worked by looking at the file names in the History pane. +> +{: .hands_on} + +# Exercise Two: Examining Files in Galaxy + +Now that we have some data in our account, we can look at it. In this exercise we will see data in fastq format. This is the typical output from an Illumina sequencer and also the standard input format for most alignment software. + +## Examining Inputs + +Use your mouse and click on the eye icon {% icon galaxy-eye %} of the first file `VA_sample_forward_reads.fastq`. In the Main screen you will see something like this: + +![Screenshot of a fastq file. The data includes DNA sequences but also includes many coded characters, making it hard to understand.](../../images/sars-with-galaxy-in-anvil/12-fastq-view.png) + +> +> +> 1. How many lines in a .fastq file represent an individual read? +> 2. What does each line represent? +> 3. Why is the final line for each read (the quality score) important? +> +> > +> > +> > 1. Four lines represent one read. +> > 2. The lines represent: 1 - A sequence identifier, 2 - The sequence (the base calls; A, C, T, G and N), 3 - A separator (not really data), 4 - The base call quality scores. +> > 3. It can help us filter out data that is wrong and/or low quality. +> > +> {: .solution} +{: .question} + +## Quality Scoring + +FastQC is a tool which aims to provide simple quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a set of analyses which you can use to get a quick impression of whether your data has any problems of which you should be aware before doing any further analysis. + +> Determine the Quality of the Samples +> +> 1.
Find {% tool [FastQC](toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.73+galaxy0) %} in the GENOMIC FILE MANIPULATION: FASTQ Quality Control tool folder. +> - {% icon param-file %} *"Raw read data from your current history"*: `VA_sample_forward_reads.fastq` +> 2. Run the tool. +> +{: .hands_on} + +The main dashboard will highlight in green if everything is okay. In the history, you will see the new files turn yellow, then green. If the job fails it will show an error. + +Click on the eye icon {% icon galaxy-eye %} of the new file “FASTQC on data2 Webpage” in the history. + +You will open up a summary report for the sequencing file: + +![Screenshot of the FastQC results. The Basic Statistics and Per Base Sequence Quality sections for the report on VA_sample_forward_reads.fastq are visible.](../../images/sars-with-galaxy-in-anvil/13-fastqc-report.png) + +> +> +> 1. Explore “Basic Statistics”. How many total reads are there? Have any been flagged as poor quality? What is the sequence length? +> 2. Explore “Per base sequence quality”. Based on the Basic Statistics, is 28-40 a good or bad quality score? +> 3. Is it okay to proceed based on the per base sequence quality? +> +> > +> > +> > 1. 43,522 reads. Zero flagged as poor quality. Sequence length is 3-301 base pairs. +> > 2. 28-40 is pretty good. +> > 3. Yes, because the per base sequence quality is good (“in the green”). +> > +> {: .solution} +{: .question} + +{% snippet topics/sequence-analysis/faqs/quality_score.md %} + +# Exercise Three: Alignment + +Given that our data has passed some quality checks, we will try to align the data to the reference genome. In this case the reference is simple: a viral genome. A human sequencing project will generate much larger data sets. There are many aligners, but we will start off looking at a simple aligner, BWA-MEM. This example uses paired data. + +We will use our two SARS data files, which are ready for alignment.
+ +- `VA_sample_forward_reads.fastq` +- `VA_sample_reverse_reads.fastq` + +> Align to the Reference Dataset +> +> 1. Go to GENOMICS ANALYSIS and expand the Mapping menu. Select {% tool [Map with BWA-MEM](toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa_mem/0.7.17.2) %}. This program will align your reads to your SARS reference genome. Some of our reads are >100 base pairs so we will use the MEM option. +> - *"Will you select a reference genome from your history or use a built-in index?"*: Use a genome from history and build index. +> - {% icon param-file %} *"Use the following dataset as the reference sequence"*: `SARS-CoV-2_reference_genome.fasta` +> - *"Single or Paired-end reads"*: `Paired` +> - *"Select first set of reads"*: `VA_sample_forward_reads.fastq` +> - *"Select second set of reads"*: `VA_sample_reverse_reads.fastq` +> 2. Run the tool. +> +{: .hands_on} + +The output file is a `BAM` file, which lists where each read aligns to the reference genome and whether there are any differences. You can click the eye button to preview the results, but the results are not easy to interpret visually (much like the `fastq` files). Instead you will use a genome viewer in the next step. + +> +> +> 1. What is alignment software (for example, BWA-MEM) actually doing? +> 2. In this example, we are using paired fastq (“paired end”) data. What is an advantage of using paired data? +> +> > +> > +> > 1. Alignment tools figure out the optimal positioning of the reads next to the reference genome to minimize mismatches and gaps. +> > 2. Paired data is made up of fragments that are read twice (forward and reverse). Paired data improves data accuracy. +> > +> {: .solution} +{: .question} + +# Viewing aligned data + +We have aligned our data but it is currently a table of where the reads align. This is hard to read, so we will use [JBrowse](https://jbrowse.org/jbrowse1.html) to view the data. + +> Visualize Reference Data +> +> 1. 
Scroll down in the Tools menu to STATISTICS AND VISUALIZATION. Under "Graph/Display Data", select {% tool [JBrowse](toolshed.g2.bx.psu.edu/repos/iuc/jbrowse/jbrowse/1.16.9+galaxy0) %}. +> - {% icon param-select %} *"Reference genome to display"*: `Use a genome from history` +> - {% icon param-select %} *"Select the reference genome"*: `SARS-CoV-2_reference_genome.fasta` +> 2. Click "Run Tool". +> 3. You should see a new JBrowse item appear in your history. Click the eye icon {% icon galaxy-eye %} to open JBrowse. +> +{: .hands_on} + +You will need to click on the magnifying glasses to zoom in, but you should see the As, Cs, Gs, and Ts and their corresponding colors that make up the SARS-CoV-2 genome! + +![Screenshot of preliminary JBrowse results. The eye icon is highlighted, as it should be used to open the JBrowse viewer. The magnifying glasses in JBrowse are also highlighted as they enable zooming in to see the individual bases.](../../images/sars-with-galaxy-in-anvil/15-galaxy-jbrowse-ref.png) + +This is interesting, but it doesn’t let us compare the genome to the sample we have. We suspect there may be some differences that indicate our sample is the delta variant. + +> Visualize Aligned Data +> +> 1. Scroll down in the Tools menu to STATISTICS AND VISUALIZATION. Under "Graph/Display Data", select {% tool [JBrowse](toolshed.g2.bx.psu.edu/repos/iuc/jbrowse/jbrowse/1.16.9+galaxy0) %}. +> - {% icon param-select %} *"Reference genome to display"*: `Use a genome from history` +> - {% icon param-select %} *"Select the reference genome"*: `SARS-CoV-2_reference_genome.fasta` +> - {% icon param-repeat %} *"+ Insert Track Group"* +> - {% icon param-repeat %} *"+ Insert Annotation Track"* +> - {% icon param-select %} *"Track Type"*: `BAM Pileups` +> - {% icon param-toggle %} *"Autogenerate SNP Track"*: `Yes` +> - {% icon param-text %} *"Maximum size of BAM chunks"*: Add one more zero: `50000000` +> 2. Click "Run Tool".
+> +{: .hands_on} + +You should see a new JBrowse item appear in your history. Click the eye icon {% icon galaxy-eye %} to open JBrowse. Make sure that all boxes are checked on the left side: “Available Tracks”. The tracks will show up in the order that you click on them. + +> Visualize Aligned Data +> +> 1. Select the tracks from the BWA-MEM tool run. +> - {% icon param-check %} `Map with BWA-MEM on data 9, data 8, and data 7 (mapped reads in BAM format)` +> - {% icon param-check %} `Map with BWA-MEM on data 9, data 8, and data 7 (mapped reads in BAM format) - SNPs/Coverage` +> 2. You should see a new JBrowse item appear in your history. Click the eye icon {% icon galaxy-eye %} to open JBrowse. +> +{: .hands_on} + +![Screenshot of JBrowse viewer. The Available Tracks sidebar is shown, with both sample data boxes checked.](../../images/sars-with-galaxy-in-anvil/14-galaxy-jbrowse-bams.png) + +Let’s look at an example mutation in our sample. Type in the reference position “24410” and click “Go”. You should see a bunch of “A”s highlighted in green throughout our sample. The reference sequence (top line) is a “G” but all of the reads are an “A”. This means that our sample is genetically different from the established SARS-CoV-2 reference genome! Researchers often call these single base differences “SNPs” - Single Nucleotide Polymorphisms. + +> +> +> 1. How long is the SARS-CoV-2 genome? Hint: zoom out and scroll to the end of the genome. +> 2. Locate position 23,603. This is the site of an important mutation in the spike protein of the delta variant “P681R”. In this mutation, the amino acid proline is replaced by arginine. Is this mutation present at position 23,603 in our sample? Based on the evidence, do you think this sample is a delta variant? +> +> > +> > +> > 1. ~29,904 bp +> > 2. Yes! C has become a G. Yes, this sample is probably a delta variant because this mutation is indicative of the delta variant. 
+> > +> {: .solution} +{: .question} + +> Sequencing errors +> +> It’s possible to make mistakes in the data preparation before we get to the data analysis. Sometimes this happens when the samples are being prepared in the lab and sometimes this happens because the sequencer makes a mistake. This is one reason why quality scores are helpful. With millions of reads of data, it’s more likely that we see a “SNP” that is actually an accident. Multiple copies of the same areas of our data (“read depth”) help us be sure it’s a real SNP. When we compare across lots of aligned reads of the same area, we can determine the actual sequence by consensus. For example, we can be reasonably confident that the “G” at position 1,203 is just a sequencing or lab mistake. +> +{: .details} + +# Export Your History + +It’s a good idea to export your “History” so that your collaborators can see what you did. + +> Export History to Workspace +> +> 1. Click on the History Menu {% icon galaxy-history-options %} and click on “Export History to File”. +> ![History dropdown menu button and Export History to File buttons are highlighted.](../../images/sars-with-galaxy-in-anvil/16-export-history.png) +> 2. Make sure you select “to a remote file”. Then, click to select where to export your History. On the popup menu, select your Workspace name, then select “Other Data”. Finally, select “Files”. Then click “Select this folder”. Make sure the export directory looks correct. +> 3. Next, name your history “SARS Galaxy Variant Detection” and click “Export”. +> ![When exporting History, make sure you select "to a remote file". The directory should be your workspace name followed by Other Data and Files. The export button is highlighted.](../../images/sars-with-galaxy-in-anvil/17-export.png) +> 4. Back at your Workspace, click on the “Data” tab, and the Files folder. You should now see the History export in your files. 
If you click on the file, you can download it or view it in Google Cloud Storage Browser. +> ![Back on AnVIL, the Data tab and Files folder are selected. A file called SARS_Galaxy_Variant_Detection is now present.](../../images/sars-with-galaxy-in-anvil/18-export-verify.png) +> +{: .hands_on} + +# Wrap-up + +Once you are done with the activity, you’ll need to shut down your Galaxy cloud environment. This frees up the cloud resources for others and minimizes computing cost. The following steps will delete your work, so make sure you are completely finished at this point. Otherwise, you will have to repeat your work from the previous steps. + +> Shut Down Galaxy in AnVIL +> 1. Return to AnVIL, and find the Galaxy logo that shows your cloud environment is running. Click on the Galaxy logo. +> ![Screenshot of the Workspace menu. The currently running Galaxy cloud environment logo on the right of the page is highlighted.](../../images/sars-with-galaxy-in-anvil/19-galaxy-env.png) +> 2. Click "Settings". +> ![Screenshot of the Environment details menu. The Settings button is highlighted.](../../images/sars-with-galaxy-in-anvil/20-galaxy-edit.png) +> 3. Next, scroll down and click on “DELETE ENVIRONMENT”: +> ![Screenshot of the cloud environment pop out menu. The “DELETE ENVIRONMENT” button is highlighted.](../../images/sars-with-galaxy-in-anvil/21-galaxy-delete.png) +> 4. Finally, select “Delete everything, including persistent disk”. Make sure you are done with the activity and then click “DELETE”. +> ![Screenshot of the cloud environment pop out menu. The “Delete everything, including persistent disk” radio button has been checked and is highlighted. The “DELETE” button is highlighted.](../../images/sars-with-galaxy-in-anvil/22-galaxy-delete-pd.png) +> +{: .hands_on} + +# Conclusion +Congratulations! You have run your first analysis using Galaxy in the AnVIL platform!