From 5ce9bdcf959c73a43e2a6377a4960f3fcf4edc0c Mon Sep 17 00:00:00 2001 From: Logan Blair Date: Tue, 23 Jul 2024 11:06:41 -0700 Subject: [PATCH] Tutorial updates --- data/xanthomonas.csv | 60 ++++++++++----------- docs/search.json | 8 +-- docs/tutorial.html | 122 ++++++++++++++++++++++++++----------------- tutorial.qmd | 41 +++++++++------ 4 files changed, 134 insertions(+), 97 deletions(-) diff --git a/data/xanthomonas.csv b/data/xanthomonas.csv index 4e0a540..049347b 100644 --- a/data/xanthomonas.csv +++ b/data/xanthomonas.csv @@ -1,30 +1,30 @@ -sample_id,shortread_1,shortread_2,reference,reference_id,report_group,color_by,date_isolated,date_received,year,host,cv_key,nursery -22-299,test/data/reads/22-299_R1.fastq.gz,test/data/reads/22-299_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/2/22,3/29/22,2022,Pelargonium x hortorum,CV-1,MD -22-300,test/data/reads/22-300_R1.fastq.gz,test/data/reads/22-300_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/2/22,3/30/22,2022,Pelargonium x hortorum,CV-2,MD -22-301,test/data/reads/22-301_R1.fastq.gz,test/data/reads/22-301_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/2/22,3/31/22,2022,Pelargonium x hortorum,CV-3,MD -22-302,test/data/reads/22-302_R1.fastq.gz,test/data/reads/22-302_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/2/22,4/1/22,2022,Pelargonium x hortorum,CV-4,MD -22-303,test/data/reads/22-303_R1.fastq.gz,test/data/reads/22-303_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/2/22,4/2/22,2022,Pelargonium x hortorum,CV-5,MD -22-304,test/data/reads/22-304_R1.fastq.gz,test/data/reads/22-304_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/7/22,4/3/22,2022,Pelargonium x hortorum,CV-6,MD -22-309,test/data/reads/22-309_R1.fastq.gz,test/data/reads/22-309_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/4/22,4/4/22,2022,Pelargonium x hortorum,CV-7,NY -22-310,test/data/reads/22-310_R1.fastq.gz,test/data/reads/22-310_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/4/22,4/5/22,2022,P. 
x hortorum,CV-8,NY -22-311,test/data/reads/22-311_R1.fastq.gz,test/data/reads/22-311_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/4/22,4/6/22,2022,P. x hortorum,CV-9,NY -22-312,test/data/reads/22-312_R1.fastq.gz,test/data/reads/22-312_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/7/22,4/7/22,2022,P. x hortorum,CV-10,NY -22-314,test/data/reads/22-314_R1.fastq.gz,test/data/reads/22-314_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/7/22,4/9/22,2022,P. x hortorum,CV-12,NY -22-315,test/data/reads/22-315_R1.fastq.gz,test/data/reads/22-315_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/8/22,4/10/22,2022,P. x hortorum,CV-13,NY -22-316,test/data/reads/22-316_R1.fastq.gz,test/data/reads/22-316_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/8/22,4/11/22,2022,P. x hortorum,CV-14,NY -22-317,test/data/reads/22-317_R1.fastq.gz,test/data/reads/22-317_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/14/22,4/12/22,2022,P. x hortorum,CV-12,NY -22-318,test/data/reads/22-318_R1.fastq.gz,test/data/reads/22-318_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/14/22,4/13/22,2022,P. x hortorum,CV-15,NY -22-319,test/data/reads/22-319_R1.fastq.gz,test/data/reads/22-319_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/21/22,4/14/22,2022,P. x hortorum,CV-16,NY -22-320,test/data/reads/22-320_R1.fastq.gz,test/data/reads/22-320_R2.fastq.gz,,,xan_test;subgroup,year;nursery,3/21/22,4/15/22,2022,P. x hortorum,CV-17,NY -22-322,test/data/reads/22-322_R1.fastq.gz,test/data/reads/22-322_R2.fastq.gz,,,xan_test;subgroup,year;nursery,2001,4/17/22,2001,P. x hortorum,CV-18,KS -22-324,test/data/reads/22-324_R1.fastq.gz,test/data/reads/22-324_R2.fastq.gz,,,xan_test;subgroup,year;nursery,1987,4/19/22,1987,P. x hortorum,NA,NA -22-329A,test/data/reads/22-329A_R1.fastq.gz,test/data/reads/22-329A_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/25/22,4/30/22,2022,P. 
x hortorum,CV-28,PA -22-329B,test/data/reads/22-329B_R1.fastq.gz,test/data/reads/22-329B_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/25/22,4/29/22,2022,P. x hortorum,CV-28,PA -22-330,test/data/reads/22-330_R1.fastq.gz,test/data/reads/22-330_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/25/22,5/1/22,2022,P. x hortorum,CV-29,PA -22-331,test/data/reads/22-331_R1.fastq.gz,test/data/reads/22-331_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/21/22,4/21/22,2022,Pelargonium x,CV-20,IN -22-332,test/data/reads/22-332_R1.fastq.gz,test/data/reads/22-332_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/21/22,4/22/22,2022,Pelargonium x,CV-21,IN -22-333,test/data/reads/22-333_R1.fastq.gz,test/data/reads/22-333_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/21/22,4/23/22,2022,Pelargonium x,CV-22,IN -22-334,test/data/reads/22-334_R1.fastq.gz,test/data/reads/22-334_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/21/22,4/24/22,2022,Pelargonium x,CV-23,IN -22-335,test/data/reads/22-335_R1.fastq.gz,test/data/reads/22-335_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/21/22,4/25/22,2022,Pelargonium x,CV-24,IN -22-336,test/data/reads/22-336_R1.fastq.gz,test/data/reads/22-336_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/21/22,4/26/22,2022,Pelargonium zonale,CV-25,IN -22-337,test/data/reads/22-337_R1.fastq.gz,test/data/reads/22-337_R2.fastq.gz,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year;nursery,3/21/22,4/27/22,2022,Pelargonium zonale,CV-26,IN +sample_id,path_1,path_2,sequence_type,reference,reference_id,report_group,color_by,date_isolated,date_received,year,host,cv_key,nursery,, 
+22-299,test/data/reads/22-299_R1.fastq.gz,test/data/reads/22-299_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/2/22,3/29/22,2022,Pelargonium x hortorum,CV-1,MD +22-300,test/data/reads/22-300_R1.fastq.gz,test/data/reads/22-300_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/2/22,3/30/22,2022,Pelargonium x hortorum,CV-2,MD +22-301,test/data/reads/22-301_R1.fastq.gz,test/data/reads/22-301_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/2/22,3/31/22,2022,Pelargonium x hortorum,CV-3,MD +22-302,test/data/reads/22-302_R1.fastq.gz,test/data/reads/22-302_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/2/22,4/1/22,2022,Pelargonium x hortorum,CV-4,MD +22-303,test/data/reads/22-303_R1.fastq.gz,test/data/reads/22-303_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/2/22,4/2/22,2022,Pelargonium x hortorum,CV-5,MD +22-304,test/data/reads/22-304_R1.fastq.gz,test/data/reads/22-304_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/7/22,4/3/22,2022,Pelargonium x hortorum,CV-6,MD +22-309,test/data/reads/22-309_R1.fastq.gz,test/data/reads/22-309_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/4/22,4/4/22,2022,Pelargonium x hortorum,CV-7,NY +22-310,test/data/reads/22-310_R1.fastq.gz,test/data/reads/22-310_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/4/22,4/5/22,2022,P. x hortorum,CV-8,NY +22-311,test/data/reads/22-311_R1.fastq.gz,test/data/reads/22-311_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/4/22,4/6/22,2022,P. x hortorum,CV-9,NY +22-312,test/data/reads/22-312_R1.fastq.gz,test/data/reads/22-312_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/7/22,4/7/22,2022,P. x hortorum,CV-10,NY +22-314,test/data/reads/22-314_R1.fastq.gz,test/data/reads/22-314_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/7/22,4/9/22,2022,P. x hortorum,CV-12,NY +22-315,test/data/reads/22-315_R1.fastq.gz,test/data/reads/22-315_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/8/22,4/10/22,2022,P. 
x hortorum,CV-13,NY +22-316,test/data/reads/22-316_R1.fastq.gz,test/data/reads/22-316_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/8/22,4/11/22,2022,P. x hortorum,CV-14,NY +22-317,test/data/reads/22-317_R1.fastq.gz,test/data/reads/22-317_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/14/22,4/12/22,2022,P. x hortorum,CV-12,NY +22-318,test/data/reads/22-318_R1.fastq.gz,test/data/reads/22-318_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/14/22,4/13/22,2022,P. x hortorum,CV-15,NY +22-319,test/data/reads/22-319_R1.fastq.gz,test/data/reads/22-319_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/21/22,4/14/22,2022,P. x hortorum,CV-16,NY +22-320,test/data/reads/22-320_R1.fastq.gz,test/data/reads/22-320_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,3/21/22,4/15/22,2022,P. x hortorum,CV-17,NY +22-322,test/data/reads/22-322_R1.fastq.gz,test/data/reads/22-322_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,2001,4/17/22,2001,P. x hortorum,CV-18,KS +22-324,test/data/reads/22-324_R1.fastq.gz,test/data/reads/22-324_R2.fastq.gz, Illumina,,,xan_test,subgroup,year,nursery,1987,4/19/22,1987,P. x hortorum,NA,NA +22-329A,test/data/reads/22-329A_R1.fastq.gz,test/data/reads/22-329A_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/25/22,4/30/22,2022,P. x hortorum,CV-28,PA, +22-329B,test/data/reads/22-329B_R1.fastq.gz,test/data/reads/22-329B_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/25/22,4/29/22,2022,P. x hortorum,CV-28,PA, +22-330,test/data/reads/22-330_R1.fastq.gz,test/data/reads/22-330_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/25/22,5/1/22,2022,P. 
x hortorum,CV-29,PA, +22-331,test/data/reads/22-331_R1.fastq.gz,test/data/reads/22-331_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/21/22,4/21/22,2022,Pelargonium x,CV-20,IN, +22-332,test/data/reads/22-332_R1.fastq.gz,test/data/reads/22-332_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/21/22,4/22/22,2022,Pelargonium x,CV-21,IN, +22-333,test/data/reads/22-333_R1.fastq.gz,test/data/reads/22-333_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/21/22,4/23/22,2022,Pelargonium x,CV-22,IN, +22-334,test/data/reads/22-334_R1.fastq.gz,test/data/reads/22-334_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/21/22,4/24/22,2022,Pelargonium x,CV-23,IN, +22-335,test/data/reads/22-335_R1.fastq.gz,test/data/reads/22-335_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/21/22,4/25/22,2022,Pelargonium x,CV-24,IN, +22-336,test/data/reads/22-336_R1.fastq.gz,test/data/reads/22-336_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/21/22,4/26/22,2022,Pelargonium zonale,CV-25,IN, +22-337,test/data/reads/22-337_R1.fastq.gz,test/data/reads/22-337_R2.fastq.gz, Illumina,test/data/refs/reference-22-331.fna,22_331_assembly,xan_test,year,nursery,3/21/22,4/27/22,2022,Pelargonium zonale,CV-26,IN, diff --git a/docs/search.json b/docs/search.json index 0b1fa25..bc3944c 100644 --- a/docs/search.json +++ b/docs/search.json @@ -74,7 +74,7 @@ "href": "tutorial.html", "title": "Tutorial", "section": "", - "text": "This example uses sequencing reads from an outbreak of Xanthomonas hortorum in several plant nurseries. We’ll be treating the pathogen as an unknown and using the pathogensurveillance pipeline to determine what we know already (that these samples come from Xanthomonas hortorum). 
We’ll also explore how isolates from different nursery populations relate to each other and the reference sequences of other closely-related organisms. This information can be obtained from several plots that the pathogensurveillance pipeline generates automatically. \n\n\nThe pipeline is designed to work with a wide variety of existing metadata sheets without extensive changes. Here’s a look at “xanthomonas.csv”, which serves as the only unique input file within the command to run the pipeline:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\nshortread_1\nshortread_2\nreference\nreference_id\nreport_group\ncolor_by\ndate_isolated\ndate_received\nyear\nhost\ncv_key\nnursery\n\n\n\n\n22-299\ntest/data/reads/22-299_R1.fastq.gz\ntest/data/reads/22-299_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n3/29/22\n2022\nPelargonium x hortorum\nCV-1\nMD\n\n\n22-300\ntest/data/reads/22-300_R1.fastq.gz\ntest/data/reads/22-300_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n3/30/22\n2022\nPelargonium x hortorum\nCV-2\nMD\n\n\n22-301\ntest/data/reads/22-301_R1.fastq.gz\ntest/data/reads/22-301_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n3/31/22\n2022\nPelargonium x hortorum\nCV-3\nMD\n\n\n22-302\ntest/data/reads/22-302_R1.fastq.gz\ntest/data/reads/22-302_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n4/1/22\n2022\nPelargonium x hortorum\nCV-4\nMD\n\n\n22-303\ntest/data/reads/22-303_R1.fastq.gz\ntest/data/reads/22-303_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n4/2/22\n2022\nPelargonium x hortorum\nCV-5\nMD\n\n\n22-304\ntest/data/reads/22-304_R1.fastq.gz\ntest/data/reads/22-304_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/7/22\n4/3/22\n2022\nPelargonium x hortorum\nCV-6\nMD\n\n\n\n\n\n\nThere is quite a bit of information in this file, but only a few columns are essential (and can be in any order). The input csv needs show the pipeline where to find the sequencing reads. 
These can be present either locally or they can be downloaded.\nUsing local reads: Columns “shortread_1” and “shortread_2” specify the path to forward and reverse reads. Each row corresponds to one individual sample. Reads for this tutorial are hosted on the pathogensurveilance github repo. They are derived from paired-end illumina shortreads, but the pipeline will also work with mixed inputs of Pacbio or Oxford Nanopore sequences.\nDownloading reads: Sequence files may instead be hosted on the ncbi. In that case, the “shortread_1/shortread_2” columns should be substituted with a single “SRA” column, and they will be downloaded from the ncbi automatically. See test/data/metadata/xanthomonas.csv for an example using this input format.\nSpecifying a reference genome (optional): The “reference_refseq” column may be useful when you are relatively confident as to the identity of your samples and would like to include one particular reference for comparison. See Documentation for an in-depth explanation of how to designate mandatory and optional references.\nAssigning sample groups (optional): The optional column “color_by” is used for data visualization. It will assign one or more columns to serve as grouping factors for the output report. Here, samples will be grouped by the values of the “year” and “nursery” columns. Note that multiple factors need to be separated by semicolons within the color_by column. \n\n\n\nHere is the full command used execute this example, using a docker container:\nnextflow run nf-core/pathogensurveillance --input https://raw.githubusercontent.com/grunwaldlab/pathogensurveillance/master/test/data/metadata/xanthomonas.csv --outdir xanthomonas --download_bakta_db true -profile docker -resume --max_cpus 8 --max_memory 30GB -resume\nWhen running your own analysis, you will need to provide your own path to the input CSV file.\nBy default, the pipeline will run on 128 GB of RAM and 16 threads. 
This is more resources than is strictly necessary and beyond the capacity of most desktop computers. We can scale this back a bit for this lightweight test run. This analysis will work with 8 cpus and 30 GB of RAM (albeit more slowly), which is specified by the –max_cpus and –max_memory settings.\nThe setting -resume is only necessary when resuming a previous analysis. However, it doesn’t hurt to include it at the start. If the pipeline is interrupted, this setting allows progress to pick up where it left off – as long as the previous command is executed from the same working directory.\nIf the pipeline begins successfully, you should see a screen tracking your progress:\n[25/63dcee] process > PATHOGENSURVEILLANCE:INPUT_CHECK:SAMPLESHEET_CHECK (xanthomonas.csv)[100%] 1 of 1\n[- ] process > PATHOGENSURVEILLANCE:SRATOOLS_FASTERQDUMP -\n[- ] process > PATHOGENSURVEILLANCE:DOWNLOAD_ASSEMBLIES -\n[- ] process > PATHOGENSURVEILLANCE:SEQKIT_SLIDING -\n[- ] process > PATHOGENSURVEILLANCE:FASTQC -\n[- ] process > PATHOGENSURVEILLANCE:COARSE_SAMPLE_TAXONOMY:BBMAP_SENDSKETCH -\nThe input and output of each process can be accessed from the work/ directory. The subdirectory within work/ is designated by the string to left of each step. Note that this location will be different each time the pipeline is run, and only the first part of the name of the subdirectory is shown. For this run, we could navigate to work/25/63dcee(etc) to access the input csv that is used for the next step. \n\n\n\nYou should see a message similar to this if the pipeline finishes successfully:\n-[nf-core/plantpathsurveil] Pipeline completed successfully-\n\nTo clean the cache, enter the command: \nnextflow clean evil_boyd -f \n\nCompleted at: 20-May-2024 12:44:40\nDuration : 3h 29m 2s\nCPU hours : 15.2\nSucceeded : 253\nThe final report can be viewed as either a .pdf or .html file. It can be accessed inside the reports folder of the output directory (here: xanthomonas/reports). 
This report shows several key pieces of information about your samples.\nA note on storage management - pathogensurveillance creates a large number of intermediate files. For most users we recommend clearing these files after each run. To do so, run the script shown after the completion message (nextflow clean -f). You would not want to do this if: (1) You still need to use the caching system. For example, imagine you would like to compare a new sample to 10 samples from a previous run. In that case some files could be reused to make the pipeline work more quickly. (2) You would like to use intermediate files for your own analysis. By default, these files are saved in the output directory as symlinks to their location in the work/ directory, so you would need to retrieve these before clearing the cache. You could use alternatively use the option –copymode high to save all intermediate files to the published directory, though in the short term this doubles the storage footprint of each run.\nThis particular report has been included as an example \n\nSummary:\n\nPipeline Status Report: error messages for samples or sample groups\nInput Data: Data read from the input .csv file\n\n\nIdentification:\n\nInitial identification: Coarse identification from the bbmap sendsketch step. The first tab shows best species ID for each sample. The second tab shows similarity metrics between sample sequences and other reference genomes: %ANI (average nucleotide identity), %WKID (weighted kmer identity), and %completeness.\n\nFor more information about each metric, click the About this table tab underneath.\n\n\n\n\nMost similar organisms: Shows relationships between samples and references using % ani and % pocp (percentage of conserved proteins). For better resolution, you can interactively zoom in/out of plots.\n\n\nCore gene phylogeny: A core gene phylogeny uses the sequences of all gene shared by all of the genomes included in the tree to infer evolutionary relationships. 
It is the most robust identification provided by this pipeline, but its precision is still limited by the availability of similar reference sequences. Methods to generate this tree differ between prokaryotes and eukaryotes. Our input to the pipeline was prokaryotic DNA sequences, and the method to build this tree is based upon many different core genes shared between samples and references (for eukaryotes, this is constrained to BUSCO genes). This tree is built with iqtree and based upon shared core genes analyzed using the program pirate. You can highlight branches by hovering over and clicking on nodes.\n\n\n\nSNP trees: This tree is better suited for visualizing the genetic diversity among samples. However, the core gene phylogeny provides a much better source of information for evolutionary differences among samples and other known references.\n\n\nMinimum spanning network\n\nMinimum spanning network: The nodes represent unique multilocus genotypes, and the size of nodes is proportional to the # number of samples that share the same genotype. The edges represent the SNP differences between two given genotypes, and the darker the color of the edges, the fewer SNP differences between the two.", + "text": "Before starting, first take a look at the Quickstart for instructions on how to download pathogensurveillance and install both Docker and Nextflow.", "crumbs": [ "Tutorial" ] @@ -238,8 +238,8 @@ "objectID": "tutorial.html#example-1-standard-run", "href": "tutorial.html#example-1-standard-run", "title": "Tutorial", - "section": "", - "text": "This example uses sequencing reads from an outbreak of Xanthomonas hortorum in several plant nurseries. We’ll be treating the pathogen as an unknown and using the pathogensurveillance pipeline to determine what we know already (that these samples come from Xanthomonas hortorum). 
We’ll also explore how isolates from different nursery populations relate to each other and the reference sequences of other closely-related organisms. This information can be obtained from several plots that the pathogensurveillance pipeline generates automatically. \n\n\nThe pipeline is designed to work with a wide variety of existing metadata sheets without extensive changes. Here’s a look at “xanthomonas.csv”, which serves as the only unique input file within the command to run the pipeline:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\nshortread_1\nshortread_2\nreference\nreference_id\nreport_group\ncolor_by\ndate_isolated\ndate_received\nyear\nhost\ncv_key\nnursery\n\n\n\n\n22-299\ntest/data/reads/22-299_R1.fastq.gz\ntest/data/reads/22-299_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n3/29/22\n2022\nPelargonium x hortorum\nCV-1\nMD\n\n\n22-300\ntest/data/reads/22-300_R1.fastq.gz\ntest/data/reads/22-300_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n3/30/22\n2022\nPelargonium x hortorum\nCV-2\nMD\n\n\n22-301\ntest/data/reads/22-301_R1.fastq.gz\ntest/data/reads/22-301_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n3/31/22\n2022\nPelargonium x hortorum\nCV-3\nMD\n\n\n22-302\ntest/data/reads/22-302_R1.fastq.gz\ntest/data/reads/22-302_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n4/1/22\n2022\nPelargonium x hortorum\nCV-4\nMD\n\n\n22-303\ntest/data/reads/22-303_R1.fastq.gz\ntest/data/reads/22-303_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/2/22\n4/2/22\n2022\nPelargonium x hortorum\nCV-5\nMD\n\n\n22-304\ntest/data/reads/22-304_R1.fastq.gz\ntest/data/reads/22-304_R2.fastq.gz\n\n\nxan_test;subgroup\nyear;nursery\n3/7/22\n4/3/22\n2022\nPelargonium x hortorum\nCV-6\nMD\n\n\n\n\n\n\nThere is quite a bit of information in this file, but only a few columns are essential (and can be in any order). The input csv needs show the pipeline where to find the sequencing reads. 
These can be present either locally or they can be downloaded.\nUsing local reads: Columns “shortread_1” and “shortread_2” specify the path to forward and reverse reads. Each row corresponds to one individual sample. Reads for this tutorial are hosted on the pathogensurveilance github repo. They are derived from paired-end illumina shortreads, but the pipeline will also work with mixed inputs of Pacbio or Oxford Nanopore sequences.\nDownloading reads: Sequence files may instead be hosted on the ncbi. In that case, the “shortread_1/shortread_2” columns should be substituted with a single “SRA” column, and they will be downloaded from the ncbi automatically. See test/data/metadata/xanthomonas.csv for an example using this input format.\nSpecifying a reference genome (optional): The “reference_refseq” column may be useful when you are relatively confident as to the identity of your samples and would like to include one particular reference for comparison. See Documentation for an in-depth explanation of how to designate mandatory and optional references.\nAssigning sample groups (optional): The optional column “color_by” is used for data visualization. It will assign one or more columns to serve as grouping factors for the output report. Here, samples will be grouped by the values of the “year” and “nursery” columns. Note that multiple factors need to be separated by semicolons within the color_by column. \n\n\n\nHere is the full command used execute this example, using a docker container:\nnextflow run nf-core/pathogensurveillance --input https://raw.githubusercontent.com/grunwaldlab/pathogensurveillance/master/test/data/metadata/xanthomonas.csv --outdir xanthomonas --download_bakta_db true -profile docker -resume --max_cpus 8 --max_memory 30GB -resume\nWhen running your own analysis, you will need to provide your own path to the input CSV file.\nBy default, the pipeline will run on 128 GB of RAM and 16 threads. 
This is more resources than is strictly necessary and beyond the capacity of most desktop computers. We can scale this back a bit for this lightweight test run. This analysis will work with 8 cpus and 30 GB of RAM (albeit more slowly), which is specified by the –max_cpus and –max_memory settings.\nThe setting -resume is only necessary when resuming a previous analysis. However, it doesn’t hurt to include it at the start. If the pipeline is interrupted, this setting allows progress to pick up where it left off – as long as the previous command is executed from the same working directory.\nIf the pipeline begins successfully, you should see a screen tracking your progress:\n[25/63dcee] process > PATHOGENSURVEILLANCE:INPUT_CHECK:SAMPLESHEET_CHECK (xanthomonas.csv)[100%] 1 of 1\n[- ] process > PATHOGENSURVEILLANCE:SRATOOLS_FASTERQDUMP -\n[- ] process > PATHOGENSURVEILLANCE:DOWNLOAD_ASSEMBLIES -\n[- ] process > PATHOGENSURVEILLANCE:SEQKIT_SLIDING -\n[- ] process > PATHOGENSURVEILLANCE:FASTQC -\n[- ] process > PATHOGENSURVEILLANCE:COARSE_SAMPLE_TAXONOMY:BBMAP_SENDSKETCH -\nThe input and output of each process can be accessed from the work/ directory. The subdirectory within work/ is designated by the string to left of each step. Note that this location will be different each time the pipeline is run, and only the first part of the name of the subdirectory is shown. For this run, we could navigate to work/25/63dcee(etc) to access the input csv that is used for the next step. \n\n\n\nYou should see a message similar to this if the pipeline finishes successfully:\n-[nf-core/plantpathsurveil] Pipeline completed successfully-\n\nTo clean the cache, enter the command: \nnextflow clean evil_boyd -f \n\nCompleted at: 20-May-2024 12:44:40\nDuration : 3h 29m 2s\nCPU hours : 15.2\nSucceeded : 253\nThe final report can be viewed as either a .pdf or .html file. It can be accessed inside the reports folder of the output directory (here: xanthomonas/reports). 
This report shows several key pieces of information about your samples.\nA note on storage management - pathogensurveillance creates a large number of intermediate files. For most users we recommend clearing these files after each run. To do so, run the script shown after the completion message (nextflow clean -f). You would not want to do this if: (1) You still need to use the caching system. For example, imagine you would like to compare a new sample to 10 samples from a previous run. In that case some files could be reused to make the pipeline work more quickly. (2) You would like to use intermediate files for your own analysis. By default, these files are saved in the output directory as symlinks to their location in the work/ directory, so you would need to retrieve these before clearing the cache. You could use alternatively use the option –copymode high to save all intermediate files to the published directory, though in the short term this doubles the storage footprint of each run.\nThis particular report has been included as an example \n\nSummary:\n\nPipeline Status Report: error messages for samples or sample groups\nInput Data: Data read from the input .csv file\n\n\nIdentification:\n\nInitial identification: Coarse identification from the bbmap sendsketch step. The first tab shows best species ID for each sample. The second tab shows similarity metrics between sample sequences and other reference genomes: %ANI (average nucleotide identity), %WKID (weighted kmer identity), and %completeness.\n\nFor more information about each metric, click the About this table tab underneath.\n\n\n\n\nMost similar organisms: Shows relationships between samples and references using % ani and % pocp (percentage of conserved proteins). For better resolution, you can interactively zoom in/out of plots.\n\n\nCore gene phylogeny: A core gene phylogeny uses the sequences of all gene shared by all of the genomes included in the tree to infer evolutionary relationships. 
It is the most robust identification provided by this pipeline, but its precision is still limited by the availability of similar reference sequences. Methods to generate this tree differ between prokaryotes and eukaryotes. Our input to the pipeline was prokaryotic DNA sequences, and the method to build this tree is based upon many different core genes shared between samples and references (for eukaryotes, this is constrained to BUSCO genes). This tree is built with iqtree and based upon shared core genes analyzed using the program pirate. You can highlight branches by hovering over and clicking on nodes.\n\n\n\nSNP trees: This tree is better suited for visualizing the genetic diversity among samples. However, the core gene phylogeny provides a much better source of information for evolutionary differences among samples and other known references.\n\n\nMinimum spanning network\n\nMinimum spanning network: The nodes represent unique multilocus genotypes, and the size of nodes is proportional to the # number of samples that share the same genotype. The edges represent the SNP differences between two given genotypes, and the darker the color of the edges, the fewer SNP differences between the two.", + "section": "Example 1: Standard Run", + "text": "Example 1: Standard Run\nThis example uses sequencing reads from a 2022 outbreak of Xanthomonas hortorum across several plant nurseries. Using whole-genome sequencing, researchers determined a shared genetic basis between strains at different locations. With this information, they traced the origin of the outbreak to a single supplier that sold infected cuttings. You can read more about the study here. \nWe’ll be treating the pathogen as an unknown and using the pathogensurveillance pipeline to determine what we know already (that these samples come from Xanthomonas hortorum). 
We’ll also see the high degree of shared DNA sequence between samples, which is seen from several plots that the pathogensurveillance pipeline generates automatically. \n\nSample input\nThe pipeline is designed to work with a wide variety of existing metadata sheets without extensive changes. Here’s a look at “xanthomonas.csv”, which serves as the only unique input file within the command to run the pipeline:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\npath_1\npath_2\nsequence_type\nreference\nreference_id\nreport_group\ncolor_by\ndate_isolated\ndate_received\nyear\nhost\ncv_key\nnursery\nX\nX.1\n\n\n\n\n22-299\ntest/data/reads/22-299_R1.fastq.gz\ntest/data/reads/22-299_R2.fastq.gz\nIllumina\n\n\nxan_test\nsubgroup\nyear\nnursery\n3/2/22\n3/29/22\n2022\nPelargonium x hortorum\nCV-1\nMD\n\n\n22-300\ntest/data/reads/22-300_R1.fastq.gz\ntest/data/reads/22-300_R2.fastq.gz\nIllumina\n\n\nxan_test\nsubgroup\nyear\nnursery\n3/2/22\n3/30/22\n2022\nPelargonium x hortorum\nCV-2\nMD\n\n\n22-301\ntest/data/reads/22-301_R1.fastq.gz\ntest/data/reads/22-301_R2.fastq.gz\nIllumina\n\n\nxan_test\nsubgroup\nyear\nnursery\n3/2/22\n3/31/22\n2022\nPelargonium x hortorum\nCV-3\nMD\n\n\n22-302\ntest/data/reads/22-302_R1.fastq.gz\ntest/data/reads/22-302_R2.fastq.gz\nIllumina\n\n\nxan_test\nsubgroup\nyear\nnursery\n3/2/22\n4/1/22\n2022\nPelargonium x hortorum\nCV-4\nMD\n\n\n22-303\ntest/data/reads/22-303_R1.fastq.gz\ntest/data/reads/22-303_R2.fastq.gz\nIllumina\n\n\nxan_test\nsubgroup\nyear\nnursery\n3/2/22\n4/2/22\n2022\nPelargonium x hortorum\nCV-5\nMD\n\n\n22-304\ntest/data/reads/22-304_R1.fastq.gz\ntest/data/reads/22-304_R2.fastq.gz\nIllumina\n\n\nxan_test\nsubgroup\nyear\nnursery\n3/7/22\n4/3/22\n2022\nPelargonium x hortorum\nCV-6\nMD\n\n\n\n\n\n\nThere is quite a bit of information in this file, but only a few columns are essential (and can be in any order). The input csv needs show the pipeline where to find the sequencing reads. 
These can either be present locally or they can be downloaded from the NCBI.\nUsing local reads: Columns “path_1” and “path_2” specify the path to forward and reverse reads. Each row corresponds to one individual sample. Reads for this tutorial are hosted on the pathogensurveilance github repo. . If your reads are single-ended, “path_2” should be left blank.\nShortread/Longread sequences*: Information in the column “sequencing_type” tells the pipeline these are derived from illumina shortreads. Other options for this column are “nanopore” and “pacbio”.\nDownloading reads: Sequence files may instead be hosted on the NCBI. In that case, the “shortread_1/shortread_2” columns should be substituted with a single “SRA” column, and they will be downloaded right after the pipeline checks the sample sheet. These downloads will show up in the folder path_surveil_data/reads. See test/data/metadata/xanthomonas.csv for an example using this input format.\nSpecifying a reference genome (optional): The “reference_refseq” column may be useful when you are relatively confident as to the identity of your samples and would like to include one particular reference for comparison. See Example 2 for an explanation of how to designate mandatory and optional references.\nAssigning sample groups (optional): The optional column “color_by” is used for data visualization. It will assign one or more columns to serve as grouping factors for the output report. Here, samples will be grouped by the values of the “year” and “nursery” columns. Note that multiple factors need to be separated by semicolons within the color_by column. 
\n\n\nRunning the pipeline\nHere is the full command used execute this example, using a docker container:\nnextflow run nf-core/pathogensurveillance --input https://raw.githubusercontent.com/grunwaldlab/pathogensurveillance/master/test/data/metadata/xanthomonas.csv --outdir xanthomonas --download_bakta_db true -profile docker -resume --max_cpus 8 --max_memory 30GB -resume\nWhen running your own analysis, you will need to provide your own path to the input CSV file.\nBy default, the pipeline will run on 128 GB of RAM and 16 threads. This is more resources than is strictly necessary and beyond the capacity of most desktop computers. We can scale this back a bit for this lightweight test run. This analysis will work with 8 cpus and 30 GB of RAM (albeit more slowly), which is specified by the –max_cpus and –max_memory settings.\nThe setting -resume is only necessary when resuming a previous analysis. However, it doesn’t hurt to include it at the start. If the pipeline is interrupted, this setting allows progress to pick up where it left off – as long as the previous command is executed from the same working directory.\nIf the pipeline begins successfully, you should see a screen tracking your progress:\n[25/63dcee] process > PATHOGENSURVEILLANCE:INPUT_CHECK:SAMPLESHEET_CHECK (xanthomonas.csv)[100%] 1 of 1\n[- ] process > PATHOGENSURVEILLANCE:SRATOOLS_FASTERQDUMP -\n[- ] process > PATHOGENSURVEILLANCE:DOWNLOAD_ASSEMBLIES -\n[- ] process > PATHOGENSURVEILLANCE:SEQKIT_SLIDING -\n[- ] process > PATHOGENSURVEILLANCE:FASTQC -\n[- ] process > PATHOGENSURVEILLANCE:COARSE_SAMPLE_TAXONOMY:BBMAP_SENDSKETCH -\nThe input and output of each process can be accessed from the work/ directory. The subdirectory within work/ is designated by the string to left of each step. Note that this location will be different each time the pipeline is run, and only the first part of the name of the subdirectory is shown. 
For this run, we could navigate to work/25/63dcee(etc) to access the input csv that is used for the next step. \n\n\nReport\nYou should see a message similar to this if the pipeline finishes successfully:\n-[nf-core/plantpathsurveil] Pipeline completed successfully-\n\nTo clean the cache, enter the command: \nnextflow clean evil_boyd -f \n\nCompleted at: 20-May-2024 12:44:40\nDuration : 3h 29m 2s\nCPU hours : 15.2\nSucceeded : 253\nThe final report can be viewed as either a .pdf or .html file. It can be accessed inside the reports folder of the output directory (here: xanthomonas/reports). This report shows several key pieces of information about your samples.\nA note on storage management - pathogensurveillance creates a large number of intermediate files. For most users we recommend clearing these files after each run. To do so, run the script shown after the completion message (nextflow clean -f). You would not want to do this if: (1) You still need to use the caching system. For example, imagine you would like to compare a new sample to 10 samples from a previous run. In that case, some files could be reused to make the pipeline work more quickly. (2) You would like to use intermediate files for your own analysis. By default, these files are saved in the output directory as symlinks to their location in the work/ directory, so you would need to retrieve these before clearing the cache. You could use alternatively use the option –copymode high to save all intermediate files to the published directory, though in the short term this doubles the storage footprint of each run.\nThis particular report has been included as an example \n\nSummary:\n\nPipeline Status Report: error messages for samples or sample groups\nInput Data: Data read from the input .csv file\n\n\nIdentification:\n\nInitial identification: Coarse identification from the bbmap sendsketch step. The first tab shows best species ID for each sample. 
The second tab shows similarity metrics between sample sequences and other reference genomes: %ANI (average nucleotide identity), %WKID (weighted kmer identity), and %completeness.\n\nFor more information about each metric, click the About this table tab underneath.\n\n\n\n\nMost similar organisms: Shows relationships between samples and references using % ani and % pocp (percentage of conserved proteins). For better resolution, you can interactively zoom in/out of plots.\n\n\nCore gene phylogeny: A core gene phylogeny uses the sequences of all gene shared by all of the genomes included in the tree to infer evolutionary relationships. It is the most robust identification provided by this pipeline, but its precision is still limited by the availability of similar reference sequences. Methods to generate this tree differ between prokaryotes and eukaryotes. Our input to the pipeline was prokaryotic DNA sequences, and the method to build this tree is based upon many different core genes shared between samples and references (for eukaryotes, this is constrained to BUSCO genes). This tree is built with iqtree and based upon shared core genes analyzed using the program pirate. You can highlight branches by hovering over and clicking on nodes.\n\n\n\nSNP trees: This tree is better suited for visualizing the genetic diversity among samples. However, the core gene phylogeny provides a much better source of information for evolutionary differences among samples and other known references.\n\n\nMinimum spanning network\n\nMinimum spanning network: The nodes represent unique multilocus genotypes, and the size of nodes is proportional to the # number of samples that share the same genotype. 
The edges represent the SNP differences between two given genotypes, and the darker the color of the edges, the fewer SNP differences between the two.", "crumbs": [ "Tutorial" ] @@ -249,7 +249,7 @@ "href": "tutorial.html#example-2-defining-references", "title": "Tutorial", "section": "Example 2: Defining References", - "text": "Example 2: Defining References\nYou may already know what your samples are. If so, you may also know the best reference genome and want to tell the pipeline to use it. Other users may have a few different organisms of interest that they want to use as a points of comparison. For example, maybe there is a particularly nasty strain of V. cholerae that you want to see in relation to your other samples. There are a few options to select (or not select) reference genomes for these cases.\nPathogensurveilance has two different categories of reference genomes. Primary references are used for alignment and will always be displayed in phylogenetic trees. In contrast, contextual references are selected before the primary reference is known and may not be used later on in the pipeline. Some contextual references are chosen because they are really close matches to your samples, and these may be selected to become primary references. However, pathogensurveilance will select a few distantly related contextual references too. 
Some of these are used to “fill out” the phylogeny, and you may want a higher or lower number of contextual references depending on how you want your phylogenetic trees to look.\n\nspecifying primary references\nTake this sample list containing three Mycobacterium abscessus samples and three Mycobacterium leprae samples:\n\n\n\n\n\n\nsample_id\nncbi.accession\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\n\n\nmycobacterium_abscessus2\nERR7253669\n\n\nmycobacterium_abscessus3\nERR7253671\n\n\nmycobacterium_leperae1\nSRR6241707\n\n\nmycobacterium_leperae2\nSRR6241708\n\n\nmycobacterium_leperae3\nSRR6241709\n\n\n\n\n\n\nTo force the pipeline to use the NCBI specified Mycobacterium abscessus reference genome for the three Mycobacterium abscessus samples, and likewise make the three Mycobacterium leprae samples use the NCBI specified Mycobacterium leprae genome, we need to tell pathogenserveilance where to find the reference sequences and how to use them. We can either specify a local path to the reads, or this can instead be specified through the ref_ncbi_accession column. Here, how the references are used here is controlled by the ref_primary_usage column:\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\nncbi.accession\nref_ncbi_accession\nref_primary_usage\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\nGCF_001632805.1\nrequired\n\n\nmycobacterium_abscessus2\nERR7253669\nGCF_001632805.1\nrequired\n\n\nmycobacterium_abscessus3\nERR7253671\nGCF_001632805.1\nrequired\n\n\nmycobacterium_leprae1\nSRR6241707\nGCF_003253775.1\nrequired\n\n\nmycobacterium_leprae2\nSRR6241708\nGCF_003253775.1\nrequired\n\n\nmycobacterium_leprae3\nSRR6241709\nGCF_003253775.1\nrequired\n\n\n\n\n\n\n\n\n\nspecifying contextual references\nTaking the previous Mycobacterium abscessus/leprae example, imagine we would like to see the comparison between Mycobacterium leprae and Mycobacterium tuberculosis. 
We can do this by including Mycobacterium tuberculosis as a mandatory contextual reference:\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\nncbi.accession\nref_ncbi_accession\nref_contextual_usage\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\nGCF_001632805.1\nrequired\n\n\nmycobacterium_abscessus2\nERR7253669\nGCF_001632805.1\nrequired\n\n\nmycobacterium_abscessus3\nERR7253671\nGCF_001632805.1\nrequired\n\n\nmycobacterium_leprae1\nSRR6241707\n\n\n\n\nmycobacterium_leprae2\nSRR6241708\n\n\n\n\nmycobacterium_leprae3\nSRR6241709\n\n\n\n\n\n\n\n\n\n\n\nselecting references from an ncbi query\nIt is also possible to submit a valid NCBI query to the pipeline, with reference genomes selected from query hits. For example, if you wanted to test how your Mycobacterium leprae samples compared to a bunch of different other Mycobacterium leprae genomes, your reference csv file may look like:\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\nncbi.accession\nref_ncbi_query\nref_ncbi_query_max\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\n\nNA\n\n\nmycobacterium_abscessus2\nERR7253669\n\nNA\n\n\nmycobacterium_abscessus3\nERR7253671\n\nNA\n\n\nmycobacterium_leprae1\nSRR6241707\nmycobacterium leprae\n100\n\n\nmycobacterium_leprae2\nSRR6241708\nmycobacterium leprae\n100\n\n\nmycobacterium_leprae3\nSRR6241709\nmycobacterium leprae\n100\n\n\n\n\n\n\nSome things to keep in mind:\n\nDepending on your organism, this may a massive amount of data. Make sure you have queried NCBI beforehand to get a good handle on how many references you are downloading.\nThe optional parameter ref_ncbi_query_max is a good way of limiting this number when you are sampling from a densely populated clade, such as Mycobacterium leprae. This parameter can either be a set number (like shown here) or a percentage.\nThe NCBI API will fail if there are too many requests. 
See ncbi support for more detail.\n\n\n\n\nMultiple references per sample\nIf we would like to add multiple references per sample, we can enter this information through a separate reference csv. In this example, we specify one primary reference each for Mycobacterium abscessus and Mycobacterium leprae, then three additional contextual references for Mycobacterium leprae:\n\n\n\n\n\n\n\n\n\n\n\n\nref_group_ids\nref_path\nRef.primary.usage\nRef.contextual.Usage\n\n\n\n\nabscessus\ntest/data/refs/mycobacterium_abscessus_reference1.fna\nrequired\n\n\n\nleprae\ntest/data/refs/mycobacterium_leprae_reference1.fna\nrequired\n\n\n\nleprae\ntest/data/refs/mycobacterium_leprae_reference2.fna\n\noptional\n\n\nleprae\ntest/data/refs/mycobacterium_leprae_reference3.fna\n\noptional\n\n\nleprae\ntest/data/refs/mycobacterium_leprae_reference4.fna\n\noptional\n\n\n\n\n\n\nNote that the “ref_group_ids” column in the sample input csv needs to match the sample csv:\n\n\n\n\n\n\nsample_id\nncbi.accession\nref_group_ids\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\nabscessus\n\n\nmycobacterium_abscessus2\nERR7253669\nabscessus\n\n\nmycobacterium_abscessus3\nERR7253671\nabscessus\n\n\nmycobacterium_leprae1\nSRR6241707\nleprae\n\n\nmycobacterium_leprae2\nSRR6241708\nleprae\n\n\nmycobacterium_leprae3\nSRR6241709\nleprae\n\n\n\n\n\n\nWhen enter the command to run the pipeline, the path this reference csv will need to be specified:\nnextflow run nf-core/pathogensurveillance --sample_inut mycobacterium_samples.csv --reference_input mycobacterium_references.csv --output_dir mycobacterium_test --download_bakta_db true -profile docker", + "text": "Example 2: Defining References\nIf you know what your samples are already, you may want to tell the pipeline to use a “standard” reference genome instead of picking one that is more obscure (even if pathogensurveillance deems it to be a better fit). 
Other users may have a few different organisms of interest that they want to use as a points of comparison. For example, maybe there is a particularly nasty strain of V. cholerae that you want to see in relation to your other samples. There are a few options to select (or not select) reference genomes for these cases.\nPathogensurveillance uses two categories of reference genomes. Primary references are used for alignment and will always be displayed in phylogenetic trees. In contrast, contextual references are selected before the primary reference is known and they may or may not be used later. Some contextual references are chosen because they are really close matches to your samples, and these may be selected to become primary references. However, pathogensurveillance will select a few distantly related contextual references too. Some of these are used to “fill out” the phylogeny, and you may want a higher or lower number of contextual references depending on how you want your phylogenetic trees to look.\n\nChosing primary references\nTake this sample list containing three Mycobacterium abscessus samples and three Mycobacterium leprae samples:\n\n\n\n\n\n\nsample_id\nncbi.accession\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\n\n\nmycobacterium_abscessus2\nERR7253669\n\n\nmycobacterium_abscessus3\nERR7253671\n\n\nmycobacterium_leperae1\nSRR6241707\n\n\nmycobacterium_leperae2\nSRR6241708\n\n\nmycobacterium_leperae3\nSRR6241709\n\n\n\n\n\n\nTo force the pipeline to use the NCBI specified Mycobacterium abscessus reference genome for the three Mycobacterium abscessus samples, and likewise make the three Mycobacterium leprae samples use the NCBI specified Mycobacterium leprae genome, we need to tell pathogensurveillance both where to find these reference sequences and how to use them. We can either specify a local path to the reads, or this can instead be specified through the ref_ncbi_accession column. 
Here, how the references are used here is controlled by the ref_primary_usage column:\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\nncbi.accession\nref_ncbi_accession\nref_primary_usage\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\nGCF_001632805.1\nrequired\n\n\nmycobacterium_abscessus2\nERR7253669\nGCF_001632805.1\nrequired\n\n\nmycobacterium_abscessus3\nERR7253671\nGCF_001632805.1\nrequired\n\n\nmycobacterium_leprae1\nSRR6241707\nGCF_003253775.1\nrequired\n\n\nmycobacterium_leprae2\nSRR6241708\nGCF_003253775.1\nrequired\n\n\nmycobacterium_leprae3\nSRR6241709\nGCF_003253775.1\nrequired\n\n\n\n\n\n\n\n\n\nSpecifying contextual references\nTaking the previous Mycobacterium abscessus/leprae example, imagine we would like to see the comparison between Mycobacterium abscessus and Mycobacterium tuberculosis. We can do this by including Mycobacterium tuberculosis as a mandatory contextual reference:\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\nncbi.accession\nref_ncbi_accession\nref_contextual_usage\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\nGCF_001632805.1\nrequired\n\n\nmycobacterium_abscessus2\nERR7253669\nGCF_001632805.1\nrequired\n\n\nmycobacterium_abscessus3\nERR7253671\nGCF_001632805.1\nrequired\n\n\nmycobacterium_leprae1\nSRR6241707\n\n\n\n\nmycobacterium_leprae2\nSRR6241708\n\n\n\n\nmycobacterium_leprae3\nSRR6241709\n\n\n\n\n\n\n\n\n\n\n\nSelecting references from an NCBI query\nIt is also possible to submit a valid NCBI query to the pipeline with reference genomes selected from query hits. 
For example, you could test how your Mycobacterium leprae samples compared to a bunch of different other Mycobacterium leprae genomes:\n\n\n\n\n\n\n\n\n\n\n\n\nsample_id\nncbi.accession\nref_ncbi_query\nref_ncbi_query_max\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\n\nNA\n\n\nmycobacterium_abscessus2\nERR7253669\n\nNA\n\n\nmycobacterium_abscessus3\nERR7253671\n\nNA\n\n\nmycobacterium_leprae1\nSRR6241707\nmycobacterium leprae\n100\n\n\nmycobacterium_leprae2\nSRR6241708\nmycobacterium leprae\n100\n\n\nmycobacterium_leprae3\nSRR6241709\nmycobacterium leprae\n100\n\n\n\n\n\n\nSome things to keep in mind:\n\nDepending on your organism, this may a massive amount of data. Make sure you have queried NCBI beforehand to get a good handle on how many references you are downloading.\nThe optional parameter ref_ncbi_query_max is a good way of limiting this number when you are sampling from a densely populated clade, such as Mycobacterium leprae. This parameter can either be a set number (like shown here) or a percentage.\nThe NCBI API will fail if there are too many requests. See ncbi support for more detail.\n\n\n\n\nMultiple references per sample\nIf we would like to add multiple references per sample, we can enter this information through a separate reference csv. 
In this example, we specify one primary reference each for Mycobacterium abscessus and Mycobacterium leprae, then three additional contextual references for Mycobacterium leprae:\n\n\n\n\n\n\n\n\n\n\n\n\nref_group_ids\nref_path\nRef.primary.usage\nRef.contextual.Usage\n\n\n\n\nabscessus\ntest/data/refs/mycobacterium_abscessus_reference1.fna\nrequired\n\n\n\nleprae\ntest/data/refs/mycobacterium_leprae_reference1.fna\nrequired\n\n\n\nleprae\ntest/data/refs/mycobacterium_leprae_reference2.fna\n\noptional\n\n\nleprae\ntest/data/refs/mycobacterium_leprae_reference3.fna\n\noptional\n\n\nleprae\ntest/data/refs/mycobacterium_leprae_reference4.fna\n\noptional\n\n\n\n\n\n\nNote that the “ref_group_ids” column in the sample input csv needs to match the sample csv:\n\n\n\n\n\n\nsample_id\nncbi.accession\nref_group_ids\n\n\n\n\nmycobacterium_abscessus1\nERR7253671\nabscessus\n\n\nmycobacterium_abscessus2\nERR7253669\nabscessus\n\n\nmycobacterium_abscessus3\nERR7253671\nabscessus\n\n\nmycobacterium_leprae1\nSRR6241707\nleprae\n\n\nmycobacterium_leprae2\nSRR6241708\nleprae\n\n\nmycobacterium_leprae3\nSRR6241709\nleprae\n\n\n\n\n\n\nThe path to this reference csv needs to be specified in the command to run the pipeline:\nnextflow run nf-core/pathogensurveillance --sample_inut mycobacterium_samples.csv --reference_input mycobacterium_references.csv --output_dir mycobacterium_test --download_bakta_db true -profile docker", "crumbs": [ "Tutorial" ] diff --git a/docs/tutorial.html b/docs/tutorial.html index c4f01a8..8738be3 100644 --- a/docs/tutorial.html +++ b/docs/tutorial.html @@ -194,9 +194,9 @@

On this page

  • Example 2: Defining References
  • @@ -224,9 +224,12 @@

    Tutorial

    +
    +

    Before starting, first take a look at the Quickstart for instructions on how to download pathogensurveillance and install both Docker and Nextflow.

    Example 1: Standard Run

    -

    This example uses sequencing reads from an outbreak of Xanthomonas hortorum in several plant nurseries. We’ll be treating the pathogen as an unknown and using the pathogensurveillance pipeline to determine what we know already (that these samples come from Xanthomonas hortorum). We’ll also explore how isolates from different nursery populations relate to each other and the reference sequences of other closely-related organisms. This information can be obtained from several plots that the pathogensurveillance pipeline generates automatically.

    +

    This example uses sequencing reads from a 2022 outbreak of Xanthomonas hortorum across several plant nurseries. Using whole-genome sequencing, researchers determined a shared genetic basis between strains at different locations. With this information, they traced the origin of the outbreak to a single supplier that sold infected cuttings. You can read more about the study here.

    +

    We’ll be treating the pathogen as an unknown and using the pathogensurveillance pipeline to determine what we know already (that these samples come from Xanthomonas hortorum). We’ll also see the high degree of shared DNA sequence between samples, which is evident in several plots that the pathogensurveillance pipeline generates automatically.

    Sample input

    The pipeline is designed to work with a wide variety of existing metadata sheets without extensive changes. Here’s a look at “xanthomonas.csv”, which serves as the only unique input file within the command to run the pipeline:

    @@ -236,34 +239,40 @@

    Sample input

    -----++++++--++++ - - + + + - + + + @@ -271,13 +280,16 @@

    Sample input

    + - - + + + + - + @@ -286,13 +298,16 @@

    Sample input

    + - - + + + + - + @@ -301,13 +316,16 @@

    Sample input

    + - - + + + + - + @@ -316,13 +334,16 @@

    Sample input

    + - - + + + + - + @@ -331,13 +352,16 @@

    Sample input

    + - - + + + + - + @@ -346,13 +370,16 @@

    Sample input

    + - - + + + + - + @@ -362,10 +389,11 @@

    Sample input

    -

    There is quite a bit of information in this file, but only a few columns are essential (and can be in any order). The input csv needs show the pipeline where to find the sequencing reads. These can be present either locally or they can be downloaded.

    -

    Using local reads: Columns “shortread_1” and “shortread_2” specify the path to forward and reverse reads. Each row corresponds to one individual sample. Reads for this tutorial are hosted on the pathogensurveilance github repo. They are derived from paired-end illumina shortreads, but the pipeline will also work with mixed inputs of Pacbio or Oxford Nanopore sequences.

    -

    Downloading reads: Sequence files may instead be hosted on the ncbi. In that case, the “shortread_1/shortread_2” columns should be substituted with a single “SRA” column, and they will be downloaded from the ncbi automatically. See test/data/metadata/xanthomonas.csv for an example using this input format.

    -

    Specifying a reference genome (optional): The “reference_refseq” column may be useful when you are relatively confident as to the identity of your samples and would like to include one particular reference for comparison. See Documentation for an in-depth explanation of how to designate mandatory and optional references.

    +


    There is quite a bit of information in this file, but only a few columns are essential (and can be in any order). The input csv needs to show the pipeline where to find the sequencing reads. These can either be present locally or they can be downloaded from the NCBI.

    +

    Using local reads: Columns “path_1” and “path_2” specify the paths to the forward and reverse reads. Each row corresponds to one individual sample. Reads for this tutorial are hosted on the pathogensurveillance GitHub repo. If your reads are single-ended, “path_2” should be left blank.

    +

    Shortread/Longread sequences: Information in the column “sequence_type” tells the pipeline these are derived from Illumina shortreads. Other options for this column are “nanopore” and “pacbio”.

    +

    Downloading reads: Sequence files may instead be hosted on the NCBI. In that case, the “path_1” and “path_2” columns should be substituted with a single “SRA” column, and the reads will be downloaded right after the pipeline checks the sample sheet. These downloads will show up in the folder path_surveil_data/reads. See test/data/metadata/xanthomonas.csv for an example using this input format.

    +

    Specifying a reference genome (optional): The “reference_refseq” column may be useful when you are relatively confident as to the identity of your samples and would like to include one particular reference for comparison. See Example 2 for an explanation of how to designate mandatory and optional references.

    Assigning sample groups (optional): The optional column “color_by” is used for data visualization. It will assign one or more columns to serve as grouping factors for the output report. Here, samples will be grouped by the values of the “year” and “nursery” columns. Note that multiple factors need to be separated by semicolons within the color_by column.

    @@ -397,7 +425,7 @@

    Report

    CPU hours : 15.2 Succeeded : 253

    The final report can be viewed as either a .pdf or .html file. It can be accessed inside the reports folder of the output directory (here: xanthomonas/reports). This report shows several key pieces of information about your samples.

    -

    A note on storage management - pathogensurveillance creates a large number of intermediate files. For most users we recommend clearing these files after each run. To do so, run the script shown after the completion message (nextflow clean -f). You would not want to do this if: (1) You still need to use the caching system. For example, imagine you would like to compare a new sample to 10 samples from a previous run. In that case some files could be reused to make the pipeline work more quickly. (2) You would like to use intermediate files for your own analysis. By default, these files are saved in the output directory as symlinks to their location in the work/ directory, so you would need to retrieve these before clearing the cache. You could use alternatively use the option –copymode high to save all intermediate files to the published directory, though in the short term this doubles the storage footprint of each run.

    +

    A note on storage management - pathogensurveillance creates a large number of intermediate files. For most users we recommend clearing these files after each run. To do so, run the script shown after the completion message (nextflow clean -f). You would not want to do this if: (1) You still need to use the caching system. For example, imagine you would like to compare a new sample to 10 samples from a previous run. In that case, some files could be reused to make the pipeline work more quickly. (2) You would like to use intermediate files for your own analysis. By default, these files are saved in the output directory as symlinks to their location in the work/ directory, so you would need to retrieve these before clearing the cache. You could alternatively use the option –copymode high to save all intermediate files to the published directory, though in the short term this doubles the storage footprint of each run.

    This particular report has been included as an example


    Summary:

    @@ -433,10 +461,10 @@

    Report

    Example 2: Defining References

    -

    You may already know what your samples are. If so, you may also know the best reference genome and want to tell the pipeline to use it. Other users may have a few different organisms of interest that they want to use as a points of comparison. For example, maybe there is a particularly nasty strain of V. cholerae that you want to see in relation to your other samples. There are a few options to select (or not select) reference genomes for these cases.

    -

    Pathogensurveilance has two different categories of reference genomes. Primary references are used for alignment and will always be displayed in phylogenetic trees. In contrast, contextual references are selected before the primary reference is known and may not be used later on in the pipeline. Some contextual references are chosen because they are really close matches to your samples, and these may be selected to become primary references. However, pathogensurveilance will select a few distantly related contextual references too. Some of these are used to “fill out” the phylogeny, and you may want a higher or lower number of contextual references depending on how you want your phylogenetic trees to look.

    -
    -

    specifying primary references

    +

    If you know what your samples are already, you may want to tell the pipeline to use a “standard” reference genome instead of picking one that is more obscure (even if pathogensurveillance deems it to be a better fit). Other users may have a few different organisms of interest that they want to use as points of comparison. For example, maybe there is a particularly nasty strain of V. cholerae that you want to see in relation to your other samples. There are a few options to select (or not select) reference genomes for these cases.

    +

    Pathogensurveillance uses two categories of reference genomes. Primary references are used for alignment and will always be displayed in phylogenetic trees. In contrast, contextual references are selected before the primary reference is known and they may or may not be used later. Some contextual references are chosen because they are really close matches to your samples, and these may be selected to become primary references. However, pathogensurveillance will select a few distantly related contextual references too. Some of these are used to “fill out” the phylogeny, and you may want a higher or lower number of contextual references depending on how you want your phylogenetic trees to look.

    +
    +

    Choosing primary references

    Take this sample list containing three Mycobacterium abscessus samples and three Mycobacterium leprae samples:

    @@ -478,7 +506,7 @@

    specifying p

    -

    To force the pipeline to use the NCBI specified Mycobacterium abscessus reference genome for the three Mycobacterium abscessus samples, and likewise make the three Mycobacterium leprae samples use the NCBI specified Mycobacterium leprae genome, we need to tell pathogenserveilance where to find the reference sequences and how to use them. We can either specify a local path to the reads, or this can instead be specified through the ref_ncbi_accession column. Here, how the references are used here is controlled by the ref_primary_usage column:

    +

    To force the pipeline to use the NCBI specified Mycobacterium abscessus reference genome for the three Mycobacterium abscessus samples, and likewise make the three Mycobacterium leprae samples use the NCBI specified Mycobacterium leprae genome, we need to tell pathogensurveillance both where to find these reference sequences and how to use them. We can either specify a local path to the reference sequences, or this can instead be specified through the ref_ncbi_accession column. Here, how the references are used is controlled by the ref_primary_usage column:

    @@ -542,8 +570,8 @@

    specifying p

    -

    specifying contextual references

    -

    Taking the previous Mycobacterium abscessus/leprae example, imagine we would like to see the comparison between Mycobacterium leprae and Mycobacterium tuberculosis. We can do this by including Mycobacterium tuberculosis as a mandatory contextual reference:

    +

    Specifying contextual references

    +

    Taking the previous Mycobacterium abscessus/leprae example, imagine we would like to see the comparison between Mycobacterium abscessus and Mycobacterium tuberculosis. We can do this by including Mycobacterium tuberculosis as a mandatory contextual reference:

    @@ -607,8 +635,8 @@

    specifyin

    -

    selecting references from an ncbi query

    -

    It is also possible to submit a valid NCBI query to the pipeline, with reference genomes selected from query hits. For example, if you wanted to test how your Mycobacterium leprae samples compared to a bunch of different other Mycobacterium leprae genomes, your reference csv file may look like:

    +

    Selecting references from an NCBI query

    +

    It is also possible to submit a valid NCBI query to the pipeline with reference genomes selected from query hits. For example, you could test how your Mycobacterium leprae samples compared to a bunch of different other Mycobacterium leprae genomes:

    @@ -782,7 +810,7 @@

    Multiple re

    -

    When enter the command to run the pipeline, the path this reference csv will need to be specified:

    +

    The path to this reference csv needs to be specified in the command to run the pipeline:

    nextflow run nf-core/pathogensurveillance --sample_inut mycobacterium_samples.csv --reference_input mycobacterium_references.csv --output_dir mycobacterium_test --download_bakta_db true -profile docker 
    diff --git a/tutorial.qmd b/tutorial.qmd index 1174764..07be748 100644 --- a/tutorial.qmd +++ b/tutorial.qmd @@ -7,10 +7,15 @@ format: params: inputs: "tutorial_data/all_inputs" --- +------------------------------------------------------------------------ + +Before starting, first take a look at the Quickstart for instructions on how to download pathogensurveillance and install both Docker and Nextflow.

    ## Example 1: Standard Run -This example uses sequencing reads from an outbreak of *Xanthomonas hortorum* in several plant nurseries. We'll be treating the pathogen as an unknown and using the pathogensurveillance pipeline to determine what we know already (that these samples come from *Xanthomonas hortorum*). We'll also explore how isolates from different nursery populations relate to each other and the reference sequences of other closely-related organisms. This information can be obtained from several plots that the pathogensurveillance pipeline generates automatically.

    +This example uses sequencing reads from a 2022 outbreak of *Xanthomonas hortorum* across several plant nurseries. Using whole-genome sequencing, researchers determined a shared genetic basis between strains at different locations. With this information, they traced the origin of the outbreak to a single supplier that sold infected cuttings. You can read more about the study here. + +We'll be treating the pathogen as an unknown and using the pathogensurveillance pipeline to determine what we know already (that these samples come from *Xanthomonas hortorum*). We'll also see the high degree of shared DNA sequence between samples, which is visible in several plots that the pathogensurveillance pipeline generates automatically.
    ### Sample input @@ -25,13 +30,17 @@ df = read.csv("data/xanthomonas.csv") df |> head() ``` -There is quite a bit of information in this file, but only a few columns are essential (and can be in any order). The input csv needs show the pipeline where to find the sequencing reads. These can be present either locally or they can be downloaded. +
    There is quite a bit of information in this file, but only a few columns are essential (and can be in any order). The input csv needs to show the pipeline where to find the sequencing reads. These can either be present locally or they can be downloaded from the NCBI. + +**Sample ID**: The "sample_id" column is used to name your samples. This information will be used in graphs, so it is recommended to keep names short but informative. If you do not include this column, sample IDs will be generated from the names of your fastq files. + +**Using local reads**: Columns "path_1" and "path_2" specify the path to forward and reverse reads. Each row corresponds to one individual sample. Reads for this tutorial are hosted on the pathogensurveillance github repo. If your reads are single-ended, "path_2" should be left blank. -**Using local reads**: Columns "shortread_1" and "shortread_2" specify the path to forward and reverse reads. Each row corresponds to one individual sample. Reads for this tutorial are hosted on the pathogensurveilance github repo. They are derived from paired-end illumina shortreads, but the pipeline will also work with mixed inputs of Pacbio or Oxford Nanopore sequences. +**Shortread/Longread sequences**: Information in the column "sequence_type" tells the pipeline these are derived from illumina shortreads. Other options for this column are "nanopore" and "pacbio". -**Downloading reads**: Sequence files may instead be hosted on the ncbi. In that case, the "shortread_1/shortread_2" columns should be substituted with a single "SRA" column, and they will be downloaded from the ncbi automatically. See test/data/metadata/xanthomonas.csv for an example using this input format. +**Downloading reads**: Sequence files may instead be hosted on the NCBI. In that case, the "path_1/path_2" columns should be substituted with a single "SRA" column, and they will be downloaded right after the pipeline checks the sample sheet. 
These downloads will show up in the folder path_surveil_data/reads. See test/data/metadata/xanthomonas.csv for an example using this input format. -**Specifying a reference genome (optional)**: The "reference_refseq" column may be useful when you are relatively confident as to the identity of your samples and would like to include one particular reference for comparison. See Documentation for an in-depth explanation of how to designate mandatory and optional references. +**Specifying a reference genome (optional)**: The "reference_refseq" column may be useful when you are relatively confident as to the identity of your samples and would like to include one particular reference for comparison. See Example 2 for an explanation of how to designate mandatory and optional references. **Assigning sample groups (optional)**: The optional column "color_by" is used for data visualization. It will assign one or more columns to serve as grouping factors for the output report. Here, samples will be grouped by the values of the "year" and "nursery" columns. Note that multiple factors need to be separated by semicolons within the color_by column.

    @@ -40,7 +49,7 @@ There is quite a bit of information in this file, but only a few columns are ess Here is the full command used execute this example, using a docker container: ``` bash -nextflow run nf-core/pathogensurveillance --input https://raw.githubusercontent.com/grunwaldlab/pathogensurveillance/master/test/data/metadata/xanthomonas.csv --outdir xanthomonas --download_bakta_db true -profile docker -resume --max_cpus 8 --max_memory 30GB -resume +nextflow run nf-core/pathogensurveillance --sample_data https://raw.githubusercontent.com/grunwaldlab/pathogensurveillance/master/test/data/metadata/xanthomonas.csv --out_dir xanthomonas --download_bakta_db true -profile docker -resume --max_cpus 8 --max_memory 30GB -resume ``` When running your own analysis, you will need to provide your own path to the input CSV file. @@ -80,7 +89,7 @@ Succeeded : 253 The final report can be viewed as either a .pdf or .html file. It can be accessed inside the reports folder of the output directory (here: `xanthomonas/reports`). This report shows several key pieces of information about your samples. -A note on storage management - pathogensurveillance creates a large number of intermediate files. For most users we recommend clearing these files after each run. To do so, run the script shown after the completion message (nextflow clean -f). You would not want to do this if: (1) You still need to use the caching system. For example, imagine you would like to compare a new sample to 10 samples from a previous run. In that case some files could be reused to make the pipeline work more quickly. (2) You would like to use intermediate files for your own analysis. By default, these files are saved in the output directory as symlinks to their location in the work/ directory, so you would need to retrieve these before clearing the cache. 
    You could use alternatively use the option --copymode high to save all intermediate files to the published directory, though in the short term this doubles the storage footprint of each run. +A note on storage management - pathogensurveillance creates a large number of intermediate files. For most users we recommend clearing these files after each run. To do so, run the script shown after the completion message (nextflow clean -f). You would not want to do this if: (1) You still need to use the caching system. For example, imagine you would like to compare a new sample to 10 samples from a previous run. In that case, some files could be reused to make the pipeline work more quickly. (2) You would like to use intermediate files for your own analysis. By default, these files are saved in the output directory as symlinks to their location in the work/ directory, so you would need to retrieve these before clearing the cache. You could alternatively use the option --copymode high to save all intermediate files to the published directory, though in the short term this doubles the storage footprint of each run. This particular report has been included as an example @@ -96,7 +105,7 @@ This particular report has been included as an head() ### Specifying contextual references -Taking the previous *Mycobacterium abscessus/leprae* example, imagine we would like to see the comparison between *Mycobacterium leprae* and *Mycobacterium tuberculosis*. 
We can do this by including *Mycobacterium tuberculosis* as a mandatory contextual reference: ```{r} #| echo: false #| message: false @@ -176,9 +185,9 @@ df |> head() ------------------------------------------------------------------------ -### Selecting references from an ncbi query +### Selecting references from an NCBI query -It is also possible to submit a valid NCBI query to the pipeline, with reference genomes selected from query hits. For example, if you wanted to test how your *Mycobacterium leprae* samples compared to a bunch of different other *Mycobacterium leprae* genomes, your reference csv file may look like: +It is also possible to submit a valid NCBI query to the pipeline with reference genomes selected from query hits. For example, you could test how your *Mycobacterium leprae* samples compared to a bunch of different other *Mycobacterium leprae* genomes: ```{r} #| echo: false @@ -220,7 +229,7 @@ df = read.csv("data/mycobacterium_6.csv") df |> head() ``` -When enter the command to run the pipeline, the path this reference csv will need to be specified: +The path to this reference csv needs to be specified in the command to run the pipeline: ``` bash -nextflow run nf-core/pathogensurveillance --sample_inut mycobacterium_samples.csv --reference_input mycobacterium_references.csv --output_dir mycobacterium_test --download_bakta_db true -profile docker +nextflow run nf-core/pathogensurveillance --sample_data mycobacterium_samples.csv --reference_input mycobacterium_references.csv --out_dir mycobacterium_test --download_bakta_db true -profile docker ``` \ No newline at end of file
    sample_idshortread_1shortread_2path_1path_2sequence_type reference reference_id report_group color_by date_isolated date_receivedyearyear host cv_key nurseryXX.1
    22-299 test/data/reads/22-299_R1.fastq.gz test/data/reads/22-299_R2.fastq.gzIllumina xan_test;subgroupyear;nurseryxan_testsubgroupyearnursery 3/2/22 3/29/2220222022 Pelargonium x hortorum CV-1 MD 22-300 test/data/reads/22-300_R1.fastq.gz test/data/reads/22-300_R2.fastq.gzIllumina xan_test;subgroupyear;nurseryxan_testsubgroupyearnursery 3/2/22 3/30/2220222022 Pelargonium x hortorum CV-2 MD 22-301 test/data/reads/22-301_R1.fastq.gz test/data/reads/22-301_R2.fastq.gzIllumina xan_test;subgroupyear;nurseryxan_testsubgroupyearnursery 3/2/22 3/31/2220222022 Pelargonium x hortorum CV-3 MD 22-302 test/data/reads/22-302_R1.fastq.gz test/data/reads/22-302_R2.fastq.gzIllumina xan_test;subgroupyear;nurseryxan_testsubgroupyearnursery 3/2/22 4/1/2220222022 Pelargonium x hortorum CV-4 MD 22-303 test/data/reads/22-303_R1.fastq.gz test/data/reads/22-303_R2.fastq.gzIllumina xan_test;subgroupyear;nurseryxan_testsubgroupyearnursery 3/2/22 4/2/2220222022 Pelargonium x hortorum CV-5 MD 22-304 test/data/reads/22-304_R1.fastq.gz test/data/reads/22-304_R2.fastq.gzIllumina xan_test;subgroupyear;nurseryxan_testsubgroupyearnursery 3/7/22 4/3/2220222022 Pelargonium x hortorum CV-6 MD