Adding single-read functionality to RAW and CLEAN #80

Status: Open. Wants to merge 97 commits into base branch harmon_fix_gh_actions_test.
Changes from 77 commits.
15354f6
Adding single read option to raw/main.nf
simonleandergrimm Oct 21, 2024
ad2115d
Adding WIP version of run.nf to enable testing raw and clean versions…
simonleandergrimm Oct 21, 2024
03ee37a
Created separate versions of summarize-multiqc-single.R and summarize…
simonleandergrimm Oct 22, 2024
b517340
Split processes in fastp to a single read and paired-end read version.
simonleandergrimm Oct 22, 2024
01ea0c5
Split processes in MultiQC to a single read and paired-end read versi…
simonleandergrimm Oct 22, 2024
ad8faf9
Deleted summarizeMultiqcSingle, which was superseded by summarizeMultiqc
simonleandergrimm Oct 22, 2024
ef0e9c8
Split processes in truncateConcat to a single read and paired-end rea…
simonleandergrimm Oct 22, 2024
2535ccd
Created a single_end if clause in Clean to either use the single read…
simonleandergrimm Oct 22, 2024
cbcb109
Created a single_end if clause in hv_screen to either use the single …
simonleandergrimm Oct 22, 2024
c7f8c83
Created a single_end if clause in qc to either use the single read or…
simonleandergrimm Oct 22, 2024
ff0a8be
Renamed test dir to test-paired-end. Added clause in nextflow.config …
simonleandergrimm Oct 22, 2024
6048dd3
Edited gitignore to leave out test-paired-end and test-single-read ru…
simonleandergrimm Oct 22, 2024
92270e5
Fixed name of test-single-end dir to test-single-read
simonleandergrimm Oct 22, 2024
b13ac94
Created a version of test dir that allows the run of single-read data.
simonleandergrimm Oct 22, 2024
dff2302
Added script to quickly download the s3 output of test single read an…
simonleandergrimm Oct 23, 2024
64bb7f4
Added nextflow config for test paired and test single read.
simonleandergrimm Oct 23, 2024
5bd1aec
Fixed if clause in main.nf
simonleandergrimm Oct 23, 2024
c8fd3ac
Updated gen samplesheet scripts to pull in data from s3://nao-mgs-sim…
simonleandergrimm Oct 23, 2024
578fde0
Updated gitignore
simonleandergrimm Oct 23, 2024
59218b9
Activated CLEAN subworkflow in run.nf
simonleandergrimm Oct 23, 2024
fd9dc1e
Starting to adapt Will's https://data.securebio.org/wills-public-note…
simonleandergrimm Oct 23, 2024
81ff0ba
Adding ignoring mgs-results to gitignore
simonleandergrimm Oct 23, 2024
590b2c3
Adding Will's auxiliary scripts to run his quarto notebooks.
simonleandergrimm Oct 23, 2024
6a650b4
Merge branch 'master' into single-read-raw
simonleandergrimm Oct 23, 2024
9f1eb03
Amended qmd somewhat so data imports work.
simonleandergrimm Oct 24, 2024
9622004
Added a flag to summarize-multiqc-single.R that provides info on the…
simonleandergrimm Oct 25, 2024
c61ed0c
Amended logic of split_sample, so it does not split and pull out read…
simonleandergrimm Oct 25, 2024
f8d9c28
Deleting seperate version of summarize-multiqc I created for paired r…
simonleandergrimm Oct 25, 2024
8e1c7b5
Revert "Split processes in MultiQC to a single read and paired-end re…
simonleandergrimm Oct 25, 2024
0ba0552
Revert "Deleted summarizeMultiqcSingle, which was superseded by summa…
simonleandergrimm Oct 25, 2024
8bafee8
Revert "Created a single_end if clause in qc to either use the single…
simonleandergrimm Oct 25, 2024
68c7c50
Amended main.nf of summarizeMultiqcSingle, clean, qc, and raw, to pro…
simonleandergrimm Oct 25, 2024
1656b33
Amended summarize-multiqc-single.R's basic_info_fastqc so it also sub…
simonleandergrimm Oct 25, 2024
4ec6788
Switched the --paired flag to instead be --read_type, and have it be …
simonleandergrimm Oct 25, 2024
f2bb836
Merge branch 'dev' into single-read-raw
simonleandergrimm Oct 25, 2024
e13acc6
Deleted a directory with testing scripts that was superseded by https…
simonleandergrimm Oct 25, 2024
9c62aa4
this script is now in https://github.com/naobservatory/simon-analysis…
simonleandergrimm Oct 25, 2024
be46ee9
Adding normal test dataset back in.
simonleandergrimm Oct 26, 2024
17d61ff
removing new versions of generate_samplesheet.sh (will add two differ…
simonleandergrimm Oct 26, 2024
0ba23fb
Reinstating dev version of run.nf, and creating new version of run.nf…
simonleandergrimm Oct 26, 2024
118378c
Adding run_dev_se to main.nf, a run specifically used for checking if…
simonleandergrimm Oct 26, 2024
8cd5239
Fixing default value for --read_type in summarize-multiqc-single.R. A…
simonleandergrimm Oct 26, 2024
7d3e725
Dropping commented out sections in split_sample
simonleandergrimm Oct 26, 2024
c107e91
Pulling in newest version of generate_samplesheet.sh
simonleandergrimm Oct 26, 2024
2d07ae6
Fixing single vs paired end read logic in hv_screen
simonleandergrimm Oct 26, 2024
74cb53a
Turned generate_samplesheet.sh back into dev version. Will and single…
simonleandergrimm Oct 28, 2024
3b0a11c
Adding read_type information to run.nf so the correct processes are p…
simonleandergrimm Oct 28, 2024
8f6beda
Extended generate_samplesheet.sh so it also takes in single-read data.
Nov 12, 2024
69f404c
Merge branch 'master' into single-read-raw-clean
simonleandergrimm Nov 19, 2024
654dd1c
Amended subworkflows to take in single end data.
simonleandergrimm Nov 19, 2024
2a01243
Merge branch 'master' into single-read-raw-clean
simonleandergrimm Nov 19, 2024
e9f7384
Reworked summarize_multiqc_pair.R to take in single_end data.
simonleandergrimm Nov 19, 2024
793a061
Made run_dev_se.nf follow updates to run.nf, and fixed single_end det…
simonleandergrimm Nov 19, 2024
fdf81af
Dropped two versions of FASTP, created conditional statement instead.
simonleandergrimm Nov 19, 2024
95dcf91
Dropped two different versions of the truncate_concat and added condi…
simonleandergrimm Nov 19, 2024
ada8c5e
dropped conditional selsection of processes.
simonleandergrimm Nov 19, 2024
a448dc9
Fixed single_end variable passing
simonleandergrimm Nov 19, 2024
e9b89be
Added new single read flagging in run.nf
simonleandergrimm Nov 19, 2024
eb82a32
removed old summarize-multiqc file
simonleandergrimm Nov 19, 2024
00ddcfc
fixed index in nextflow.config for paired end data.
simonleandergrimm Nov 19, 2024
e5b5ec5
added grouping and ndew index info to test-single-read config
simonleandergrimm Nov 19, 2024
8e201e7
Adding improved configs
simonleandergrimm Nov 23, 2024
591138d
dropped single end definition in run file.
simonleandergrimm Nov 23, 2024
27244bd
Adding params to single end variable invocation
simonleandergrimm Nov 23, 2024
517961f
removed whitespace
simonleandergrimm Nov 23, 2024
c28749f
updating nextflow.config of test
simonleandergrimm Nov 23, 2024
e132ec4
fixed single_end config in normal run workflow
simonleandergrimm Nov 23, 2024
51b9cf3
make single-end variable logical.
simonleandergrimm Nov 23, 2024
12c3fdd
Reverted to old gitignore structure.
simonleandergrimm Nov 23, 2024
4fd3ce6
Changed test dirs to only have one dir for run_dev_se.
simonleandergrimm Nov 24, 2024
d460813
Adding WIP progress
simonleandergrimm Nov 24, 2024
f412b07
Merge branch 'dev' into single-read-raw-clean
simonleandergrimm Nov 24, 2024
3d10bb0
Fixing single_end being unbound.
simonleandergrimm Nov 24, 2024
7899979
Merge branch 'dev' into single-read-raw-clean
simonleandergrimm Nov 29, 2024
dd942fa
Took into account new testing setup
simonleandergrimm Nov 29, 2024
50c2edc
adding single end info to config
simonleandergrimm Nov 29, 2024
61ea369
Moved single end eval from config to run files
simonleandergrimm Nov 29, 2024
ad640c6
Update nextflow.config
simonleandergrimm Dec 3, 2024
e85dd45
Merge remote-tracking branch 'origin/harmon_fix_gh_actions_test' into…
simonleandergrimm Dec 3, 2024
3a6f6b5
Merge remote-tracking branch 'origin/harmon_fix_gh_actions_test' into…
simonleandergrimm Dec 4, 2024
a0f5f32
Put single_end into profiles.config
simonleandergrimm Dec 4, 2024
d14da14
fixed run-dev-se config in tests
simonleandergrimm Dec 4, 2024
3fe2bd2
Creating a new config for read_type flag.
simonleandergrimm Dec 4, 2024
d0375ab
added run dev se to end-to-end yml
simonleandergrimm Dec 4, 2024
f5cf80a
Made rundevse index and outputs look the same as run.nf
simonleandergrimm Dec 4, 2024
3dc323e
Fixing setup of run_dev_se test config.
simonleandergrimm Dec 5, 2024
1904931
Update .gitignore (dropped new line)
simonleandergrimm Dec 5, 2024
e24d79e
Setting profiles.config back to original
simonleandergrimm Dec 5, 2024
b38b93d
Updated comments in main.nf to represent the posiblity of not not ala…
simonleandergrimm Dec 9, 2024
21b15b8
Fixed duplicate par statement in fastp.
simonleandergrimm Dec 9, 2024
9d717b7
Responding to Harmon's comments.
simonleandergrimm Dec 9, 2024
ee7baf4
dropped unncessary single-end variable.
simonleandergrimm Dec 9, 2024
034914b
fixed faulty paired-end fastp
simonleandergrimm Dec 10, 2024
c5454b9
added end-to-end-se.yml
simonleandergrimm Dec 10, 2024
4b966d8
adedd subworkflow to create samplesheet
simonleandergrimm Dec 11, 2024
7a3a59b
split truncate concat into two processes/
simonleandergrimm Dec 11, 2024
6ad3ce2
removed run dev se from end to end yml.
simonleandergrimm Dec 11, 2024
2 changes: 1 addition & 1 deletion .gitignore
Contributor:
@simonleandergrimm can you sync up with @harmonbhasin re naming here? I think he's going to rename the test directory anyway due to conflict with nf-test.

FWIW I'd prefer something like test/single/... and test/paired/... to keep the main directory clean.

Contributor:

Also what are these?

analysis_files/*
mgs-results/

Collaborator (author):

@harmonbhasin What are your thoughts regarding having a test dataset for paired-end and single-end data? Could you rejig your test dataset by e.g., simply keeping the forward reads?

@@ -9,4 +9,4 @@ test/.nextflow*
pipeline_report.txt

.nf-test/
.nf-test.log
.nf-test.log
91 changes: 74 additions & 17 deletions bin/generate_samplesheet.sh
Contributor:

@harmonbhasin to review changes to this file

Collaborator (author):

@harmonbhasin ping on this.

Collaborator:

I'm not sure if you're interested in this, but if you want to turn this script into python, I wouldn't be mad lol

Collaborator (author):

👀

@@ -1,5 +1,6 @@
#!/bin/bash


set -u
set -e

@@ -10,10 +11,28 @@ dir_path=""
forward_suffix=""
reverse_suffix=""
s3=0
single_end=0
output_path="samplesheet.csv" # Default output path
group_file="" # Optional parameter for the group file
group_across_illumina_lanes=false

# Function to print usage
print_usage() {
echo "Usage:"
echo "For paired-end reads:"
echo " $0 --dir_path <path> --forward_suffix <suffix> --reverse_suffix <suffix> [--s3] [--output_path <path>]"
echo "For single-end reads:"
echo " $0 --dir_path <path> --single_end [--s3] [--output_path <path>]"
echo
echo "Options:"
echo " --dir_path Directory containing FASTQ files"
echo " --forward_suffix Suffix for forward reads (required for paired-end only)"
echo " --reverse_suffix Suffix for reverse reads (required for paired-end only)"
echo " --single_end Flag for single-end data"
echo " --s3 Flag for S3 bucket access"
echo " --output_path Output path for samplesheet (default: samplesheet.csv)"
}

# Parse command-line arguments
while [[ $# -gt 0 ]]; do
case $1 in
@@ -33,10 +52,18 @@ while [[ $# -gt 0 ]]; do
s3=1
shift
;;
--single_end)
single_end=1
shift
;;
--output_path)
output_path="$2"
shift 2
;;
--help)
print_usage
exit 0
;;
--group_file) # Optional group file
group_file="$2"
shift 2
@@ -47,20 +74,22 @@ while [[ $# -gt 0 ]]; do
;;
*)
echo "Unknown option: $1"
print_usage
exit 1
;;
esac
done

# Check if all required parameters are provided
if [[ -z "$dir_path" || -z "$forward_suffix" || -z "$reverse_suffix" ]]; then
echo "Error: dir_path, forward_suffix, and reverse_suffix are required."
if [[ -z "$dir_path" || -z "$single_end" ]]; then
echo "Error: dir_path and single_end are required."
echo -e "\nUsage: $0 [options]"
echo -e "\nRequired arguments:"
echo -e " --dir_path <path> Directory containing FASTQ files"
echo -e " --forward_suffix <suffix> Suffix identifying forward reads, supports regex (e.g., '_R1_001' or '_1')"
echo -e " --reverse_suffix <suffix> Suffix identifying reverse reads, supports regex (e.g., '_R2_001' or '_2')"
echo -e " --single_end Flag for single-end data"
echo -e "\nOptional arguments:"
echo -e " --forward_suffix <suffix> When single_end is 0, suffix identifying forward reads, supports regex (e.g., '_R1_001' or '_1')"
echo -e " --reverse_suffix <suffix> When single_end is 0, suffix identifying reverse reads, supports regex (e.g., '_R2_001' or '_2')"
echo -e " --s3 Use if files are stored in S3 bucket"
echo -e " --output_path <path> Output path for samplesheet [default: samplesheet.csv]"
echo -e " --group_file <path> Path to group file for sample grouping [header column must have the names 'sample,group' in that order; additional columns may be included, however they will be ignored by the script]"
@@ -74,15 +103,28 @@ if $group_across_illumina_lanes && [[ -n "$group_file" ]]; then
exit 1
fi

if [ $single_end -eq 0 ]; then
# Paired-end validation
if [[ -z "$forward_suffix" || -z "$reverse_suffix" ]]; then
echo "Error: forward_suffix and reverse_suffix are required for paired-end reads."
print_usage
exit 1
fi
fi

# Display the parameters
echo "Parameters:"
echo "dir_path: $dir_path"
echo "forward_suffix: $forward_suffix"
echo "reverse_suffix: $reverse_suffix"
echo "single_end: $single_end"
echo "s3: $s3"
echo "output_path: $output_path"
echo "group_file: $group_file"
echo "group_across_illumina_lanes: $group_across_illumina_lanes"
if [ $single_end -eq 0 ]; then
echo "forward_suffix: $forward_suffix"
echo "reverse_suffix: $reverse_suffix"
fi



#### EXAMPLES ####
@@ -109,30 +151,45 @@
# Create a temporary file for the initial samplesheet
temp_samplesheet=$(mktemp)

echo "sample,fastq_1,fastq_2" > "$temp_samplesheet"
# Create header based on single_end flag
if [ $single_end -eq 0 ]; then
echo "sample,fastq_1,fastq_2" > "$temp_samplesheet"
else
echo "sample,fastq" > "$temp_samplesheet"
fi
echo "group_file: $group_file"


# Ensure dir_path ends with a '/'
if [[ "$dir_path" != */ ]]; then
dir_path="${dir_path}/"
fi

listing=0

# Get file listing based on s3 flag
if [ $s3 -eq 1 ]; then
listing=$(aws s3 ls ${dir_path} | awk '{print $4}')
else
listing=$(ls ${dir_path} | awk '{print $1}')
fi

echo "$listing" | grep "${forward_suffix}\.fastq\.gz$" | while read -r forward_read; do
sample=$(echo "$forward_read" | sed -E "s/${forward_suffix}\.fastq\.gz$//")
reverse_read=$(echo "$listing" | grep "${sample}${reverse_suffix}\.fastq\.gz$")
# If sample + reverse_suffix exists in s3_listing, then add to samplesheet
if [ -n "$reverse_read" ]; then
echo "$sample,${dir_path}${forward_read},${dir_path}${reverse_read}" >> "$temp_samplesheet"
fi
done
# Process files based on single_end flag
if [ $single_end -eq 0 ]; then
# Paired-end processing
echo "$listing" | grep "${forward_suffix}\.fastq\.gz$" | while read -r forward_read; do
sample=$(echo "$forward_read" | sed -E "s/${forward_suffix}\.fastq\.gz$//")
reverse_read=$(echo "$listing" | grep "${sample}${reverse_suffix}\.fastq\.gz$")
# If sample + reverse_suffix exists in s3_listing, then add to samplesheet
if [ -n "$reverse_read" ]; then
echo "$sample,${dir_path}${forward_read},${dir_path}${reverse_read}" >> "$temp_samplesheet"
fi
done
else
# Single-end processing - just process all fastq.gz files
echo "$listing" | grep "\.fastq\.gz$" | while read -r read_file; do
sample=$(echo "$read_file" | sed -E "s/\.fastq\.gz$//")
echo "$sample,${dir_path}${read_file}" >> "$temp_samplesheet"
done
fi

# Check if group file is provided
if [[ -n "$group_file" ]]; then
37 changes: 37 additions & 0 deletions configs/run_dev_se.config
@@ -0,0 +1,37 @@
/************************************************
| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
************************************************/

params {
mode = "run_dev_se"


// Directories
base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)

// Files
sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming

// Whether the underlying data is paired-end or single-end
single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true

// Numerical
grouping = false // Whether to group samples by 'group' column in samplesheet
n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
host_taxon = "vertebrate"
}

includeConfig "${projectDir}/configs/logging.config"
includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
process.queue = "simon-batch-queue" // AWS Batch job queue
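The config above infers `single_end` in Groovy by checking whether the samplesheet header contains a `fastq_2` column. The same check as a standalone sketch (hypothetical helper, not part of the pipeline):

```python
import csv

def is_single_end(samplesheet_path):
    # Paired-end samplesheets have a 'fastq_2' column; single-end ones do not.
    with open(samplesheet_path, newline="") as f:
        header = next(csv.reader(f))
    return "fastq_2" not in header
```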
3 changes: 3 additions & 0 deletions main.nf
@@ -1,6 +1,7 @@
include { RUN } from "./workflows/run"
include { RUN_VALIDATION } from "./workflows/run_validation"
include { INDEX } from "./workflows/index"
include { RUN_DEV_SE } from "./workflows/run_dev_se"

workflow {
if (params.mode == "index") {
@@ -9,6 +10,8 @@ workflow {
RUN()
} else if (params.mode == "run_validation") {
RUN_VALIDATION()
} else if (params.mode == "run_dev_se") {
RUN_DEV_SE()
}
}

24 changes: 20 additions & 4 deletions modules/local/fastp/main.nf
@@ -5,8 +5,13 @@ process FASTP {
// reads is a list of two files: forward/reverse reads
tuple val(sample), path(reads)
path(adapters)
val single_end
output:
tuple val(sample), path("${sample}_fastp_{1,2}.fastq.gz"), emit: reads
tuple val(sample), path({
single_end ?
"${sample}_fastp.fastq.gz" :
"${sample}_fastp_{1,2}.fastq.gz"
}), emit: reads
tuple val(sample), path("${sample}_fastp_failed.fastq.gz"), emit: failed
tuple val(sample), path("${sample}_fastp.{json,html}"), emit: log
shell:
@@ -19,13 +24,23 @@
*/
'''
# Define paths and subcommands
o1=!{sample}_fastp_1.fastq.gz
o2=!{sample}_fastp_2.fastq.gz
of=!{sample}_fastp_failed.fastq.gz
oj=!{sample}_fastp.json
oh=!{sample}_fastp.html
ad=!{adapters}
io="--in1 !{reads[0]} --in2 !{reads[1]} --out1 ${o1} --out2 ${o2} --failed_out ${of} --html ${oh} --json ${oj} --adapter_fasta ${ad}"
if [ $(echo "!{reads}" | wc -w) -eq 2 ]; then
echo "Processing paired-end reads"
o1=!{sample}_fastp_1.fastq.gz
o2=!{sample}_fastp_2.fastq.gz
io="--in1 !{reads[0]} --in2 !{reads[1]} --out1 ${o1} --out2 ${o2} --failed_out ${of} --html ${oh} --json ${oj} --adapter_fasta ${ad}"
else
echo "Processing single-end reads"
o=!{sample}_fastp.fastq.gz
io="--in1 !{reads[0]} --out1 ${o} --failed_out ${of} --html ${oh} --json ${oj} --adapter_fasta ${ad}"
fi
par="--cut_front --cut_tail --correction --detect_adapter_for_pe --trim_poly_x --cut_mean_quality 25 --average_qual 25 --qualified_quality_phred 20 --verbose --dont_eval_duplication --thread !{task.cpus} --low_complexity_filter"


par="--cut_front --cut_tail --correction --detect_adapter_for_pe --trim_poly_x --cut_mean_quality 20 --average_qual 20 --qualified_quality_phred 20 --verbose --dont_eval_duplication --thread !{task.cpus} --low_complexity_filter"
# Execute
fastp ${io} ${par}
@@ -66,3 +81,4 @@ process FASTP_NOTRIM {
fastp ${io} ${par}
'''
}

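The FASTP module above branches on the number of input files to build its `io` string. The same argument-assembly logic as a sketch (a hypothetical helper mirroring the shell block, not the pipeline's code):

```python
def fastp_io_args(sample, reads, adapters):
    """Build the fastp I/O argument list based on read count (sketch)."""
    # Shared outputs: failed reads, HTML/JSON reports, adapter FASTA
    common = ["--failed_out", f"{sample}_fastp_failed.fastq.gz",
              "--html", f"{sample}_fastp.html",
              "--json", f"{sample}_fastp.json",
              "--adapter_fasta", adapters]
    if len(reads) == 2:  # paired-end: two inputs, two outputs
        return ["--in1", reads[0], "--in2", reads[1],
                "--out1", f"{sample}_fastp_1.fastq.gz",
                "--out2", f"{sample}_fastp_2.fastq.gz"] + common
    # single-end: one input, one output
    return ["--in1", reads[0], "--out1", f"{sample}_fastp.fastq.gz"] + common
```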
5 changes: 3 additions & 2 deletions modules/local/summarizeMultiqcPair/main.nf
@@ -4,10 +4,11 @@ process SUMMARIZE_MULTIQC_PAIR {
label "single"
input:
tuple val(stage), val(sample), path(multiqc_data)
val(single_end)
output:
tuple path("${stage}_${sample}_qc_basic_stats.tsv.gz"), path("${stage}_${sample}_qc_adapter_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_base_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_sequence_stats.tsv.gz")
shell:
'''
summarize-multiqc-pair.R -i !{multiqc_data} -s !{stage} -S !{sample} -o ${PWD}
summarize-multiqc-pair.R -i !{multiqc_data} -s !{stage} -S !{sample} -r !{single_end} -o ${PWD}
'''
}
}
@@ -13,12 +13,24 @@ option_list = list(
help="Stage descriptor."),
make_option(c("-S", "--sample"), type="character", default=NULL,
help="Sample ID."),
make_option(c("-r", "--single_end"), type="character", default=FALSE,
help="Single-end flag."),
make_option(c("-o", "--output_dir"), type="character", default=NULL,
help="Path to output directory.")
)
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

# Convert single_end from string to logical
if (opt$single_end == "true") {
single_end <- TRUE
} else if (opt$single_end == "false") {
single_end <- FALSE
} else {
stop("single_end must be 'true' or 'false'")
}


# Set input paths
multiqc_json_path <- file.path(opt$input_dir, "multiqc_data.json")
fastqc_tsv_path <- file.path(opt$input_dir, "multiqc_fastqc.txt")
@@ -57,8 +69,19 @@ basic_info_fastqc <- function(fastqc_tsv, multiqc_json){
tab_tsv <- fastqc_tsv %>%
mutate(n_bases_approx = process_n_bases(`Total Bases`)) %>%
select(n_bases_approx, per_base_sequence_quality:adapter_content) %>%
summarize_all(function(x) paste(x, collapse="/")) %>%
mutate(n_bases_approx = n_bases_approx %>% str_split("/") %>% sapply(as.numeric) %>% colSums())
summarize_all(function(x) paste(x, collapse="/"))

if (single_end) {
tab_tsv <- tab_tsv %>%
mutate(n_bases_approx = n_bases_approx %>% as.numeric)
} else {
tab_tsv <- tab_tsv %>%
mutate(n_bases_approx = n_bases_approx %>%
str_split("/") %>%
sapply(as.numeric) %>%
colSums())
}

# Combine
return(bind_cols(tab_json, tab_tsv))
}
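In the R change above, paired-end FastQC rows are collapsed by joining per-file values with "/", so approximate base counts must be split and summed, while single-end rows carry a single number. A compact sketch of that branch (illustrative, not the repo's code):

```python
def total_bases(joined_value, single_end):
    # Single-end: one value per row, e.g. "100".
    # Paired-end: one value per mate joined with "/", e.g. "100/120".
    if single_end:
        return float(joined_value)
    return sum(float(v) for v in joined_value.split("/"))
```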
@@ -86,7 +109,7 @@ extract_adapter_data <- function(multiqc_json){
extract_per_base_quality_single <- function(per_base_quality_dataset){
# Convert a single JSON per-base-quality dataset into a tibble
data <- lapply(1:length(per_base_quality_dataset$name), function(n)
per_base_quality_dataset$data[[n]] %>% as.data.frame %>%
per_base_quality_dataset$data[[n]] %>% as.data.frame %>%
mutate(file=per_base_quality_dataset$name[n])) %>%
bind_rows() %>% as_tibble %>%
rename(position=V1, mean_phred_score=V2)
@@ -103,7 +126,7 @@ extract_per_base_quality <- function(multiqc_json){
extract_per_sequence_quality_single <- function(per_sequence_quality_dataset){
# Convert a single JSON per-sequence-quality dataset into a tibble
data <- lapply(1:length(per_sequence_quality_dataset$name), function(n)
per_sequence_quality_dataset$data[[n]] %>% as.data.frame %>%
per_sequence_quality_dataset$data[[n]] %>% as.data.frame %>%
mutate(file=per_sequence_quality_dataset$name[n])) %>%
bind_rows() %>% as_tibble %>%
rename(mean_phred_score=V1, n_sequences=V2)
24 changes: 19 additions & 5 deletions modules/local/truncateConcat/main.nf
Contributor:
More so than for FASTP, I think this would be better done as a single process with a conditional statement, based either on a boolean paired parameter or (better) on the length of reads.

You could even just do a for loop iterating over every file in reads.

Collaborator (author):

Done

@@ -5,16 +5,30 @@ process TRUNCATE_CONCAT {
input:
tuple val(sample), path(reads)
val n_reads
val single_end
output:
tuple val(sample), path("${sample}_trunc_{1,2}.fastq.gz"), emit: reads

tuple val(sample), path({
single_end ?
"${sample}_trunc.fastq.gz" :
"${sample}_trunc_{1,2}.fastq.gz"
}), emit: reads
shell:
'''
echo "Number of output reads: !{n_reads}"
n_lines=$(expr !{n_reads} \\* 4)
echo "Number of output lines: ${n_lines}"
o1=!{sample}_trunc_1.fastq.gz
o2=!{sample}_trunc_2.fastq.gz
zcat !{reads[0]} | head -n ${n_lines} | gzip -c > ${o1}
zcat !{reads[1]} | head -n ${n_lines} | gzip -c > ${o2}
if [ $(echo "!{reads}" | wc -w) -eq 2 ]; then
echo "Processing paired-end reads"
o1=!{sample}_trunc_1.fastq.gz
o2=!{sample}_trunc_2.fastq.gz
zcat !{reads[0]} | head -n ${n_lines} | gzip -c > ${o1}
zcat !{reads[1]} | head -n ${n_lines} | gzip -c > ${o2}
else
echo "Processing single-end reads"
o=!{sample}_trunc.fastq.gz
zcat !{reads[0]} | head -n ${n_lines} | gzip -c > ${o}
fi

'''
}
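The truncation in this module is `zcat | head -n $((n_reads * 4)) | gzip -c` per file. The same operation as a self-contained sketch (hypothetical helper, assuming well-formed 4-line FASTQ records):

```python
import gzip

def truncate_fastq_gz(in_path, out_path, n_reads):
    """Keep the first n_reads FASTQ records (4 lines each) of a gzipped file."""
    n_lines = n_reads * 4
    with gzip.open(in_path, "rt") as src, gzip.open(out_path, "wt") as dst:
        for i, line in enumerate(src):
            if i >= n_lines:
                break
            dst.write(line)
```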