Added new template, fixed mtbseq template #159

Merged 28 commits on Sep 29, 2023.

Commits
c654c7b
primer for new service: genomeev
GuilleGorines Sep 25, 2023
6aa9d2f
step 1 on genomeev: pikavirus
GuilleGorines Sep 25, 2023
06873ad
added raw step 2 to genomeev: viralrecon
GuilleGorines Sep 25, 2023
ac48e88
added lablog for step 3: mag
GuilleGorines Sep 25, 2023
d05ff0c
updated lablog for viralrecon (dir naming)
GuilleGorines Sep 25, 2023
fb0f44a
added lablog for step 4:BLAST
GuilleGorines Sep 25, 2023
0b87a8b
reduced amount of scripts on mtbseq (2 now!)
GuilleGorines Sep 25, 2023
96d9181
just a slight note on the lablog
GuilleGorines Sep 25, 2023
27c9a80
polished step 4
GuilleGorines Sep 25, 2023
f1bda84
update readme with instructions!
GuilleGorines Sep 25, 2023
f7cfaa7
added doc for MAG
GuilleGorines Sep 25, 2023
02bfe18
added DOC for viralrecon
GuilleGorines Sep 25, 2023
2485870
modified typos
GuilleGorines Sep 25, 2023
893e988
multiqc lablog
GuilleGorines Sep 27, 2023
f28f921
lablog for results
GuilleGorines Sep 27, 2023
9d2dd39
OCD
GuilleGorines Sep 27, 2023
3891762
update to the blast template!
GuilleGorines Sep 27, 2023
53aa665
minor changes: extension, typos
GuilleGorines Sep 27, 2023
81bc415
removed SARS docs
GuilleGorines Sep 27, 2023
baaa4aa
Typo changed by @Shettland
GuilleGorines Sep 27, 2023
162f7a6
Node management by @Shettland
GuilleGorines Sep 27, 2023
3b752ec
Typo #2 changed by @Shettland
GuilleGorines Sep 27, 2023
4703f14
removed readme in results for genomeev
GuilleGorines Sep 27, 2023
f807ecf
removed unnecessary code
GuilleGorines Sep 27, 2023
e90b38f
Added notes to BLAST
GuilleGorines Sep 27, 2023
97405f3
unnecessary yaml call
GuilleGorines Sep 28, 2023
ed7ea2a
updated BLAST (both are identical. ik, but its temporary)
GuilleGorines Sep 28, 2023
209e747
results lablog working!
GuilleGorines Sep 29, 2023
98 changes: 71 additions & 27 deletions bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog
@@ -1,31 +1,75 @@
# module load BLAST+/2.11.0-gompi-2020b

# header:
# 1: stitle
# 2: qaccver
# 3: saccver
# 4: pident
# 5: length
# 6: mismatch
# 7: gapopen
# 8: qstart
# 9: qend
# 10: sstart
# 11: send
# 12: evalue
# 13: bitscore
# 14: slen
# 15: qlen
# 16: qcovs
# 17: %cgAligned
# 18: %refCovered
# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6

scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
mkdir logs
cat ../samples_id.txt | xargs -I %% echo "mkdir %%; cd %%; ln -s ../../*ASSEMBLY/03-assembly/unicycler/%%.fasta .; cd -" | bash
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 376530M --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db /data/bi/references/BLAST_dbs/nt_20211025/nt -query %%/%%.fasta -out %%/%%_blast.tab -outfmt '6 qseqid stitle std slen qlen qcovs' &" > _01_blast.sh
cat ../samples_id.txt | xargs -I %% echo "awk 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print \$0,\$6/\$16,\$6/\$15}' %%/%%_blast.tab | awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print samplename,\$0}' > %%/%%_blast_filt.tab" > _02_filter_blast.sh
echo -e "echo \"samplename\tcontigname\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tslen\tqlen\tqcovs\t%cgAligned\t%refCovered\" > header" > _03_gather_results.sh
echo "cat header */*blast_filt.tab > all_samples_filtered_BLAST_results.tab" >> _03_gather_results.sh
echo "rm header" >> _03_gather_results.sh

# Location of assemblies to a variable so it only has to be changed here
LOCATION=../*/*/assembly/*/*
# Other databases:
# /data/bi/references/BLAST_dbs/nt_20211025/nt
BLAST_DATABASE="/data/bi/references/virus/BLAST/all_virus.fasta"

# if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression)
# if there are contigs and no scaffolds, uncompress the contigs as scaffolds in its dir
echo "Samples that did not generate scaffolds:" > noscaffold.txt
cat ../samples_id.txt | while read in; do
mkdir ${in}
# ls piped to wc -l will output 0 if there is no scaffolds file
# NOTE: change extension and location at will
# NOTE2: zcat is only used in case of gzipped files, use a cp or ln -s if needed
if [ $(ls ${LOCATION}/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then
zcat ${LOCATION}/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa
else
# Note assemblies that did not make a scaffold
zcat ${LOCATION}/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa
echo ${in} >> noscaffold.txt
fi
done

# NOTE3: change the -query flag to meet your requirements
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db ${BLAST_DATABASE} -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh

# Filtering criteria:
# %refCovered > 0.7
# ref is not a phage (stitle !~ /phage/)
# ref longer than 200 bp (slen > 200)

# First awk: create the full table; second awk: filter it
cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh
echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh
echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh
cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh
echo "rm header" >> _03_gather_results_add_header.sh

# NOTES FOR FILTERING
#
# subject = reference
#
# COLS GENERATED BY US:
# 1: samplename
# GENERATED BY BLAST
# 2: contigname - qseqid
# 3: stitle
# 4: qaccver
# 5: saccver
# 6: pident
# 7: length (of alignment)
# 8: mismatch
# 9: gaps
# 10: qstart
# 11: qend
# 12: sstart
# 13: send
# 14: evalue
# 15: bitscore
# 16: ref len - slen
# 17: query len - qlen
# 18: qcovs
# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# GENERATED BY US:
# 19: %queryAligned: (length-gaps)/qlen (gaps are subtracted; otherwise this value could sometimes exceed 1)
# 20: %refCovered: length/slen

# conda activate 2excel
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh
echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh
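As a sanity check, the two derived columns from the filtering step above (%queryAligned and %refCovered) can be spot-checked on a single made-up outfmt-6 row; the values below are illustrative, not real pipeline output:

```shell
# One fake BLAST row with length=1000 ($6), gaps=10 ($8),
# slen=1500 ($15), qlen=1200 ($16); fields are tab-separated.
printf 'contig1\tsome virus\tq1\ts1\t99.0\t1000\t5\t10\t1\t1000\t1\t1000\t0.0\t1800\t1500\t1200\t90\n' |
  awk 'BEGIN{OFS=FS="\t"}{print ($6-$8)/$16, $6/$15}'
# %queryAligned = (1000-10)/1200 = 0.825; %refCovered = 1000/1500 = 0.666667
```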
38 changes: 38 additions & 0 deletions bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
@@ -0,0 +1,38 @@
# module load Nextflow/21.10.6 singularity

ln -s ../00-reads .
ln -s ../samples_id.txt .
echo "sample,fastq_1,fastq_2" > samplesheet.csv
cat samples_id.txt | while read in; do echo "${in},00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz"; done >> samplesheet.csv


scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

cat <<EOF > pikavirus.sbatch
#!/bin/sh
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 2
#SBATCH --mem 4G
#SBATCH --time 4:00:00
#SBATCH --partition middle_idx
#SBATCH --output $(date '+%Y%m%d')_pikavirus01.log
#SBATCH --chdir $scratch_dir

export NXF_OPTS="-Xms500M -Xmx4G"

nextflow run /scratch/bi/pipelines/PikaVirus/main.nf \\
-c ../../DOC/hpc_slurm_pikavirus.config \\
--input samplesheet.csv \\
--kraken_scouting false \\
--virus true \\
--bacteria false \\
--fungi false \\
--kaiju false \\
--mash_winner_strategy true \\
--mash_identitity_threshold 0.9 \\
--mash_shared_hashes_threshold 0.01 \\
--mash_pvalue_threshold 0.05 \\
-resume
EOF

echo "sbatch pikavirus.sbatch" > _01_nf_pikavirus.sh
@@ -0,0 +1,24 @@
# module load MultiQC
cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done

scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

cat <<EOF > multiqc.sbatch
#!/bin/sh
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 2
#SBATCH --mem 4G
#SBATCH --time 00:30:00
#SBATCH --partition short_idx
#SBATCH --output $(date '+%Y%m%d')_multiqc.log
#SBATCH --chdir $scratch_dir

export NXF_OPTS="-Xms500M -Xmx4G"

multiqc -d . --config multiqc_config.yaml

EOF

echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh

echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh
@@ -0,0 +1,13 @@
extra_fn_clean_exts:
- _R1
- _R2
- .R1
- .R2
- .sort
- _sort
- .stats
- _bamstat
- _align
- .txt
report_comment: >
This report has been generated by BU-ISCIII
30 changes: 30 additions & 0 deletions bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/lablog
@@ -0,0 +1,30 @@
ln -s ../00-reads .
ln -s ../samples_id.txt .

#module load Nextflow
#module load singularity

scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

cat <<EOF > mag.sbatch
#!/bin/sh
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 2
#SBATCH --mem 4G
#SBATCH --time 2:00:00
#SBATCH --partition middle_idx
#SBATCH --output $(date '+%Y%m%d')_mag.log
#SBATCH --chdir $scratch_dir

export NXF_OPTS="-Xms500M -Xmx4G"

nextflow run /data/bi/pipelines/nf-core-mag-2.1.1/workflow/main.nf \\
-c ../../DOC/mag.config \\
--input '00-reads/*_R{1,2}.fastq.gz' \\
--outdir $(date '+%Y%m%d')_mag \\
--kraken2_db /data/bi/references/kraken/minikraken_8GB_20200312.tgz \\
--skip_busco --skip_spades --skip_spadeshybrid --skip_megahit --skip_prodigal --skip_binning \\
-resume
EOF

echo "sbatch mag.sbatch" > _01_run_mag.sh
75 changes: 75 additions & 0 deletions bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog
@@ -0,0 +1,75 @@
# module load BLAST+/2.11.0-gompi-2020b

scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
mkdir logs

# Location of assemblies to a variable so it only has to be changed here
LOCATION=../*/*/assembly/*/*
# Other databases:
# /data/bi/references/BLAST_dbs/nt_20211025/nt
BLAST_DATABASE="/data/bi/references/virus/BLAST/all_virus.fasta"

# if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression)
# if there are contigs and no scaffolds, uncompress the contigs as scaffolds in its dir
echo "Samples that did not generate scaffolds:" > noscaffold.txt
cat ../samples_id.txt | while read in; do
mkdir ${in}
# ls piped to wc -l will output 0 if there is no scaffolds file
# NOTE: change extension and location at will
# NOTE2: zcat is only used in case of gzipped files, use a cp or ln -s if needed
if [ $(ls ${LOCATION}/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then
zcat ${LOCATION}/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa
else
# Note assemblies that did not make a scaffold
zcat ${LOCATION}/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa
echo ${in} >> noscaffold.txt
fi
done

# NOTE3: change the -query flag to meet your requirements
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db ${BLAST_DATABASE} -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh

# Filtering criteria:
# %refCovered > 0.7
# ref is not a phage (stitle !~ /phage/)
# ref longer than 200 bp (slen > 200)

# First awk: create the full table; second awk: filter it
cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh
echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh
echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh
cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh
echo "rm header" >> _03_gather_results_add_header.sh

# NOTES FOR FILTERING
#
# subject = reference
#
# COLS GENERATED BY US:
# 1: samplename
# GENERATED BY BLAST
# 2: contigname - qseqid
# 3: stitle
# 4: qaccver
# 5: saccver
# 6: pident
# 7: length (of alignment)
# 8: mismatch
# 9: gaps
# 10: qstart
# 11: qend
# 12: sstart
# 13: send
# 14: evalue
# 15: bitscore
# 16: ref len - slen
# 17: query len - qlen
# 18: qcovs
# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
# GENERATED BY US:
# 19: %queryAligned: (length-gaps)/qlen (gaps are subtracted; otherwise this value could sometimes exceed 1)
# 20: %refCovered: length/slen

# conda activate 2excel
cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh
echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh
25 changes: 25 additions & 0 deletions bu_isciii/templates/genomeev/ANALYSIS/README
@@ -0,0 +1,25 @@
This document should be read as INSTRUCTIONS to perform the "genomeev" service, as created on 25 Sep 2023.
The steps to follow to perform this service (which, computationally speaking, can be completed fairly quickly) are the following:

- Load the samples into the RAW directory (manually or automatically using the BU-ISCIII tools)

- Copy all files from this template (manually or automatically, make sure all files are there)

- Copy the whole service folder to scratch_tmp (at least, we had to do that when this template was created)

- The first part is PikaVirus. Run PikaVirus by executing lablog_pikavirus, then enter the PikaVirus folder and execute the lablog (note that you need a samples_id.txt file; if it was not created automatically, it has to be created manually), load the modules and launch the pipeline. Feel free to change anything in PikaVirus through the command line or through the config (the config is recommended so that any changes can be tracked). NOTE: wait for PikaVirus to end before you continue. Do something else in the meantime.

- Once PikaVirus has ended, we have to dive into the results, particularly "all_samples_virus_table.tsv" in the results dir, to find the most abundant virus. I personally recommend opening this file in Excel or similar and finding the virus that repeats the most across the samples using a formula such as COUNTIF(range, value). Make sure you are working with a genome and not with just a fragment of it.
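The same count can also be done on the command line. The column number for the virus name below is an assumption; check the real header of all_samples_virus_table.tsv and adjust it:

```shell
# Count how many samples report each virus, most frequent first.
# Assumes the virus name is in column 2 (tab-separated) -- adjust
# cut -f2 to match the actual table layout.
cut -f2 all_samples_virus_table.tsv | tail -n +2 | sort | uniq -c | sort -rn | head
```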

- Download said assembly locally, both its fna and its gff file. Make sure you store both files with the same name and different extensions. The name SHOULD include the virus name and the GCA/GCF code so it is easier to identify (example: RotavirusG8_GCA_002669555_1.fasta; RotavirusG8_GCA_002669555_1.gff). Then, place it in the corresponding directory inside "/data/bi/references/virus".

- Once the files have been placed, we have to modify the samples_ref.txt file.
First column will be exactly the same as the samples_id.txt file.
Second column will be the name of the assembly we downloaded in the previous step (example: RotavirusG8_GCA_002669555_1). Make sure this value is exactly the same in all rows.
Third column will be the name of the host (typically "human", but it can be changed depending on the situation).
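Since the second and third columns are constant, samples_ref.txt can be generated from samples_id.txt. The assembly name and host below are placeholders taken from the example above, and the tab separator is an assumption; replace them with the real values and check the expected delimiter:

```shell
# Build samples_ref.txt: <sample> <assembly> <host>, one row per sample.
ASSEMBLY="RotavirusG8_GCA_002669555_1"  # placeholder: your downloaded reference
HOST="human"                            # placeholder: your host
while read -r sample; do
  printf '%s\t%s\t%s\n' "$sample" "$ASSEMBLY" "$HOST"
done < samples_id.txt > samples_ref.txt
```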

- Execute the lablog_viralrecon. The ANALYSIS02 directory will be created and filled with the corresponding scripts. Load the modules and launch viralrecon.

- Once it has ended, it is time for MAG. Go to the ANALYSIS03 directory, execute the lablog, load the modules and run MAG with the specified params.

- Last, but not least, go to the ANALYSIS04 directory and run the lablog. It will check the assembly step in viralrecon and store the names of the samples that did not assemble to the scaffold level in the noscaffold.txt file. Run the numbered scripts in order after loading the corresponding module, and that should be about everything there is to this service!
@@ -0,0 +1 @@
i=1; find */variants/ivar/consensus/ -type d -name 'bcftools' | while read in; do echo "python ./percentajeNs.py ${in} %Ns_${i}.tab"; i=$((i+1)); done > _03_run_percentage_Ns.sh; echo "cat %Ns_* > %Ns.tab" >> _03_run_percentage_Ns.sh; echo "rm %Ns_*" >> _03_run_percentage_Ns.sh