Skip to content

Commit

Permalink
Merge pull request #187 from Daniel-VM/hotfix
Browse files Browse the repository at this point in the history
New Bacterial Genome Assembly Template: Short, Long, and Hybrid Sequences
  • Loading branch information
saramonzon authored Jan 31, 2024
2 parents fb17677 + 904f639 commit 7af8c12
Show file tree
Hide file tree
Showing 6 changed files with 372 additions and 54 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ Code contributions to the hotfix:
- [Jaime Ozaez](https://github.com/jaimeozaez)
- [Sara Monzón](https://github.com/saramonzon)
- [Sarai Varona](https://github.com/svarona)
- [Daniel Valle](https://github.com/Daniel-VM)


### Template fixes and updates
Expand All @@ -67,6 +68,7 @@ Code contributions to the hotfix:
- Small changes to `buisciii_tools/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results` for blast and new excel_generator.py
- Introduced better error handling in excel_generator.py. Now it can also be used for single files
- Brought back `PASS_ONLY` to exometrio's `exomiser_configfile.yml`
- [#187](https://github.com/BU-ISCIII/buisciii-tools/pull/187) - Added new template for bacterial assembly, allowing for short, long and hybrid assembly.

### Modules

Expand Down
136 changes: 105 additions & 31 deletions bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
Original file line number Diff line number Diff line change
@@ -1,31 +1,97 @@
echo "Do you want to save trimmed reads in outdir?"
# print_color MESSAGE [COLOR]
# Write MESSAGE to stdout followed by a newline. When COLOR is "red", "green"
# or "blue", MESSAGE is wrapped in the matching bold ANSI escape sequence and
# emitted with `echo -e` (so backslash escapes inside MESSAGE are interpreted).
# For any other COLOR, or none, MESSAGE is printed unchanged with plain `echo`.
print_color() {
    local ansi_code
    case "$2" in
        "red")   ansi_code=31 ;;
        "green") ansi_code=32 ;;
        "blue")  ansi_code=34 ;;
        *)
            # Unknown or missing color: no escape sequences, no -e.
            echo "$1"
            return
            ;;
    esac
    echo -e "\e[1;${ansi_code}m$1\e[0m"
}

read -p 'Write y or n: ' trimmed
# prompt_with_color LABEL PROMPT
# Read one line from stdin into the global variable `response`.
# The prompt shown by `read -p` is LABEL rendered in blue by print_color,
# followed by a space and PROMPT.
# Globals:   response (written)
# Arguments: $1 - label text, passed to print_color as a single argument
#            $2 - plain text appended after the colored label
# Returns:   exit status of `read` (non-zero on end of input)
prompt_with_color() {
    # "$1" must stay quoted: unquoted, a multi-word label is split and only
    # its first word reaches print_color (the second word becomes the color).
    # -r keeps backslashes typed by the user literal.
    read -r -p "$(print_color "$1" 'blue') $2" response
}

TRIMMED=$(echo "${trimmed}" | tr '[:upper:]' '[:lower:]')
# Select assembly mode
# The user picks one entry from a numbered menu; the choice is stored in
# ASSEMBLY_MODE and determines ASSEMBLER ("short" -> unicycler,
# "long"/"hybrid" -> dragonflye). An invalid answer leaves ASSEMBLY_MODE
# empty, prints an error and asks again.
assembly_options=("short" "long" "hybrid")
print_color "Indicate the preferred assembly mode:" 'blue'
select ASSEMBLY_MODE in "${assembly_options[@]}"; do
    if [ -z "$ASSEMBLY_MODE" ]; then
        print_color "Invalid input. Please select a valid option." 'red'
        continue
    fi
    case "$ASSEMBLY_MODE" in
        "short")
            ASSEMBLER="unicycler"
            ;;
        "long" | "hybrid")
            ASSEMBLER="dragonflye"
            ;;
    esac
    break
done
print_color "Selected assembly mode: $ASSEMBLY_MODE" 'green'

if [ "$TRIMMED" == "yes" ] || [ "$TRIMMED" == "y" ]
then SAVETRIMMED="True"
else SAVETRIMMED="False"
fi
# Select whether to save trimmed reads
# The user picks "Yes" or "No" from a numbered menu (stored in TRIMMED).
# SAVETRIMMED is set to "true" for "Yes" and "false" otherwise; it is the
# value later given to --save_trimmed. An invalid answer prints an error and
# asks again.
trim_options=("Yes" "No")
print_color "Do you want to save trimmed reads in outdir?" 'blue'
select TRIMMED in "${trim_options[@]}"; do
    if [[ -n "$TRIMMED" ]]; then
        # `select` can only yield an entry of trim_options, so "Yes" is the
        # single affirmative value that needs checking.
        if [[ "$TRIMMED" == "Yes" ]]; then
            SAVETRIMMED="true"
        else
            SAVETRIMMED="false"
        fi
        break
    else
        print_color "Invalid input. Please select a valid option." 'red'
    fi
done
print_color "Selected trimmed file option: $TRIMMED save trimmed" 'green'

read -p 'Write + or -: ' grammtype
# Select Prokka gram type
gram_options=("+" "-" "skip")

if [ "$grammtype" != "-" ] && [ "$grammtype" != "+" ]
then
echo "The given param: $grammtype does not match any of the accepted params ('+' or '-')"
exit 1
fi
# Ask for the Gram type to hand to Prokka. The user picks one entry of
# gram_options from a numbered menu (stored in GRAMTYPE). For "+" or "-",
# PROKKA_ARGS holds the extra pipeline option; for "skip" it stays empty.
# Start from a known-empty value so that "skip" cannot pick up a PROKKA_ARGS
# inherited from the calling environment.
PROKKA_ARGS=""
print_color "Is gram positive or negative?" 'blue'
select GRAMTYPE in "${gram_options[@]}"; do
    if [[ -z "$GRAMTYPE" ]]; then
        print_color "Invalid input. Please select a valid option." 'red'
        continue
    fi
    if [[ "$GRAMTYPE" != "skip" ]]; then
        # The inner single quotes are stored literally: the value is pasted
        # as text into the generated assembly.sbatch, where they group
        # "--gram <type>" into one argument.
        PROKKA_ARGS="--prokka_args '--gram ${GRAMTYPE}'"
    fi
    break
done

print_color "Selected Prokka gram type: $GRAMTYPE" 'green'


# SETUP INPUT SAMPLE SHEET
ln -s ../00-reads .
ln -s ../samples_id.txt .

echo "sample,fastq_1,fastq_2" > samplesheet.csv
cat samples_id.txt | while read in; do echo "${in},00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz"; done >> samplesheet.csv
# Build samplesheet.csv: a header line plus one row per sample ID read from
# samples_id.txt. Which columns are filled depends on ASSEMBLY_MODE:
#   short  -> R1 and R2 (paired reads)
#   long   -> LongFastQ only
#   hybrid -> R1, R2 and LongFastQ
# Columns that do not apply are written as NA.
echo "ID,R1,R2,LongFastQ,Fast5,GenomeSize" > samplesheet.csv
# `|| [[ -n ... ]]` keeps the last ID when the file lacks a final newline.
while read -r sample_id || [[ -n "$sample_id" ]]; do
    # A blank line would otherwise yield a row with an empty ID.
    [[ -n "$sample_id" ]] || continue
    case "$ASSEMBLY_MODE" in
        "short")
            echo "${sample_id},00-reads/${sample_id}_R1.fastq.gz,00-reads/${sample_id}_R2.fastq.gz,NA,NA,NA"
            ;;
        "long")
            echo "${sample_id},NA,NA,00-reads/${sample_id}.fastq.gz,NA,NA"
            ;;
        "hybrid")
            echo "${sample_id},00-reads/${sample_id}_R1.fastq.gz,00-reads/${sample_id}_R2.fastq.gz,00-reads/${sample_id}.fastq.gz,NA,NA"
            ;;
        *)
            # The loop's stdout is the CSV, so diagnostics go to stderr to
            # avoid writing the message into samplesheet.csv as a row.
            echo "Format not recognized for the sample : ${sample_id}." >&2
            ;;
    esac
done < samples_id.txt >> samplesheet.csv

#module load Nextflow/21.10.6 singularity
# Current directory with every "/data/bi/scratch_tmp" replaced by "/scratch";
# used as the --chdir of the generated sbatch job. "$PWD" is quoted and
# printed with printf so that whitespace or glob characters in the path
# survive unchanged.
scratch_dir=$(printf '%s\n' "$PWD" | sed 's#/data/bi/scratch_tmp#/scratch#g')

cat <<EOF > assembly.sbatch
Expand All @@ -38,20 +104,28 @@ cat <<EOF > assembly.sbatch
#SBATCH --output $(date '+%Y%m%d')_assembly01.log
#SBATCH --chdir $scratch_dir

export NXF_OPTS="-Xms500M -Xmx4G"

nextflow run /scratch/bi/pipelines/BU_ISCIII-bacterial-assembly/main.nf \\
-c ../../DOC/hpc_slurm_assembly.config \\
--input samplesheet.csv \\
--outdir ./ \\
--cut_mean_quality 20 \\
--qualified_quality_phred 20 \\
--gram ${grammtype} \\
--reference_outdir ../../REFERENCES \\
--save_trimmed ${SAVETRIMMED} \\
--kmerfinder_bacteria_database '/data/bi/references/kmerfinder/20190108_stable_dirs/bacteria' \\
--reference_ncbi_bacteria '/data/bi/references/bacteria/latest_db/assembly_summary_bacteria.txt' \\
-resume
# module load Nextflow/23.10.0 singularity
export NXF_OPTS="-Xms500M -Xmx8G"

nextflow run /data/bi/pipelines/nf-core-bacass/main.nf \\
-c ../../DOC/hpc_slurm_assembly.config \\
-profile singularity \\
--input samplesheet.csv \\
--outdir ./ \\
--assembly_type ${ASSEMBLY_MODE} \\
--assembler ${ASSEMBLER} \\
--skip_polish true \\
--save_trimmed ${SAVETRIMMED} \\
--fastp_args '--qualified_quality_phred 20 --cut_mean_quality 20' \\
--skip_kraken2 true \\
--skip_kmerfinder false \\
--kmerfinderdb /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria \\
--ncbi_assembly_metadata /data/bi/references/bacteria/20191212/assembly_summary_bacteria.txt \\
${PROKKA_ARGS} \\
-resume

EOF

# Write the one-line launcher script that submits assembly.sbatch with sbatch.
echo "sbatch assembly.sbatch" > _01_nf_assembly.sh


23 changes: 20 additions & 3 deletions bu_isciii/templates/assembly/ANALYSIS/lablog_assembly
Original file line number Diff line number Diff line change
@@ -1,4 +1,21 @@
mkdir -p 00-reads
cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd -
cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd -
mv ANALYSIS01_ASSEMBLY $(date '+%Y%m%d')_ANALYSIS01_ASSEMBLY
cd 00-reads

# For every sample ID listed in ../samples_id.txt, symlink its reads from
# ../../RAW into the current directory under normalised names:
#   - any RAW file of the sample whose name contains _R1/_R2 (short reads)
#     -> <sample>_R1.fastq.gz and <sample>_R2.fastq.gz
#   - any RAW file of the sample without _R1/_R2 (long reads)
#     -> <sample>.fastq.gz, pointing at ../../RAW/<sample>.fastq.gz
# NOTE(review): "${sample}"* also matches IDs sharing a prefix (e.g. S1 and
# S10); `ln` then receives several sources and fails. Confirm that sample
# IDs are never prefixes of one another.
while IFS= read -r sample || [[ -n "$sample" ]]; do
    # A blank line would expand to every file in RAW; skip it.
    [[ -n "$sample" ]] || continue

    has_short="no"
    has_long="no"
    for rawfile in ../../RAW/"${sample}"*; do
        # An unmatched glob stays literal; ignore it rather than treating
        # the pattern itself as a long-read file.
        [[ -e "$rawfile" || -L "$rawfile" ]] || continue
        # File name without directory and without the .fastq.gz extension.
        rawname=$(basename -s .fastq.gz "$rawfile")
        if [[ "$rawname" =~ _R[12] ]]; then
            has_short="yes"
        else
            has_long="yes"
        fi
    done

    if [[ "$has_short" == "no" && "$has_long" == "no" ]]; then
        echo "WARNING: no files found in ../../RAW for sample '${sample}'" >&2
        continue
    fi

    if [[ "$has_short" == "yes" ]]; then
        # The globs stay outside the quotes so the shell can expand them.
        ln -s -f ../../RAW/"${sample}"*_R1*.fastq.gz "${sample}_R1.fastq.gz"
        ln -s -f ../../RAW/"${sample}"*_R2*.fastq.gz "${sample}_R2.fastq.gz"
    fi
    if [[ "$has_long" == "yes" ]]; then
        ln -s -f "../../RAW/${sample}.fastq.gz" "${sample}.fastq.gz"
    fi
done < ../samples_id.txt

cd -
mv ANALYSIS01_ASSEMBLY "$(date '+%Y%m%d')_ANALYSIS01_ASSEMBLY"
Loading

0 comments on commit 7af8c12

Please sign in to comment.