diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog b/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog
old mode 100644
new mode 100755
index c77b6a71..26d642fd
--- a/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog
@@ -21,6 +21,6 @@ EOF
 
 echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh
 
-cp /data/bi/services_and_colaborations/CNM/virology/SRVCNM585_20220223_SARSCOV279_icasas_S/ANALYSIS/20220223_ANALYSIS02_MET/99-stats/multiqc_config.yaml .
+ln -s ../../../DOC/multiqc_config.yml .
 
 echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh
diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt b/bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt
new file mode 100644
index 00000000..5e3528b1
--- /dev/null
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt
@@ -0,0 +1,4 @@
+SampleID	Reference	Host
+SampleID	Reference	Host
+SampleID	Reference	Host
+
diff --git a/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml b/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml
new file mode 100644
index 00000000..96b7e613
--- /dev/null
+++ b/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml
@@ -0,0 +1,13 @@
+extra_fn_clean_exts:
+    - _R1
+    - _R2
+    - .R1
+    - .R2
+    - .sort
+    - _sort
+    - .stats
+    - _bamstat
+    - _align
+    - .txt
+report_comment: >
+    This report has been generated by BU-ISCIII
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv.config b/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv.config
new file mode 100644
index 00000000..12eaef0f
--- /dev/null
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv.config
@@ -0,0 +1,22 @@
+singularity {
+    enabled = true
+    autoMounts = true
+}
+
+process {
+    executor = 'slurm'
+    queue = 'middle_idx'
+    withName: 'FASTP' {
+        ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 --length_required 50'
+    }
+    withName: 'NEXTCLADE_DATASETGET|NEXTCLADE_RUN' {
+        container = 'https://depot.galaxyproject.org/singularity/nextclade:2.14.0--h9ee0642_1'
+    }
+}
+
+params {
+    // Max resource options
+    max_memory = 376.GB
+    max_cpus = 32
+    max_time = '48.h'
+}
diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv_params.yml b/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv_params.yml
new file mode 100644
index 00000000..01440c89
--- /dev/null
+++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv_params.yml
@@ -0,0 +1,8 @@
+platform: 'illumina'
+protocol: 'amplicon'
+kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz'
+variant_caller: 'ivar'
+consensus_caller: 'bcftools'
+skip_pangolin: true
+skip_nextclade: false
+skip_assembly: false
diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py
new file mode 100755
index 00000000..55655950
--- /dev/null
+++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py
@@ -0,0 +1,180 @@
+"""Generate excel files from viralrecon results.
+
+Usage: python excel_generator.py ./references.tmp
+
+Run it from the delivery folder created by the viralrecon_results lablog, with
+the report environment active (conda activate viralrecon_report).
+"""
+import argparse
+import os
+from typing import Dict, List, Optional
+
+import pandas as pd
+
+# Lineage tool folder -> suffix its per-sample tables carry after the sample name
+LINEAGE_TABLE_SUFFIXES = {"pangolin": ".pangolin.csv", "nextclade": ".csv"}
+MERGED_VARIANTS_TABLE = "variants_long_table.csv"
+
+
+def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str):
+    """Concatenate any tables that share the same header.
+
+    The header is taken from the first table only. Nothing is written when
+    csvs_in_folder is empty.
+
+    Args:
+        csvs_in_folder: paths of the tables to concatenate, in output order.
+        merged_csv_name: path of the merged table to write.
+
+    Returns:
+        The (already closed) handle of the merged table, or None when there
+        was nothing to merge.
+    """
+    if not csvs_in_folder:
+        print(f"Could not find tables to merge over {merged_csv_name}")
+        return None
+    with open(merged_csv_name, "wb") as merged_csv:
+        for position, table in enumerate(csvs_in_folder):
+            with open(table, "rb") as f:
+                if position > 0:
+                    # Skip the repeated header; the default keeps an empty
+                    # table from raising StopIteration.
+                    next(f, None)
+                content = f.read()
+            merged_csv.write(content)
+            if content and not content.endswith(b"\n"):
+                # Keep the next table from being glued to this last row.
+                merged_csv.write(b"\n")
+    return merged_csv
+
+
+def _tables_for_samples(
+    tables_dir: str, suffix: str, samples: List[str]
+) -> List[str]:
+    """Return the sorted paths in tables_dir named <sample><suffix>."""
+    wanted = set(samples)
+    return sorted(
+        entry.path
+        for entry in os.scandir(tables_dir)
+        # Slice the suffix off: str.strip() would instead drop any of its
+        # characters from both ends of the sample name.
+        if entry.name.endswith(suffix) and entry.name[: -len(suffix)] in wanted
+    )
+
+
+def merge_lineage_tables(
+    reference_folders: Dict[str, str], samples_ref_files: Dict[str, str]
+):
+    """Creates the tables for pangolin and nextclade.
+
+    Args:
+        reference_folders: reference name -> folder holding its pangolin and
+            nextclade result folders; the merged tables are written there.
+        samples_ref_files: reference name -> file listing, one per line, the
+            samples analysed against that reference.
+    """
+    for ref, folder in reference_folders.items():
+        print("Merging results for either pangolin or nextclade in a single csv file")
+        with open(samples_ref_files[ref]) as samples_file:
+            samples_for_ref = samples_file.read().splitlines()
+        for tool, suffix in LINEAGE_TABLE_SUFFIXES.items():
+            tool_dir = os.path.join(folder, tool)
+            if not os.path.isdir(tool_dir):
+                print(f"No {tool} folder could be found for {ref}, omitting")
+                continue
+            concat_tables_and_write(
+                csvs_in_folder=_tables_for_samples(tool_dir, suffix, samples_for_ref),
+                merged_csv_name=os.path.join(folder, f"{ref}_{tool}.csv"),
+            )
+
+
+def excel_generator(csv_files: List[str]) -> Optional[str]:
+    """Convert every existing table in csv_files to an xlsx file next to it.
+
+    The separator is chosen from the path: ";" for nextclade tables, tab for
+    the illumina mapping and assembly tables, "," for anything else.
+
+    Returns:
+        The last entry of csv_files, or None when the list is empty.
+    """
+    for file in csv_files:
+        if not os.path.exists(file):
+            print(f"File {file} does not exist, omitting...")
+            continue
+        print(f"Generating excel file for {file}")
+        output_name = file.split(".csv")[0] + ".xlsx"
+        if "nextclade" in file:
+            table = pd.read_csv(file, sep=";", header=0)
+        elif "illumina" in file:
+            table = pd.read_csv(file, sep="\t", header=0)
+            table["analysis_date"] = pd.to_datetime(
+                table["analysis_date"].astype(str), format="%Y%m%d"
+            )
+        elif "assembly" in file:
+            table = pd.read_csv(file, sep="\t", header=0)
+        else:
+            table = pd.read_csv(file)
+        table.to_excel(output_name, index=False)
+    # With an empty list the loop variable is unbound, so do not return it.
+    return csv_files[-1] if csv_files else None
+
+
+def main():
+    """Build the merged tables and the excel files for every reference."""
+    parser = argparse.ArgumentParser(
+        description="Generate excel files from viralrecon results"
+    )
+    parser.add_argument(
+        "reference_file",
+        type=str,
+        help="File containing the references used in the analysis",
+    )
+    args = parser.parse_args()
+
+    print(
+        "Extracting references used for analysis and the samples associated with each reference\n"
+    )
+    with open(args.reference_file, "r") as file:
+        # Blank lines would otherwise become an empty reference name.
+        references = [line.strip() for line in file if line.strip()]
+    print(f"\nFound {len(references)} references: {str(references).strip('[]')}")
+
+    reference_folders = {ref: f"excel_files_{ref}" for ref in references}
+    samples_ref_files = {
+        ref: f"ref_samples/samples_{ref}.tmp" for ref in references
+    }
+
+    # Merge pangolin and nextclade csv files separately and create excel files for them
+    merge_lineage_tables(reference_folders, samples_ref_files)
+    for reference, folder in reference_folders.items():
+        print(f"Creating excel files for reference {reference}")
+        csv_files = [
+            entry.path for entry in os.scandir(folder) if entry.path.endswith(".csv")
+        ]
+        excel_generator(csv_files)
+
+    # Merge all the variant long tables into one and convert to excel format.
+    # Only the <reference>_variants_long_table.csv links are merged, so a rerun
+    # does not fold the merged table or its xlsx back in, and dangling links
+    # are skipped instead of failing on open().
+    variants_tables = sorted(
+        table.path
+        for table in os.scandir(".")
+        if table.name.endswith("_" + MERGED_VARIANTS_TABLE)
+        and os.path.exists(table.path)
+    )
+    concat_tables_and_write(
+        csvs_in_folder=variants_tables, merged_csv_name=MERGED_VARIANTS_TABLE
+    )
+    if variants_tables:
+        pd.read_csv(MERGED_VARIANTS_TABLE).to_excel(
+            "variants_long_table.xlsx", index=False
+        )
+
+    # Create excel files for individual tables
+    result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"]
+    excel_generator(result_tables)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results
index 1325a318..24125304 100644
--- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results
+++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results
@@ -1,26 +1,48 @@
 mkdir $(date '+%Y%m%d')_entrega01
 cd $(date '+%Y%m%d')_entrega01
 
+mv ../excel_generator.py ./
+
 #Create directories depending on the analysis
 mkdir mapping_consensus
 mkdir variants_annot
 mkdir assembly_spades
 mkdir abacas_assembly
 mkdir blast
+mkdir ref_samples
+
+#Setting up folder and files required for excel_generator.py
+cat ../../ANALYSIS/*/samples_ref.txt | cut -f2 | grep -v '^$' | sort -u > references.tmp
+cat references.tmp | while read in; do cat ../../ANALYSIS/*/samples_ref.txt | awk -F'\t' -v ref="${in}" '$2 == ref {print $1}' > ref_samples/samples_${in}.tmp; done
+cat references.tmp | while read in; do mkdir excel_files_${in}; done
+cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/pangolin pangolin; cd -; done;
+cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/nextclade nextclade; cd -; done;
+
+#Create symbolic links to files that are going to be converted to excel
+cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done
 
-#Create symbolic links depending on the analysis
-#Individual files
 ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html
-ln -s ../../ANALYSIS/*/mapping_illumina*.xlsx ./mapping_illumina.xlsx
-ln -s ../../ANALYSIS/*/*/variants/ivar/variants_long_table*.xlsx ./
-ln -s ../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/pangolin/pangolin.xlsx ./pangolin.xlsx
-ln -s ../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/nextclade/nextclade.xlsx ./nextclade.xlsx
-ln -s ../../ANALYSIS/*/assembly_stats.xlsx ./assembly_stats.xlsx
-ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.xlsx ./filtered_all_samples_virus_table.xlsx
+ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv
+ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv
+ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv
+
+#conda activate viralrecon_report
+echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh
+#Cleaning temp files and broken symbolic links
+echo "find . -xtype l -delete" > _02_clean_folders.sh
+echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh
+echo "find . -type d -empty -delete" >> _02_clean_folders.sh
+echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh
+echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh
+echo "rm references.tmp" >> _02_clean_folders.sh
+echo "rm -rf ref_samples/" >> _02_clean_folders.sh
+echo "rm ./*.csv" >> _02_clean_folders.sh
+echo "mkdir excel_files" >> _02_clean_folders.sh
+echo 'mv *.xlsx excel_files/' >> _02_clean_folders.sh
 
-#Folders
-cd mapping_consensus;ln -s ../../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/*.consensus.fa .; cd -
-cd variants_annot; ln -s ../../../ANALYSIS/*/*/variants/ivar/snpeff/*.snpsift.txt .; cd -
+#Create symbolic links to results for every process of the pipeline
+cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.consensus.fa ./${arr[0]}_${arr[1]}.consensus.fa; done; cd -
+cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd -
 cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd -
-cd abacas_assembly; ln -s ../../../ANALYSIS/*/*/assembly/spades/rnaviral/abacas/*.abacas.fasta .; cd -
-cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd -
+cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd -
+cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd -
\ No newline at end of file