From 9d537cf3a11b9884948685e68c4e603688c618ba Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 8 Aug 2023 16:04:31 +0200 Subject: [PATCH 01/14] Introduced new location for multiqc_config.yml --- .../viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog b/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog old mode 100644 new mode 100755 index c77b6a71..26d642fd --- a/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog +++ b/bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog @@ -21,6 +21,6 @@ EOF echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh -cp /data/bi/services_and_colaborations/CNM/virology/SRVCNM585_20220223_SARSCOV279_icasas_S/ANALYSIS/20220223_ANALYSIS02_MET/99-stats/multiqc_config.yaml . +ln -s ../../../DOC/multiqc_config.yml . echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh From b0a88dc666c869ab5308b8ed9d15f5da974b5318 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 8 Aug 2023 16:06:13 +0200 Subject: [PATCH 02/14] Included the multiqc_config.yml file used in viralrecon to its template --- .../templates/viralrecon/DOC/multiqc_config.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 bu_isciii/templates/viralrecon/DOC/multiqc_config.yml diff --git a/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml b/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml new file mode 100644 index 00000000..96b7e613 --- /dev/null +++ b/bu_isciii/templates/viralrecon/DOC/multiqc_config.yml @@ -0,0 +1,13 @@ +extra_fn_clean_exts: + - _R1 + - _R2 + - .R1 + - .R2 + - .sort + - _sort + - .stats + - _bamstat + - _align + - .txt +report_comment: > + This report has been generated by BU-ISCIII From e86f3c7b06c2afcd4bac60f52eec2aca7f6d5093 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 8 Aug 2023 16:07:59 +0200 Subject: [PATCH 03/14] Modified the existing results-lablog to work for more than 1 reference --- .../viralrecon/RESULTS/viralrecon_results | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results index 1325a318..37641511 100644 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -1,26 +1,49 @@ mkdir $(date '+%Y%m%d')_entrega01 cd $(date '+%Y%m%d')_entrega01 +mv ../excel_generator.py ./ + #Create directories depending on the analysis mkdir mapping_consensus mkdir variants_annot mkdir assembly_spades mkdir abacas_assembly mkdir blast +mkdir ref_samples + +#Setting up folder and files required for excel_generator.py +cat ../../ANALYSIS/*/samples_ref.txt | cut -f2 | sort -u > references.tmp +cat references.tmp | while read in; do cat ../../ANALYSIS/*/samples_ref.txt | grep ${in} | cut -f 1 > ref_samples/samples_${in}.tmp; done +cat references.tmp | while read in; do mkdir excel_files_${in}; done +cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/pangolin pangolin; cd -; done; +cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/nextclade 
nextclade; cd -; done; + +#Create symbolic links to files that are going to be converted to excel +cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table*.csv ${in}_variants_long_table.csv; done -#Create symbolic links depending on the analysis -#Individual files ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html -ln -s ../../ANALYSIS/*/mapping_illumina*.xlsx ./mapping_illumina.xlsx -ln -s ../../ANALYSIS/*/*/variants/ivar/variants_long_table*.xlsx ./ -ln -s ../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/pangolin/pangolin.xlsx ./pangolin.xlsx -ln -s ../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/nextclade/nextclade.xlsx ./nextclade.xlsx -ln -s ../../ANALYSIS/*/assembly_stats.xlsx ./assembly_stats.xlsx -ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.xlsx ./filtered_all_samples_virus_table.xlsx - -#Folders -cd mapping_consensus;ln -s ../../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/*.consensus.fa .; cd - -cd variants_annot; ln -s ../../../ANALYSIS/*/*/variants/ivar/snpeff/*.snpsift.txt .; cd - +ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv +ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv +ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv + +#conda activate viralrecon_report +echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh +#Cleaning temp files and broken symbolic links +echo "find . -type d -empty -delete" > _02_clean_folders.sh +echo "find . -xtype l -delete" >> _02_clean_folders.sh +echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh +echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh +echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh +echo "rm references.tmp" >> _02_clean_folders.sh +echo "rm -rf ref_samples/" >> _02_clean_folders.sh +echo "rm ./*.csv" >> _02_clean_folders.sh +echo "mkdir excel_files" +echo 'mv *.xlsx excel_files/' + +#Create symbolic links to results for every process of the pipeline +cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.consensus.fa ./${arr[0]}_${arr[1]}.consensus.fa; done; cd - +cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - -cd abacas_assembly; ln -s ../../../ANALYSIS/*/*/assembly/spades/rnaviral/abacas/*.abacas.fasta .; cd - -cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd - +cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - + + From 3f967ad45cc96c3c7fcbaef4f9596cd1f0798be5 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 8 Aug 2023 16:08:59 +0200 Subject: [PATCH 04/14] Introduced a custom config file for Respiratory Syncytial Virus --- .../viralrecon/DOC/viralrecon_rsv.config | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 
bu_isciii/templates/viralrecon/DOC/viralrecon_rsv.config diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv.config b/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv.config new file mode 100644 index 00000000..12eaef0f --- /dev/null +++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv.config @@ -0,0 +1,22 @@ +singularity { + enabled = true + autoMounts = true +} + +process { + executor = 'slurm' + queue = 'middle_idx' + withName: 'FASTP' { + ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 --length_required 50' + } + withName: 'NEXTCLADE_DATASETGET|NEXTCLADE_RUN' { + container = 'https://depot.galaxyproject.org/singularity/nextclade:2.14.0--h9ee0642_1' + } +} + +params { + // Max resource options + max_memory = 376.GB + max_cpus = 32 + max_time = '48.h' +} From 2a8b52cda7a171205dbcbaaadb8b0a872eccccd7 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 8 Aug 2023 16:09:25 +0200 Subject: [PATCH 05/14] Introduced custom params used for Respiratory Syncytial Virus --- .../templates/viralrecon/DOC/viralrecon_rsv_params.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 bu_isciii/templates/viralrecon/DOC/viralrecon_rsv_params.yml diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv_params.yml b/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv_params.yml new file mode 100644 index 00000000..01440c89 --- /dev/null +++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_rsv_params.yml @@ -0,0 +1,8 @@ +platform: 'illumina' +protocol: 'amplicon' +kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz' +variant_caller: 'ivar' +consensus_caller: 'bcftools' +skip_pangolin: true +skip_nextclade: false +skip_assembly: false From a5070a901bfd4dcd2001944c269c2cc0c95ddc0b Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 8 Aug 2023 16:10:01 +0200 Subject: [PATCH 06/14] Included an auxiliary script to generate excel files of the results from viralrecon --- .../viralrecon/RESULTS/excel_generator.py | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100755 bu_isciii/templates/viralrecon/RESULTS/excel_generator.py diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py new file mode 100755 index 00000000..4ecad309 --- /dev/null +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -0,0 +1,91 @@ +import os +import argparse +import pandas as pd +from typing import List, Dict + +# conda activate viralrecon_report +"""Usage: python excel_generator.py ./reference.tmp""" +parser = argparse.ArgumentParser(description="Generate excel files from viralrecon results") +parser.add_argument("reference_file", type=str, help="File containing the references used in the analysis") + +args = parser.parse_args() + +print(f"Extracting references used for analysis and the samples associated with each reference\n") +with open(args.reference_file, "r") as file: + references = [line.rstrip() for line in file] + print(f"\nFound {len(references)} references: {str(references).strip('[]')}") + +reference_folders = {ref: str("excel_files_"+ref) for ref in references} +samples_ref_files = {ref: str("ref_samples/samples_"+ref+".tmp") for ref in references} + +def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str): + """Concatenate any tables that share the same header""" + with open (merged_csv_name, "wb") as merged_csv: + with open(csvs_in_folder[0], 
"rb") as f: + merged_csv.write(f.read()) # This is the fastest way to concatenate csv files + for file in csvs_in_folder[1:]: + with open(file, "rb") as f: + next(f) #this is used to skip the header + merged_csv.write(f.read()) + return merged_csv + +def merge_lineage_tables(reference_folders: Dict[str,str], samples_ref_files: Dict[str,str]): + """Creates the tables for pangolin and nextclade""" + for ref, folder in reference_folders.items(): + print("Merging results for either pangolin or nextclade in a single csv file") + samples_for_ref = open(samples_ref_files[ref]).read().splitlines() + if os.path.isdir(os.path.abspath(folder+"/pangolin")): + pango_dir = os.path.join(folder,"pangolin") + csvs_in_folder = [file.path for file in os.scandir(pango_dir) + if os.path.basename(file).split("_")[0] in samples_for_ref] + merged_csv_name = os.path.join(folder,str(ref+"_pangolin.csv")) + concat_tables_and_write(csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name) + else: + print(f"No pangolin folder could be found for {ref}, omitting") + + if os.path.isdir(os.path.abspath(folder+"/nextclade")): + nextcl_dir = os.path.join(folder,"nextclade") + csvs_in_folder = [file.path for file in os.scandir(nextcl_dir) + if os.path.splitext(os.path.basename(file))[0] in samples_for_ref] + merged_csv_name = os.path.join(folder,str(ref+"_nextclade.csv")) + concat_tables_and_write(csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name) + else: + print(f"No nextclade folder could be found for {ref}, omitting") + + return + +def excel_generator(csv_files: List[str]): + for file in csv_files: + if not os.path.exists(file): + print(f"File {file} does not exist, omitting...") + continue + print(f"Generating excel file for {file}") + output_name = str(file.split(".csv")[0] + ".xlsx") + #workbook = openpyxl.Workbook(output_name) + if "nextclade" in str(file): + pd.read_csv(file, sep=";", header=0).to_excel(output_name, index=False) + elif "illumina" in str(file): + table = pd.read_csv(file, sep="\t", header=0) + table["analysis_date"] = pd.to_datetime(table["analysis_date"].astype(str), format='%Y%m%d') + table.to_excel(output_name, index=False) + elif "assembly" in str(file): + pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False) + else: + pd.read_csv(file).to_excel(output_name, index=False) + return file + +#Merge pangolin and nextclade csv files separatedly and create excel files for them +merge_lineage_tables(reference_folders, samples_ref_files) +for reference, folder in reference_folders.items(): + print(f"Creating excel files for reference {reference}") + csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")] + excel_generator(csv_files) + +#Merge all the variant long tables into one and convert to excel format +variants_tables = [table.path for table in os.scandir(".") if "variants_long_table" in table.path] +concat_tables_and_write(csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv") +pd.read_csv("variants_long_table.csv").to_excel("variants_long_table.xlsx", index=False) + +#Create excel files for individual tables +result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"] +excel_generator(result_tables) \ No newline at end of file From 1416fad7d869aca3899bb38d1019547e62ef0823 Mon Sep 17 00:00:00 2001 From: Shettland Date: Thu, 10 Aug 2023 12:54:37 +0200 Subject: [PATCH 07/14] Introduced new changes into results lablog --- bu_isciii/templates/viralrecon/RESULTS/viralrecon_results | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results index 37641511..be65b6aa 100644 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -19,7 +19,7 @@ cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANAL cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/nextclade nextclade; cd -; done; #Create symbolic links to files that are going to be converted to excel -cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table*.csv ${in}_variants_long_table.csv; done +cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv From 068e583905142b59fe34c924dbe4854b5d37770d Mon Sep 17 00:00:00 2001 From: Shettland Date: Thu, 10 Aug 2023 13:42:31 +0200 Subject: [PATCH 08/14] Included new changes in results lablog --- bu_isciii/templates/viralrecon/RESULTS/lablog | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 bu_isciii/templates/viralrecon/RESULTS/lablog diff --git a/bu_isciii/templates/viralrecon/RESULTS/lablog b/bu_isciii/templates/viralrecon/RESULTS/lablog new file mode 100644 index 00000000..24125304 --- /dev/null +++ b/bu_isciii/templates/viralrecon/RESULTS/lablog @@ -0,0 +1,48 @@ +mkdir $(date '+%Y%m%d')_entrega01 +cd $(date '+%Y%m%d')_entrega01 + +mv ../excel_generator.py ./ + +#Create directories depending on the analysis +mkdir mapping_consensus +mkdir variants_annot +mkdir assembly_spades +mkdir abacas_assembly +mkdir blast +mkdir ref_samples + +#Setting up folder and files required for excel_generator.py +cat ../../ANALYSIS/*/samples_ref.txt | cut -f2 | sort -u > references.tmp +cat references.tmp | while read in; do cat ../../ANALYSIS/*/samples_ref.txt | grep ${in} | cut -f 1 > ref_samples/samples_${in}.tmp; done +cat references.tmp | while read in; do mkdir excel_files_${in}; done +cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/pangolin pangolin; cd -; done; +cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/nextclade nextclade; cd -; done; + +#Create symbolic links to files that are going to be converted to excel +cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done + +ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html +ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv +ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv +ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv + +#conda activate viralrecon_report +echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh +#Cleaning temp files and broken symbolic links +echo "find . -xtype l -delete" > _02_clean_folders.sh +echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh +echo "find . 
-type d -empty -delete" >> _02_clean_folders.sh +echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh +echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh +echo "rm references.tmp" >> _02_clean_folders.sh +echo "rm -rf ref_samples/" >> _02_clean_folders.sh +echo "rm ./*.csv" >> _02_clean_folders.sh +echo "mkdir excel_files" +echo 'mv *.xlsx excel_files/' + +#Create symbolic links to results for every process of the pipeline +cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.consensus.fa ./${arr[0]}_${arr[1]}.consensus.fa; done; cd - +cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - +cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - +cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - +cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd - \ No newline at end of file From a1cf59dd04b70e2319892f4193a031ff8e02c980 Mon Sep 17 00:00:00 2001 From: Shettland Date: Thu, 10 Aug 2023 15:29:13 +0200 Subject: [PATCH 09/14] Included changes into excel_generator.py --- .../viralrecon/RESULTS/excel_generator.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index 4ecad309..af212cb7 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -20,13 +20,17 @@ def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str): """Concatenate any tables that share the same header""" + if len(csvs_in_folder)==0: + print(f"Could not find tables to merge over {merged_csv_name}") + return with open (merged_csv_name, "wb") as merged_csv: with open(csvs_in_folder[0], "rb") as f: merged_csv.write(f.read()) # This is the fastest way to concatenate csv files - for file in csvs_in_folder[1:]: - with open(file, "rb") as f: - next(f) #this is used to skip the header - merged_csv.write(f.read()) + if len(csvs_in_folder)>1: + for file in csvs_in_folder[1:]: + with open(file, "rb") as f: + next(f) #this is used to skip the header + merged_csv.write(f.read()) return merged_csv def merge_lineage_tables(reference_folders: Dict[str,str], samples_ref_files: Dict[str,str]): @@ -37,7 +41,7 @@ def merge_lineage_tables(reference_folders: Dict[str,str], samples_ref_files: Di if os.path.isdir(os.path.abspath(folder+"/pangolin")): pango_dir = os.path.join(folder,"pangolin") csvs_in_folder = [file.path for file in os.scandir(pango_dir) - if os.path.basename(file).split("_")[0] in samples_for_ref] + if os.path.basename(file).strip(".pangolin.csv") in samples_for_ref] merged_csv_name = os.path.join(folder,str(ref+"_pangolin.csv")) concat_tables_and_write(csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name) else: @@ -46,7 +50,7 @@ def merge_lineage_tables(reference_folders: Dict[str,str], samples_ref_files: Di if 
os.path.isdir(os.path.abspath(folder+"/nextclade")): nextcl_dir = os.path.join(folder,"nextclade") csvs_in_folder = [file.path for file in os.scandir(nextcl_dir) - if os.path.splitext(os.path.basename(file))[0] in samples_for_ref] + if os.path.basename(file).strip(".csv") in samples_for_ref] merged_csv_name = os.path.join(folder,str(ref+"_nextclade.csv")) concat_tables_and_write(csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name) else: From 494fef773a5a1b8ad7f5e9c4514fcb5b1b22500a Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 16 Aug 2023 09:51:43 +0200 Subject: [PATCH 10/14] Deleted old results lablog --- .../viralrecon/RESULTS/viralrecon_results | 49 ------------------- 1 file changed, 49 deletions(-) delete mode 100644 bu_isciii/templates/viralrecon/RESULTS/viralrecon_results diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results deleted file mode 100644 index be65b6aa..00000000 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ /dev/null @@ -1,49 +0,0 @@ -mkdir $(date '+%Y%m%d')_entrega01 -cd $(date '+%Y%m%d')_entrega01 - -mv ../excel_generator.py ./ - -#Create directories depending on the analysis -mkdir mapping_consensus -mkdir variants_annot -mkdir assembly_spades -mkdir abacas_assembly -mkdir blast -mkdir ref_samples - -#Setting up folder and files required for excel_generator.py -cat ../../ANALYSIS/*/samples_ref.txt | cut -f2 | sort -u > references.tmp -cat references.tmp | while read in; do cat ../../ANALYSIS/*/samples_ref.txt | grep ${in} | cut -f 1 > ref_samples/samples_${in}.tmp; done -cat references.tmp | while read in; do mkdir excel_files_${in}; done -cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/pangolin pangolin; cd -; done; -cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/nextclade nextclade; cd -; done; - -#Create symbolic links to files that are going to be converted to excel -cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done - -ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html -ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv -ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv -ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv - -#conda activate viralrecon_report -echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh -#Cleaning temp files and broken symbolic links -echo "find . -type d -empty -delete" > _02_clean_folders.sh -echo "find . 
-xtype l -delete" >> _02_clean_folders.sh -echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh -echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh -echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh -echo "rm references.tmp" >> _02_clean_folders.sh -echo "rm -rf ref_samples/" >> _02_clean_folders.sh -echo "rm ./*.csv" >> _02_clean_folders.sh -echo "mkdir excel_files" -echo 'mv *.xlsx excel_files/' - -#Create symbolic links to results for every process of the pipeline -cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.consensus.fa ./${arr[0]}_${arr[1]}.consensus.fa; done; cd - -cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - -cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - -cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - - - From ed955d02bfdddaeac53d55f1addfec7174762e7f Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 16 Aug 2023 10:52:19 +0200 Subject: [PATCH 11/14] Introduced excel_generator.py, linting --- .../viralrecon/RESULTS/excel_generator.py | 102 ++++++++++++------ 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index af212cb7..e7d524bd 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -5,59 +5,86 @@ # conda activate viralrecon_report """Usage: python excel_generator.py ./reference.tmp""" -parser = argparse.ArgumentParser(description="Generate excel files from viralrecon results") -parser.add_argument("reference_file", type=str, help="File containing the references used in the analysis") +parser = argparse.ArgumentParser( + description="Generate excel files from viralrecon results" +) +parser.add_argument( + "reference_file", + type=str, + help="File containing the references used in the analysis", +) args = parser.parse_args() -print(f"Extracting references used for analysis and the samples associated with each reference\n") +print( + f"Extracting references used for analysis and the samples associated with each reference\n" +) with open(args.reference_file, "r") as file: references = [line.rstrip() for line in file] print(f"\nFound {len(references)} references: {str(references).strip('[]')}") -reference_folders = {ref: str("excel_files_"+ref) for ref in references} -samples_ref_files = {ref: str("ref_samples/samples_"+ref+".tmp") for ref in references} +reference_folders = {ref: str("excel_files_" + ref) for ref in references} +samples_ref_files = { + ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references +} + def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str): """Concatenate any tables that share the same header""" - if len(csvs_in_folder)==0: + if len(csvs_in_folder) == 0: print(f"Could not find tables to merge over 
{merged_csv_name}") return - with open (merged_csv_name, "wb") as merged_csv: + with open(merged_csv_name, "wb") as merged_csv: with open(csvs_in_folder[0], "rb") as f: - merged_csv.write(f.read()) # This is the fastest way to concatenate csv files - if len(csvs_in_folder)>1: + merged_csv.write( + f.read() + ) # This is the fastest way to concatenate csv files + if len(csvs_in_folder) > 1: for file in csvs_in_folder[1:]: with open(file, "rb") as f: - next(f) #this is used to skip the header - merged_csv.write(f.read()) + next(f) # this is used to skip the header + merged_csv.write(f.read()) return merged_csv -def merge_lineage_tables(reference_folders: Dict[str,str], samples_ref_files: Dict[str,str]): + +def merge_lineage_tables( + reference_folders: Dict[str, str], samples_ref_files: Dict[str, str] +): """Creates the tables for pangolin and nextclade""" for ref, folder in reference_folders.items(): print("Merging results for either pangolin or nextclade in a single csv file") samples_for_ref = open(samples_ref_files[ref]).read().splitlines() - if os.path.isdir(os.path.abspath(folder+"/pangolin")): - pango_dir = os.path.join(folder,"pangolin") - csvs_in_folder = [file.path for file in os.scandir(pango_dir) - if os.path.basename(file).strip(".pangolin.csv") in samples_for_ref] - merged_csv_name = os.path.join(folder,str(ref+"_pangolin.csv")) - concat_tables_and_write(csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name) + if os.path.isdir(os.path.abspath(folder + "/pangolin")): + pango_dir = os.path.join(folder, "pangolin") + csvs_in_folder = [ + file.path + for file in os.scandir(pango_dir) + if os.path.basename(file).strip(".pangolin.csv") in samples_for_ref + ] + merged_csv_name = os.path.join(folder, str(ref + "_pangolin.csv")) + concat_tables_and_write( + csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name + ) else: print(f"No pangolin folder could be found for {ref}, omitting") - if os.path.isdir(os.path.abspath(folder+"/nextclade")): - nextcl_dir = os.path.join(folder,"nextclade") - csvs_in_folder = [file.path for file in os.scandir(nextcl_dir) - if os.path.basename(file).strip(".csv") in samples_for_ref] - merged_csv_name = os.path.join(folder,str(ref+"_nextclade.csv")) - concat_tables_and_write(csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name) + if os.path.isdir(os.path.abspath(folder + "/nextclade")): + nextcl_dir = os.path.join(folder, "nextclade") + csvs_in_folder = [ + file.path + for file in os.scandir(nextcl_dir) + if os.path.basename(file).strip(".csv") in samples_for_ref + ] + merged_csv_name = os.path.join(folder, str(ref + "_nextclade.csv")) + concat_tables_and_write( + csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name + ) else: print(f"No nextclade folder could be found for {ref}, omitting") - + return + def excel_generator(csv_files: List[str]): for file in csv_files: if not os.path.exists(file): @@ -65,31 +92,38 @@ def excel_generator(csv_files: List[str]): continue print(f"Generating excel file for {file}") output_name = str(file.split(".csv")[0] + ".xlsx") - #workbook = openpyxl.Workbook(output_name) + # workbook = openpyxl.Workbook(output_name) if "nextclade" in str(file): pd.read_csv(file, sep=";", header=0).to_excel(output_name, index=False) elif "illumina" in str(file): table = pd.read_csv(file, sep="\t", header=0) - table["analysis_date"] = pd.to_datetime(table["analysis_date"].astype(str), format='%Y%m%d') + table["analysis_date"] = pd.to_datetime( + table["analysis_date"].astype(str), format="%Y%m%d" + ) 
table.to_excel(output_name, index=False) elif "assembly" in str(file): - pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False) + pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False) else: pd.read_csv(file).to_excel(output_name, index=False) return file -#Merge pangolin and nextclade csv files separatedly and create excel files for them + +# Merge pangolin and nextclade csv files separatedly and create excel files for them merge_lineage_tables(reference_folders, samples_ref_files) for reference, folder in reference_folders.items(): print(f"Creating excel files for reference {reference}") csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")] excel_generator(csv_files) -#Merge all the variant long tables into one and convert to excel format -variants_tables = [table.path for table in os.scandir(".") if "variants_long_table" in table.path] -concat_tables_and_write(csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv") +# Merge all the variant long tables into one and convert to excel format +variants_tables = [ + table.path for table in os.scandir(".") if "variants_long_table" in table.path +] +concat_tables_and_write( + csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv" +) pd.read_csv("variants_long_table.csv").to_excel("variants_long_table.xlsx", index=False) -#Create excel files for individual tables +# Create excel files for individual tables result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"] -excel_generator(result_tables) \ No newline at end of file +excel_generator(result_tables) From bd9b88125115375ccff4d3e77dbb0261229f28d5 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 16 Aug 2023 10:56:47 +0200 Subject: [PATCH 12/14] Introduced excel_generator.py, linting2 --- bu_isciii/templates/viralrecon/RESULTS/excel_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index e7d524bd..55655950 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -17,7 +17,7 @@ args = parser.parse_args() print( - f"Extracting references used for analysis and the samples associated with each reference\n" + "Extracting references used for analysis and the samples associated with each reference\n" ) with open(args.reference_file, "r") as file: references = [line.rstrip() for line in file] From 5d636bad0f8cfec001c1848e99cdf63932381715 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 22 Aug 2023 09:05:44 +0200 Subject: [PATCH 13/14] Changed results lablog name to viralrecon_results --- .../templates/viralrecon/RESULTS/{lablog => viralrecon_results} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bu_isciii/templates/viralrecon/RESULTS/{lablog => viralrecon_results} (100%) diff --git a/bu_isciii/templates/viralrecon/RESULTS/lablog b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results similarity index 100% rename from bu_isciii/templates/viralrecon/RESULTS/lablog rename to bu_isciii/templates/viralrecon/RESULTS/viralrecon_results From 212d8d8876b4470acbec18fed8fae9c3baa193c5 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 22 Aug 2023 09:06:06 +0200 Subject: [PATCH 14/14] Included samples_ref.txt template --- bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 
bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt

diff --git a/bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt b/bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt
new file mode 100644
index 00000000..5e3528b1
--- /dev/null
+++ b/bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt
@@ -0,0 +1,4 @@
+SampleID Reference Host
+SampleID Reference Host
+SampleID Reference Host
+
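Note on the samples_ref.txt template added in PATCH 14/14: the RESULTS lablog builds its per-reference structure from this file, so it is expected to hold one tab-separated row per sample (SampleID, Reference, Host), which is what its cut -f1 / cut -f2 calls assume. The sketch below shows a hypothetical filled-in file and the equivalent of the lablog loops that consume it; the sample IDs and reference accessions are illustrative placeholders only, not values taken from any real service.

# Hypothetical samples_ref.txt contents (fields separated by tabs):
#   SAMPLE01  NC_045512.2  Human
#   SAMPLE02  NC_045512.2  Human
#   SAMPLE03  NC_001781.1  Human
# Equivalent of the lablog logic that derives references and per-reference sample lists:
mkdir -p ref_samples
cut -f2 samples_ref.txt | sort -u > references.tmp
while read ref; do
    grep "${ref}" samples_ref.txt | cut -f1 > "ref_samples/samples_${ref}.tmp"
done < references.tmp

With rows like these, the lablog would create one excel_files_<reference> directory per reference (here excel_files_NC_045512.2 and excel_files_NC_001781.1), and excel_generator.py would then merge the per-sample pangolin and nextclade tables for each reference before converting them to xlsx.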