From ef1df075e79d9c57aa926bafb1275745c7f8d9b2 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:04:20 +0100 Subject: [PATCH 01/11] removed unnecessary line --- .../exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog index dcb39acd..2dd494ea 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog @@ -76,8 +76,6 @@ echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/bi/pipe #--------------------------------------------------------------------------------------------------------- -echo "srun --chdir /tmp --partition short_idx --nodelist ${EXOMISER_NODE} rm spring.log &" > _05_filter_heritance.sh - ## Lablog to modify the output reported by exomiser and create a final file with a personalized format. 
# Grep variant id for each inheritance model cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_heritance.sh @@ -91,4 +89,4 @@ echo "rm id_*" >> _05_filter_heritance.sh cat inheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_heritance.sh # annot_all table is huge, lets shrink it a little bit -echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_heritance.sh \ No newline at end of file +echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_heritance.sh From a4c59c6b6d8da2803fb951a514958eda58ad970f Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:05:16 +0100 Subject: [PATCH 02/11] commented PASS_ONLY line parameter --- .../ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml index b3b9d027..84bd59f2 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml @@ -24,7 +24,7 @@ analysis: MITOCHONDRIAL: 100.0 } #FULL or PASS_ONLY - analysisMode: PASS_ONLY + #analysisMode: PASS_ONLY #Possible frequencySources: #Thousand Genomes project http://www.1000genomes.org/ # THOUSAND_GENOMES, From 032e34ea48791c56903c758e0ad0144753bd58f3 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:05:52 +0100 Subject: [PATCH 03/11] small changes in viralrecon_results script --- 
bu_isciii/templates/viralrecon/RESULTS/viralrecon_results | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results index 24125304..86875d4d 100644 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -27,13 +27,14 @@ ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv #conda activate viralrecon_report -echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh +echo "python ./excel_generator.py -r ./references.tmp" > _01_generate_excel_files.sh #Cleaning temp files and broken symbolic links echo "find . -xtype l -delete" > _02_clean_folders.sh echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh echo "find . -type d -empty -delete" >> _02_clean_folders.sh echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh +echo 'cat references.tmp | while read in; do rm ${in}_variants_long_table.xlsx; done' >> _02_clean_folders.sh echo "rm references.tmp" >> _02_clean_folders.sh echo "rm -rf ref_samples/" >> _02_clean_folders.sh echo "rm ./*.csv" >> _02_clean_folders.sh @@ -45,4 +46,4 @@ cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; d cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - cd abacas_assembly; cat 
../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - -cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd - \ No newline at end of file +cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd - From fc0810f154a64ac49e8135c00427607f897c19ca Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:07:09 +0100 Subject: [PATCH 04/11] introduced better error handling in excel_generator, now it can also be used for single files --- .../viralrecon/RESULTS/excel_generator.py | 105 +++++++++++------- 1 file changed, 64 insertions(+), 41 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index 55655950..f03f0b95 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -5,29 +5,25 @@ # conda activate viralrecon_report """Usage: python excel_generator.py ./reference.tmp""" +"""Single csv to excel Usage: python excel_generator.py -s csv_file.csv""" parser = argparse.ArgumentParser( description="Generate excel files from viralrecon results" ) parser.add_argument( - "reference_file", + "-r", + "--reference_file", type=str, help="File containing the references used in the analysis", ) - -args = parser.parse_args() - -print( - "Extracting references used for analysis and the samples associated with each reference\n" +parser.add_argument( + "-s", + "--single_csv", + type=str, + default="", + help="Transform a single csv file to excel format. 
Omit rest of processes" ) -with open(args.reference_file, "r") as file: - references = [line.rstrip() for line in file] - print(f"\nFound {len(references)} references: {str(references).strip('[]')}") - -reference_folders = {ref: str("excel_files_" + ref) for ref in references} -samples_ref_files = { - ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references -} +args = parser.parse_args() def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str): """Concatenate any tables that share the same header""" @@ -91,39 +87,66 @@ def excel_generator(csv_files: List[str]): print(f"File {file} does not exist, omitting...") continue print(f"Generating excel file for {file}") - output_name = str(file.split(".csv")[0] + ".xlsx") + output_name = os.path.splitext(os.path.basename(file))[0] + ".xlsx" # workbook = openpyxl.Workbook(output_name) if "nextclade" in str(file): - pd.read_csv(file, sep=";", header=0).to_excel(output_name, index=False) - elif "illumina" in str(file): + table = pd.read_csv(file, sep=";", header=0) + elif "illumina" in str(file) or ".tsv" in str(file): table = pd.read_csv(file, sep="\t", header=0) table["analysis_date"] = pd.to_datetime( table["analysis_date"].astype(str), format="%Y%m%d" ) - table.to_excel(output_name, index=False) elif "assembly" in str(file): - pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False) + table = pd.read_csv(file, sep="\t", header=0) else: - pd.read_csv(file).to_excel(output_name, index=False) - return file - - -# Merge pangolin and nextclade csv files separatedly and create excel files for them -merge_lineage_tables(reference_folders, samples_ref_files) -for reference, folder in reference_folders.items(): - print(f"Creating excel files for reference {reference}") - csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")] - excel_generator(csv_files) - -# Merge all the variant long tables into one and convert to excel format -variants_tables = [ - 
table.path for table in os.scandir(".") if "variants_long_table" in table.path -] -concat_tables_and_write( - csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv" -) -pd.read_csv("variants_long_table.csv").to_excel("variants_long_table.xlsx", index=False) + table = pd.read_csv(file) + table.drop(["index"], axis=1, errors="ignore") + table.to_excel(output_name, index=False) + return -# Create excel files for individual tables -result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"] -excel_generator(result_tables) +def single_csv_to_excel(csv_file): + excel_generator([csv_file]) + +def main(args): + if args.single_csv: + # If single_csv is called, just convert target csv to excel and skip the rest + print(f"Single file convertion selected. Skipping main process...") + single_csv_to_excel(args.single_csv) + exit(0) + + print( + "Extracting references used for analysis and the samples associated with each reference\n" + ) + with open(args.reference_file, "r") as file: + references = [line.rstrip() for line in file] + print(f"\nFound {len(references)} references: {str(references).strip('[]')}") + + reference_folders = {ref: str("excel_files_" + ref) for ref in references} + samples_ref_files = { + ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references + } + + # Merge pangolin and nextclade csv files separatedly and create excel files for them + merge_lineage_tables(reference_folders, samples_ref_files) + for reference, folder in reference_folders.items(): + print(f"Creating excel files for reference {reference}") + csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")] + excel_generator(csv_files) + + # Merge all the variant long tables into one and convert to excel format + variants_tables = [ + table.path for table in os.scandir(".") if "variants_long_table" in table.path + ] + concat_tables_and_write( + csvs_in_folder=variants_tables, 
merged_csv_name="variants_long_table.csv" + ) + # Create excel files for individual tables + valid_extensions = [".csv", ".tsv", ".tab"] + rest_of_csvs = [ + file.path for file in os.scandir(".") if any(file.path.endswith(ext) for ext in valid_extensions) + ] + excel_generator(rest_of_csvs) + + +if __name__ == "__main__": + main(args) From e3a4be1f0f2a853b40df5b4ff7691f0374134c23 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:07:49 +0100 Subject: [PATCH 05/11] symbolic link for flu-C --- bu_isciii/templates/IRMA/RESULTS/irma_results | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 bu_isciii/templates/IRMA/RESULTS/irma_results diff --git a/bu_isciii/templates/IRMA/RESULTS/irma_results b/bu_isciii/templates/IRMA/RESULTS/irma_results old mode 100644 new mode 100755 index 4c910758..a2a5bb33 --- a/bu_isciii/templates/IRMA/RESULTS/irma_results +++ b/bu_isciii/templates/IRMA/RESULTS/irma_results @@ -7,3 +7,4 @@ ln -s ../../ANALYSIS/*_MET/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/all_samples_completo.txt . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/A_H* . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/B . +ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/C . 
\ No newline at end of file From f67bc19498d7b34b9665392903463ef1f1996f22 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:08:17 +0100 Subject: [PATCH 06/11] introduced handling of flu-C in 04-irma lablog --- .../ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) mode change 100644 => 100755 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog old mode 100644 new mode 100755 index 33f3a273..540640fe --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog @@ -15,12 +15,21 @@ echo "cat HA_types.txt | while read in; do mkdir \${in}; done" >> _03_post_proce echo "mkdir B" >> _03_post_processing.sh +echo "mkdir C" >> _03_post_processing.sh + echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'A_' > A_fragment_list.txt" >> _03_post_processing.sh echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'B_' > B_fragment_list.txt" >> _03_post_processing.sh -echo 'cat HA_types.txt | while read type; do grep ${type} irma_stats.txt | cut -f1 | while read sample; do cat A_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g'; fi >> ${type}/${fragment}.txt; done; done; done' >> _03_post_processing.sh +echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' 
-f1 | cut -d '_' -f1,2 | sort -u | grep 'C_' > C_fragment_list.txt" >> _03_post_processing.sh + +echo 'cat HA_types.txt | while read type; do grep ${type} irma_stats.txt | cut -f1 | while read sample; do cat A_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed s@-@/@g | sed s/_A_/_/g ; fi >> ${type}/${fragment}.txt; done; done; done' >> _03_post_processing.sh + +echo 'grep -w 'B__' irma_stats.txt | cut -f1 | while read sample; do cat B_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_B_/_/g ; fi >> B/${fragment}.txt; done; done' >> _03_post_processing.sh + +echo 'grep -w 'C__' irma_stats.txt | cut -f1 | while read sample; do cat C_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_C_/_/g ; fi >> C/${fragment}.txt; done; done' >> _03_post_processing.sh -echo 'grep -w 'B_' irma_stats.txt | cut -f1 | while read sample; do cat B_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g; fi >> B/${fragment}.txt; done; done' >> _03_post_processing.sh +echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed 's@-@/@g' | sed 's/_A_/_/g' | sed 's/_B_/_/g' | sed 's/_C_/_/g' >> all_samples_completo.txt; done' >> _03_post_processing.sh -echo 'cat ../samples_id.txt | while read in; do cat 
${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' >> all_samples_completo.txt; done' >> _03_post_processing.sh +echo 'sed -i "s/__//g" irma_stats.txt' >> _03_post_processing.sh +echo 'sed -i "s/_\t/\t/g" irma_stats.txt' >> _03_post_processing.sh \ No newline at end of file From 7b58f453ee5f76b5e35f9724cbaa6942ec474a77 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:08:54 +0100 Subject: [PATCH 07/11] introduced handling of flu-C in create_irma_stats.sh --- .../04-irma/create_irma_stats.sh | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) mode change 100644 => 100755 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh old mode 100644 new mode 100755 index 89e072a5..93f0ffec --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh @@ -1 +1,33 @@ -echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt; cat ../samples_id.txt | while read in; do paste <(echo ${in}) <(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2) <(paste <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f1,3 | cut -d '-' -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3) | tr '\t' '_') <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_MP' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NP' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NS' 
${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PB1' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2); done >> irma_stats.txt + +echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt + +cat ../samples_id.txt | while read in +do +SAMPLE_ID=$(echo ${in}) +TOTAL_READS=$(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2) +MAPPEDREADS=$(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2) +FLU_TYPE=$(paste <(grep '4-[A-C]_MP' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f1 | cut -d '-' -f2) <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3 | cut -d '-' -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3) | tr '\t' '_') +HA=$(grep '4-[A-C]_HA' ${in}/tables/READ_COUNTS.txt | cut -f2) +MP=$(grep '4-[A-C]_MP' ${in}/tables/READ_COUNTS.txt | cut -f2) +NA=$(grep '4-[A-C]_NA' ${in}/tables/READ_COUNTS.txt | cut -f2) +NP=$(grep '4-[A-C]_NP' ${in}/tables/READ_COUNTS.txt | cut -f2) +NS=$(grep '4-[A-C]_NS' ${in}/tables/READ_COUNTS.txt | cut -f2) +PA=$(grep '4-[A-C]_PA' ${in}/tables/READ_COUNTS.txt | cut -f2) +PB1=$(grep '4-[A-C]_PB1' ${in}/tables/READ_COUNTS.txt | cut -f2) +PB2=$(grep '4-[A-C]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2) +#In case of Influenza C in samples: +HE=$(grep '4-C_HE' ${in}/tables/READ_COUNTS.txt | cut -f2) +if [[ -n "$HE" ]]; then + LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2) <(echo $HE)) +else + LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2)) +fi + +echo "$LINE" >> irma_stats.txt + 
+done + +ANY_C=$(grep "C_" irma_stats.txt) +if [[ -n "$ANY_C" ]]; then + sed -i 's/Reads_PB2/Reads_PB2\tReads_HE/g' irma_stats.txt +fi From 379d912227e59919f8b83527811f1bde170bee88 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 16 Jan 2024 11:11:46 +0100 Subject: [PATCH 08/11] New implementations in excel_generator.py --- .../viralrecon/RESULTS/excel_generator.py | 62 ++++++++++++++----- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index f03f0b95..b80eb373 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -20,11 +20,19 @@ "--single_csv", type=str, default="", - help="Transform a single csv file to excel format. Omit rest of processes" + help="Transform a single csv file to excel format. Omit rest of processes", +) +parser.add_argument( + "-l", + "--merge_lineage_files", + type=str, + default="", + help="Merge pangolin and nextclade lineage tables", ) args = parser.parse_args() + def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str): """Concatenate any tables that share the same header""" if len(csvs_in_folder) == 0: @@ -99,21 +107,30 @@ def excel_generator(csv_files: List[str]): elif "assembly" in str(file): table = pd.read_csv(file, sep="\t", header=0) else: - table = pd.read_csv(file) + try: + table = pd.read_csv(file) + except pd.errors.EmptyDataError: + print("Could not parse table from ", str(file)) + continue table.drop(["index"], axis=1, errors="ignore") table.to_excel(output_name, index=False) return -def single_csv_to_excel(csv_file): - excel_generator([csv_file]) + +def single_csv_to_excel(csv_file: str): + try: + excel_generator([csv_file]) + except FileNotFoundError as e: + print(f"Could not find file {e}") + def main(args): if args.single_csv: # If single_csv is called, just convert target csv to excel 
and skip the rest - print(f"Single file convertion selected. Skipping main process...") + print("Single file convertion selected. Skipping main process...") single_csv_to_excel(args.single_csv) exit(0) - + print( "Extracting references used for analysis and the samples associated with each reference\n" ) @@ -126,26 +143,37 @@ def main(args): ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references } - # Merge pangolin and nextclade csv files separatedly and create excel files for them - merge_lineage_tables(reference_folders, samples_ref_files) - for reference, folder in reference_folders.items(): - print(f"Creating excel files for reference {reference}") - csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")] - excel_generator(csv_files) + if args.merge_lineage_files: + # Merge pangolin and nextclade csv files separatedly and create excel files for them + merge_lineage_tables(reference_folders, samples_ref_files) + for reference, folder in reference_folders.items(): + print(f"Creating excel files for reference {reference}") + csv_files = [ + file.path for file in os.scandir(folder) if file.path.endswith(".csv") + ] + excel_generator(csv_files) # Merge all the variant long tables into one and convert to excel format variants_tables = [ table.path for table in os.scandir(".") if "variants_long_table" in table.path ] - concat_tables_and_write( - csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv" - ) + try: + concat_tables_and_write( + csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv" + ) + except FileNotFoundError as e: + print("Not variants_long_table found for ", str(e)) # Create excel files for individual tables valid_extensions = [".csv", ".tsv", ".tab"] rest_of_csvs = [ - file.path for file in os.scandir(".") if any(file.path.endswith(ext) for ext in valid_extensions) + file.path + for file in os.scandir(".") + if any(file.path.endswith(ext) for ext in valid_extensions) ] 
- excel_generator(rest_of_csvs) + link_csvs = [file for file in rest_of_csvs if os.path.islink(file)] + broken_links = [file for file in link_csvs if not os.path.exists(os.readlink(file))] + valid_csvs = [file for file in rest_of_csvs if file not in broken_links] + excel_generator(valid_csvs) if __name__ == "__main__": From ed6beba8f7b3530387f1140d643f765754e7f85a Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 16 Jan 2024 13:42:17 +0100 Subject: [PATCH 09/11] Changed blast symlink in viralrecon_results --- bu_isciii/templates/viralrecon/RESULTS/viralrecon_results | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results index 86875d4d..4531f3f3 100644 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -46,4 +46,4 @@ cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; d cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - -cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd - +cd blast; ln -s ../../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx . 
; cd - From eb8c72d3ad43911c14392fbeea00d6356f9a9106 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 16 Jan 2024 13:43:02 +0100 Subject: [PATCH 10/11] Implementations in excel_generator.py and error handling --- bu_isciii/templates/viralrecon/RESULTS/excel_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index b80eb373..b554a324 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -99,12 +99,12 @@ def excel_generator(csv_files: List[str]): # workbook = openpyxl.Workbook(output_name) if "nextclade" in str(file): table = pd.read_csv(file, sep=";", header=0) - elif "illumina" in str(file) or ".tsv" in str(file): + elif "illumina" in str(file): table = pd.read_csv(file, sep="\t", header=0) table["analysis_date"] = pd.to_datetime( table["analysis_date"].astype(str), format="%Y%m%d" ) - elif "assembly" in str(file): + elif "assembly" in str(file) or "tsv" in str(file) or "tab" in str(file): table = pd.read_csv(file, sep="\t", header=0) else: try: From 75736dee2dc7d9aac3d13cea56068fc76ddd8301 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 16 Jan 2024 14:02:19 +0100 Subject: [PATCH 11/11] No more blast folder in viralrecon_results --- bu_isciii/templates/viralrecon/RESULTS/viralrecon_results | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results index 4531f3f3..1b9f2275 100644 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -8,7 +8,6 @@ mkdir mapping_consensus mkdir variants_annot mkdir assembly_spades mkdir abacas_assembly -mkdir blast mkdir ref_samples #Setting up folder and files required for excel_generator.py @@ 
-46,4 +45,4 @@ cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; d cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - -cd blast; ln -s ../../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx . ; cd - +ln -s ../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx .