From ef1df075e79d9c57aa926bafb1275745c7f8d9b2 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:04:20 +0100 Subject: [PATCH 01/11] removed unnecessary line --- .../exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog index dcb39acd..2dd494ea 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog @@ -76,8 +76,6 @@ echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/bi/pipe #--------------------------------------------------------------------------------------------------------- -echo "srun --chdir /tmp --partition short_idx --nodelist ${EXOMISER_NODE} rm spring.log &" > _05_filter_heritance.sh - ## Lablog to modify the output reported by exomiser and create a final file with a personalized format. 
# Grep variant id for each inheritance model cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_heritance.sh @@ -91,4 +89,4 @@ echo "rm id_*" >> _05_filter_heritance.sh cat inheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_heritance.sh # annot_all table is huge, lets shrink it a little bit -echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_heritance.sh \ No newline at end of file +echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_heritance.sh From a4c59c6b6d8da2803fb951a514958eda58ad970f Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:05:16 +0100 Subject: [PATCH 02/11] commented PASS_ONLY line parameter --- .../ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml index b3b9d027..84bd59f2 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/exomiser_configfile.yml @@ -24,7 +24,7 @@ analysis: MITOCHONDRIAL: 100.0 } #FULL or PASS_ONLY - analysisMode: PASS_ONLY + #analysisMode: PASS_ONLY #Possible frequencySources: #Thousand Genomes project http://www.1000genomes.org/ # THOUSAND_GENOMES, From 032e34ea48791c56903c758e0ad0144753bd58f3 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:05:52 +0100 Subject: [PATCH 03/11] small changes in viralrecon_results script --- 
bu_isciii/templates/viralrecon/RESULTS/viralrecon_results | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results index 24125304..86875d4d 100644 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -27,13 +27,14 @@ ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv #conda activate viralrecon_report -echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh +echo "python ./excel_generator.py -r ./references.tmp" > _01_generate_excel_files.sh #Cleaning temp files and broken symbolic links echo "find . -xtype l -delete" > _02_clean_folders.sh echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh echo "find . -type d -empty -delete" >> _02_clean_folders.sh echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh +echo 'cat references.tmp | while read in; do rm ${in}_variants_long_table.xlsx; done' >> _02_clean_folders.sh echo "rm references.tmp" >> _02_clean_folders.sh echo "rm -rf ref_samples/" >> _02_clean_folders.sh echo "rm ./*.csv" >> _02_clean_folders.sh @@ -45,4 +46,4 @@ cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; d cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - cd abacas_assembly; cat 
../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - -cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd - \ No newline at end of file +cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd - From fc0810f154a64ac49e8135c00427607f897c19ca Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:07:09 +0100 Subject: [PATCH 04/11] introduced better error handling in excel_generator, now it can also be used for single files --- .../viralrecon/RESULTS/excel_generator.py | 105 +++++++++++------- 1 file changed, 64 insertions(+), 41 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index 55655950..f03f0b95 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -5,29 +5,25 @@ # conda activate viralrecon_report """Usage: python excel_generator.py ./reference.tmp""" +"""Single csv to excel Usage: python excel_generator.py -s csv_file.csv""" parser = argparse.ArgumentParser( description="Generate excel files from viralrecon results" ) parser.add_argument( - "reference_file", + "-r", + "--reference_file", type=str, help="File containing the references used in the analysis", ) - -args = parser.parse_args() - -print( - "Extracting references used for analysis and the samples associated with each reference\n" +parser.add_argument( + "-s", + "--single_csv", + type=str, + default="", + help="Transform a single csv file to excel format. 
Omit rest of processes" ) -with open(args.reference_file, "r") as file: - references = [line.rstrip() for line in file] - print(f"\nFound {len(references)} references: {str(references).strip('[]')}") - -reference_folders = {ref: str("excel_files_" + ref) for ref in references} -samples_ref_files = { - ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references -} +args = parser.parse_args() def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str): """Concatenate any tables that share the same header""" @@ -91,39 +87,66 @@ def excel_generator(csv_files: List[str]): print(f"File {file} does not exist, omitting...") continue print(f"Generating excel file for {file}") - output_name = str(file.split(".csv")[0] + ".xlsx") + output_name = os.path.splitext(os.path.basename(file))[0] + ".xlsx" # workbook = openpyxl.Workbook(output_name) if "nextclade" in str(file): - pd.read_csv(file, sep=";", header=0).to_excel(output_name, index=False) - elif "illumina" in str(file): + table = pd.read_csv(file, sep=";", header=0) + elif "illumina" in str(file) or ".tsv" in str(file): table = pd.read_csv(file, sep="\t", header=0) table["analysis_date"] = pd.to_datetime( table["analysis_date"].astype(str), format="%Y%m%d" ) - table.to_excel(output_name, index=False) elif "assembly" in str(file): - pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False) + table = pd.read_csv(file, sep="\t", header=0) else: - pd.read_csv(file).to_excel(output_name, index=False) - return file - - -# Merge pangolin and nextclade csv files separatedly and create excel files for them -merge_lineage_tables(reference_folders, samples_ref_files) -for reference, folder in reference_folders.items(): - print(f"Creating excel files for reference {reference}") - csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")] - excel_generator(csv_files) - -# Merge all the variant long tables into one and convert to excel format -variants_tables = [ - 
table.path for table in os.scandir(".") if "variants_long_table" in table.path -] -concat_tables_and_write( - csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv" -) -pd.read_csv("variants_long_table.csv").to_excel("variants_long_table.xlsx", index=False) + table = pd.read_csv(file) + table.drop(["index"], axis=1, errors="ignore") + table.to_excel(output_name, index=False) + return -# Create excel files for individual tables -result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"] -excel_generator(result_tables) +def single_csv_to_excel(csv_file): + excel_generator([csv_file]) + +def main(args): + if args.single_csv: + # If single_csv is called, just convert target csv to excel and skip the rest + print(f"Single file convertion selected. Skipping main process...") + single_csv_to_excel(args.single_csv) + exit(0) + + print( + "Extracting references used for analysis and the samples associated with each reference\n" + ) + with open(args.reference_file, "r") as file: + references = [line.rstrip() for line in file] + print(f"\nFound {len(references)} references: {str(references).strip('[]')}") + + reference_folders = {ref: str("excel_files_" + ref) for ref in references} + samples_ref_files = { + ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references + } + + # Merge pangolin and nextclade csv files separatedly and create excel files for them + merge_lineage_tables(reference_folders, samples_ref_files) + for reference, folder in reference_folders.items(): + print(f"Creating excel files for reference {reference}") + csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")] + excel_generator(csv_files) + + # Merge all the variant long tables into one and convert to excel format + variants_tables = [ + table.path for table in os.scandir(".") if "variants_long_table" in table.path + ] + concat_tables_and_write( + csvs_in_folder=variants_tables, 
merged_csv_name="variants_long_table.csv" + ) + # Create excel files for individual tables + valid_extensions = [".csv", ".tsv", ".tab"] + rest_of_csvs = [ + file.path for file in os.scandir(".") if any(file.path.endswith(ext) for ext in valid_extensions) + ] + excel_generator(rest_of_csvs) + + +if __name__ == "__main__": + main(args) From e3a4be1f0f2a853b40df5b4ff7691f0374134c23 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:07:49 +0100 Subject: [PATCH 05/11] symbolic link for flu-C --- bu_isciii/templates/IRMA/RESULTS/irma_results | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 bu_isciii/templates/IRMA/RESULTS/irma_results diff --git a/bu_isciii/templates/IRMA/RESULTS/irma_results b/bu_isciii/templates/IRMA/RESULTS/irma_results old mode 100644 new mode 100755 index 4c910758..a2a5bb33 --- a/bu_isciii/templates/IRMA/RESULTS/irma_results +++ b/bu_isciii/templates/IRMA/RESULTS/irma_results @@ -7,3 +7,4 @@ ln -s ../../ANALYSIS/*_MET/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/all_samples_completo.txt . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/A_H* . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/B . +ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/C . 
\ No newline at end of file From f67bc19498d7b34b9665392903463ef1f1996f22 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:08:17 +0100 Subject: [PATCH 06/11] introduced handling of flu-C in 04-irma lablog --- .../ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) mode change 100644 => 100755 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog old mode 100644 new mode 100755 index 33f3a273..540640fe --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog @@ -15,12 +15,21 @@ echo "cat HA_types.txt | while read in; do mkdir \${in}; done" >> _03_post_proce echo "mkdir B" >> _03_post_processing.sh +echo "mkdir C" >> _03_post_processing.sh + echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'A_' > A_fragment_list.txt" >> _03_post_processing.sh echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'B_' > B_fragment_list.txt" >> _03_post_processing.sh -echo 'cat HA_types.txt | while read type; do grep ${type} irma_stats.txt | cut -f1 | while read sample; do cat A_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g'; fi >> ${type}/${fragment}.txt; done; done; done' >> _03_post_processing.sh +echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' 
-f1 | cut -d '_' -f1,2 | sort -u | grep 'C_' > C_fragment_list.txt" >> _03_post_processing.sh + +echo 'cat HA_types.txt | while read type; do grep ${type} irma_stats.txt | cut -f1 | while read sample; do cat A_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed s@-@/@g | sed s/_A_/_/g ; fi >> ${type}/${fragment}.txt; done; done; done' >> _03_post_processing.sh + +echo 'grep -w 'B__' irma_stats.txt | cut -f1 | while read sample; do cat B_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_B_/_/g ; fi >> B/${fragment}.txt; done; done' >> _03_post_processing.sh + +echo 'grep -w 'C__' irma_stats.txt | cut -f1 | while read sample; do cat C_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_C_/_/g ; fi >> C/${fragment}.txt; done; done' >> _03_post_processing.sh -echo 'grep -w 'B_' irma_stats.txt | cut -f1 | while read sample; do cat B_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g; fi >> B/${fragment}.txt; done; done' >> _03_post_processing.sh +echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed 's@-@/@g' | sed 's/_A_/_/g' | sed 's/_B_/_/g' | sed 's/_C_/_/g' >> all_samples_completo.txt; done' >> _03_post_processing.sh -echo 'cat ../samples_id.txt | while read in; do cat 
${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' >> all_samples_completo.txt; done' >> _03_post_processing.sh +echo 'sed -i "s/__//g" irma_stats.txt' >> _03_post_processing.sh +echo 'sed -i "s/_\t/\t/g" irma_stats.txt' >> _03_post_processing.sh \ No newline at end of file From 7b58f453ee5f76b5e35f9724cbaa6942ec474a77 Mon Sep 17 00:00:00 2001 From: Shettland Date: Wed, 10 Jan 2024 10:08:54 +0100 Subject: [PATCH 07/11] introduced handling of flu-C in create_irma_stats.sh --- .../04-irma/create_irma_stats.sh | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) mode change 100644 => 100755 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh old mode 100644 new mode 100755 index 89e072a5..93f0ffec --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh @@ -1 +1,33 @@ -echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt; cat ../samples_id.txt | while read in; do paste <(echo ${in}) <(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2) <(paste <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f1,3 | cut -d '-' -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3) | tr '\t' '_') <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_MP' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NP' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NS' 
${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PB1' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2); done >> irma_stats.txt + +echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt + +cat ../samples_id.txt | while read in +do +SAMPLE_ID=$(echo ${in}) +TOTAL_READS=$(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2) +MAPPEDREADS=$(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2) +FLU_TYPE=$(paste <(grep '4-[A-C]_MP' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f1 | cut -d '-' -f2) <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3 | cut -d '-' -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3) | tr '\t' '_') +HA=$(grep '4-[A-C]_HA' ${in}/tables/READ_COUNTS.txt | cut -f2) +MP=$(grep '4-[A-C]_MP' ${in}/tables/READ_COUNTS.txt | cut -f2) +NA=$(grep '4-[A-C]_NA' ${in}/tables/READ_COUNTS.txt | cut -f2) +NP=$(grep '4-[A-C]_NP' ${in}/tables/READ_COUNTS.txt | cut -f2) +NS=$(grep '4-[A-C]_NS' ${in}/tables/READ_COUNTS.txt | cut -f2) +PA=$(grep '4-[A-C]_PA' ${in}/tables/READ_COUNTS.txt | cut -f2) +PB1=$(grep '4-[A-C]_PB1' ${in}/tables/READ_COUNTS.txt | cut -f2) +PB2=$(grep '4-[A-C]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2) +#In case of Influenza C in samples: +HE=$(grep '4-C_HE' ${in}/tables/READ_COUNTS.txt | cut -f2) +if [[ -n "$HE" ]]; then + LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2) <(echo $HE)) +else + LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2)) +fi + +echo "$LINE" >> irma_stats.txt + 
+done + +ANY_C=$(grep "C_" irma_stats.txt) +if [[ -n "$ANY_C" ]]; then + sed -i 's/Reads_PB2/Reads_PB2\tReads_HE/g' irma_stats.txt +fi From 379d912227e59919f8b83527811f1bde170bee88 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 16 Jan 2024 11:11:46 +0100 Subject: [PATCH 08/11] New implementations in excel_generator.py --- .../viralrecon/RESULTS/excel_generator.py | 62 ++++++++++++++----- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index f03f0b95..b80eb373 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -20,11 +20,19 @@ "--single_csv", type=str, default="", - help="Transform a single csv file to excel format. Omit rest of processes" + help="Transform a single csv file to excel format. Omit rest of processes", +) +parser.add_argument( + "-l", + "--merge_lineage_files", + type=str, + default="", + help="Merge pangolin and nextclade lineage tables", ) args = parser.parse_args() + def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str): """Concatenate any tables that share the same header""" if len(csvs_in_folder) == 0: @@ -99,21 +107,30 @@ def excel_generator(csv_files: List[str]): elif "assembly" in str(file): table = pd.read_csv(file, sep="\t", header=0) else: - table = pd.read_csv(file) + try: + table = pd.read_csv(file) + except pd.errors.EmptyDataError: + print("Could not parse table from ", str(file)) + continue table.drop(["index"], axis=1, errors="ignore") table.to_excel(output_name, index=False) return -def single_csv_to_excel(csv_file): - excel_generator([csv_file]) + +def single_csv_to_excel(csv_file: str): + try: + excel_generator([csv_file]) + except FileNotFoundError as e: + print(f"Could not find file {e}") + def main(args): if args.single_csv: # If single_csv is called, just convert target csv to excel 
and skip the rest - print(f"Single file convertion selected. Skipping main process...") + print("Single file convertion selected. Skipping main process...") single_csv_to_excel(args.single_csv) exit(0) - + print( "Extracting references used for analysis and the samples associated with each reference\n" ) @@ -126,26 +143,37 @@ def main(args): ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references } - # Merge pangolin and nextclade csv files separatedly and create excel files for them - merge_lineage_tables(reference_folders, samples_ref_files) - for reference, folder in reference_folders.items(): - print(f"Creating excel files for reference {reference}") - csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")] - excel_generator(csv_files) + if args.merge_lineage_files: + # Merge pangolin and nextclade csv files separatedly and create excel files for them + merge_lineage_tables(reference_folders, samples_ref_files) + for reference, folder in reference_folders.items(): + print(f"Creating excel files for reference {reference}") + csv_files = [ + file.path for file in os.scandir(folder) if file.path.endswith(".csv") + ] + excel_generator(csv_files) # Merge all the variant long tables into one and convert to excel format variants_tables = [ table.path for table in os.scandir(".") if "variants_long_table" in table.path ] - concat_tables_and_write( - csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv" - ) + try: + concat_tables_and_write( + csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv" + ) + except FileNotFoundError as e: + print("Not variants_long_table found for ", str(e)) # Create excel files for individual tables valid_extensions = [".csv", ".tsv", ".tab"] rest_of_csvs = [ - file.path for file in os.scandir(".") if any(file.path.endswith(ext) for ext in valid_extensions) + file.path + for file in os.scandir(".") + if any(file.path.endswith(ext) for ext in valid_extensions) ] 
- excel_generator(rest_of_csvs) + link_csvs = [file for file in rest_of_csvs if os.path.islink(file)] + broken_links = [file for file in link_csvs if not os.path.exists(os.readlink(file))] + valid_csvs = [file for file in rest_of_csvs if file not in broken_links] + excel_generator(valid_csvs) if __name__ == "__main__": From ed6beba8f7b3530387f1140d643f765754e7f85a Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 16 Jan 2024 13:42:17 +0100 Subject: [PATCH 09/11] Changed blast symlink in viralrecon_results --- bu_isciii/templates/viralrecon/RESULTS/viralrecon_results | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results index 86875d4d..4531f3f3 100644 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -46,4 +46,4 @@ cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; d cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - -cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd - +cd blast; ln -s ../../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx . 
; cd - From eb8c72d3ad43911c14392fbeea00d6356f9a9106 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 16 Jan 2024 13:43:02 +0100 Subject: [PATCH 10/11] Implementations in excel_generator.py and error handling --- bu_isciii/templates/viralrecon/RESULTS/excel_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index b80eb373..b554a324 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -99,12 +99,12 @@ def excel_generator(csv_files: List[str]): # workbook = openpyxl.Workbook(output_name) if "nextclade" in str(file): table = pd.read_csv(file, sep=";", header=0) - elif "illumina" in str(file) or ".tsv" in str(file): + elif "illumina" in str(file): table = pd.read_csv(file, sep="\t", header=0) table["analysis_date"] = pd.to_datetime( table["analysis_date"].astype(str), format="%Y%m%d" ) - elif "assembly" in str(file): + elif "assembly" in str(file) or "tsv" in str(file) or "tab" in str(file): table = pd.read_csv(file, sep="\t", header=0) else: try: From 75736dee2dc7d9aac3d13cea56068fc76ddd8301 Mon Sep 17 00:00:00 2001 From: Shettland Date: Tue, 16 Jan 2024 14:02:19 +0100 Subject: [PATCH 11/11] No more blast folder in viralrecon_results --- bu_isciii/templates/viralrecon/RESULTS/viralrecon_results | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results index 4531f3f3..1b9f2275 100644 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -8,7 +8,6 @@ mkdir mapping_consensus mkdir variants_annot mkdir assembly_spades mkdir abacas_assembly -mkdir blast mkdir ref_samples #Setting up folder and files required for excel_generator.py @@ 
-46,4 +45,4 @@ cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; d cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd - cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - -cd blast; ln -s ../../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx . ; cd - +ln -s ../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx .