Introduced handling of Influenza-C in IRMA template and small fixes in viralrecon and exometrio #177

Merged (11 commits) on Jan 16, 2024
34 changes: 33 additions & 1 deletion bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh
100644 → 100755
@@ -1 +1,33 @@
echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt; cat ../samples_id.txt | while read in; do paste <(echo ${in}) <(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2) <(paste <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f1,3 | cut -d '-' -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3) | tr '\t' '_') <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_MP' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NP' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NS' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PB1' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2); done >> irma_stats.txt

echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt

cat ../samples_id.txt | while read in
do
SAMPLE_ID=$(echo ${in})
TOTAL_READS=$(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2)
MAPPEDREADS=$(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2)
FLU_TYPE=$(paste <(grep '4-[A-C]_MP' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f1 | cut -d '-' -f2) <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3 | cut -d '-' -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3) | tr '\t' '_')
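# e.g. MP typing yields "A", HA yields "H3", NA yields "N2" -> Flu_type "A_H3_N2".
# For Influenza C the two [A-B] greps match nothing, so the value becomes "C__"
# (the trailing underscores are stripped later by the seds in the lablog).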
HA=$(grep '4-[A-C]_HA' ${in}/tables/READ_COUNTS.txt | cut -f2)
MP=$(grep '4-[A-C]_MP' ${in}/tables/READ_COUNTS.txt | cut -f2)
NA=$(grep '4-[A-C]_NA' ${in}/tables/READ_COUNTS.txt | cut -f2)
NP=$(grep '4-[A-C]_NP' ${in}/tables/READ_COUNTS.txt | cut -f2)
NS=$(grep '4-[A-C]_NS' ${in}/tables/READ_COUNTS.txt | cut -f2)
PA=$(grep '4-[A-C]_PA' ${in}/tables/READ_COUNTS.txt | cut -f2)
PB1=$(grep '4-[A-C]_PB1' ${in}/tables/READ_COUNTS.txt | cut -f2)
PB2=$(grep '4-[A-C]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2)
# In case Influenza C is present in the samples:
HE=$(grep '4-C_HE' ${in}/tables/READ_COUNTS.txt | cut -f2)
if [[ -n "$HE" ]]; then
LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2) <(echo $HE))
else
LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2))
fi

echo "$LINE" >> irma_stats.txt

done

ANY_C=$(grep "C_" irma_stats.txt)
if [[ -n "$ANY_C" ]]; then
sed -i 's/Reads_PB2/Reads_PB2\tReads_HE/g' irma_stats.txt
fi
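
# A rough sketch of the resulting irma_stats.txt (tab-separated; read counts
# invented for illustration, middle gene columns elided; the Reads_HE column
# only appears when an Influenza C sample was typed):
#
#   sample_ID   TotalReads  MappedReads  Flu_type  ...  Reads_PB2  Reads_HE
#   sampleA01   150000      122000       A_H3_N2   ...  11800
#   sampleC01   98000       81000        C__       ...  7400       8600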
15 changes: 12 additions & 3 deletions bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog
100644 → 100755
@@ -15,12 +15,21 @@ echo "cat HA_types.txt | while read in; do mkdir \${in}; done" >> _03_post_processing.sh

echo "mkdir B" >> _03_post_processing.sh

echo "mkdir C" >> _03_post_processing.sh

echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'A_' > A_fragment_list.txt" >> _03_post_processing.sh

echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'B_' > B_fragment_list.txt" >> _03_post_processing.sh

echo 'cat HA_types.txt | while read type; do grep ${type} irma_stats.txt | cut -f1 | while read sample; do cat A_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g'; fi >> ${type}/${fragment}.txt; done; done; done' >> _03_post_processing.sh
echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'C_' > C_fragment_list.txt" >> _03_post_processing.sh

echo 'cat HA_types.txt | while read type; do grep ${type} irma_stats.txt | cut -f1 | while read sample; do cat A_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed s@-@/@g | sed s/_A_/_/g ; fi >> ${type}/${fragment}.txt; done; done; done' >> _03_post_processing.sh

echo 'grep -w 'B__' irma_stats.txt | cut -f1 | while read sample; do cat B_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_B_/_/g ; fi >> B/${fragment}.txt; done; done' >> _03_post_processing.sh

echo 'grep -w 'C__' irma_stats.txt | cut -f1 | while read sample; do cat C_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_C_/_/g ; fi >> C/${fragment}.txt; done; done' >> _03_post_processing.sh

echo 'grep -w 'B_' irma_stats.txt | cut -f1 | while read sample; do cat B_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g; fi >> B/${fragment}.txt; done; done' >> _03_post_processing.sh
echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed 's@-@/@g' | sed 's/_A_/_/g' | sed 's/_B_/_/g' | sed 's/_C_/_/g' >> all_samples_completo.txt; done' >> _03_post_processing.sh

echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' >> all_samples_completo.txt; done' >> _03_post_processing.sh
echo 'sed -i "s/__//g" irma_stats.txt' >> _03_post_processing.sh
echo 'sed -i "s/_\t/\t/g" irma_stats.txt' >> _03_post_processing.sh
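
# Sanity check of what the consolidation sed chains above do to a hypothetical
# IRMA FASTA header (sample name invented):
#   echo ">A_HA_H3" | sed "s/^>/\>sample01_/g" | sed 's/_H3//g' | sed 's@-@/@g' | sed 's/_A_/_/g'
#   # prints: >sample01_HA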
1 change: 1 addition & 0 deletions bu_isciii/templates/IRMA/RESULTS/irma_results
100644 → 100755
@@ -7,3 +7,4 @@ ln -s ../../ANALYSIS/*_MET/99-stats/multiqc_report.html ./krona_results.html
ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/all_samples_completo.txt .
ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/A_H* .
ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/B .
ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/C .
@@ -24,7 +24,7 @@ analysis:
MITOCHONDRIAL: 100.0
}
#FULL or PASS_ONLY
analysisMode: PASS_ONLY
#analysisMode: PASS_ONLY
#Possible frequencySources:
#Thousand Genomes project http://www.1000genomes.org/
# THOUSAND_GENOMES,
@@ -76,8 +76,6 @@ echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/bi/pipe
#---------------------------------------------------------------------------------------------------------


echo "srun --chdir /tmp --partition short_idx --nodelist ${EXOMISER_NODE} rm spring.log &" > _05_filter_heritance.sh

## Lablog to modify the output reported by exomiser and create a final file with a personalized format.
# Grep variant id for each inheritance model
cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_heritance.sh
@@ -91,4 +89,4 @@ echo "rm id_*" >> _05_filter_heritance.sh
cat inheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_heritance.sh

# annot_all table is huge, let's shrink it a little bit
echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_heritance.sh
131 changes: 91 additions & 40 deletions bu_isciii/templates/viralrecon/RESULTS/excel_generator.py
@@ -5,28 +5,32 @@

# conda activate viralrecon_report
"""Usage: python excel_generator.py ./reference.tmp"""
"""Single csv to excel Usage: python excel_generator.py -s csv_file.csv"""
parser = argparse.ArgumentParser(
description="Generate excel files from viralrecon results"
)
parser.add_argument(
"reference_file",
"-r",
"--reference_file",
type=str,
help="File containing the references used in the analysis",
)

args = parser.parse_args()

print(
"Extracting references used for analysis and the samples associated with each reference\n"
parser.add_argument(
"-s",
"--single_csv",
type=str,
default="",
help="Transform a single csv file to excel format. Omit rest of processes",
)
parser.add_argument(
"-l",
"--merge_lineage_files",
type=str,
default="",
help="Merge pangolin and nextclade lineage tables",
)
with open(args.reference_file, "r") as file:
references = [line.rstrip() for line in file]
print(f"\nFound {len(references)} references: {str(references).strip('[]')}")

reference_folders = {ref: str("excel_files_" + ref) for ref in references}
samples_ref_files = {
ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references
}
args = parser.parse_args()
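
# For reference, the invocation modes implied by these arguments (file names are placeholders):
#   python excel_generator.py -r reference.tmp          # full report mode
#   python excel_generator.py -r reference.tmp -l yes   # also merge pangolin/nextclade lineage tables (any non-empty value enables it)
#   python excel_generator.py -s mapping_illumina.csv   # convert a single csv and exit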


def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str):
@@ -91,39 +95,86 @@ def excel_generator(csv_files: List[str]):
print(f"File {file} does not exist, omitting...")
continue
print(f"Generating excel file for {file}")
output_name = str(file.split(".csv")[0] + ".xlsx")
output_name = os.path.splitext(os.path.basename(file))[0] + ".xlsx"
# workbook = openpyxl.Workbook(output_name)
if "nextclade" in str(file):
pd.read_csv(file, sep=";", header=0).to_excel(output_name, index=False)
table = pd.read_csv(file, sep=";", header=0)
elif "illumina" in str(file):
table = pd.read_csv(file, sep="\t", header=0)
table["analysis_date"] = pd.to_datetime(
table["analysis_date"].astype(str), format="%Y%m%d"
)
table.to_excel(output_name, index=False)
elif "assembly" in str(file):
pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False)
elif "assembly" in str(file) or "tsv" in str(file) or "tab" in str(file):
table = pd.read_csv(file, sep="\t", header=0)
else:
pd.read_csv(file).to_excel(output_name, index=False)
return file


# Merge pangolin and nextclade csv files separately and create excel files for them
merge_lineage_tables(reference_folders, samples_ref_files)
for reference, folder in reference_folders.items():
print(f"Creating excel files for reference {reference}")
csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")]
excel_generator(csv_files)

# Merge all the variant long tables into one and convert to excel format
variants_tables = [
table.path for table in os.scandir(".") if "variants_long_table" in table.path
]
concat_tables_and_write(
csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv"
)
pd.read_csv("variants_long_table.csv").to_excel("variants_long_table.xlsx", index=False)
try:
table = pd.read_csv(file)
except pd.errors.EmptyDataError:
print("Could not parse table from ", str(file))
continue
table = table.drop(["index"], axis=1, errors="ignore")  # drop() returns a new frame, so keep it
table.to_excel(output_name, index=False)
return


# Create excel files for individual tables
result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"]
excel_generator(result_tables)
def single_csv_to_excel(csv_file: str):
try:
excel_generator([csv_file])
except FileNotFoundError as e:
print(f"Could not find file {e}")


def main(args):
if args.single_csv:
# If single_csv is called, just convert target csv to excel and skip the rest
print("Single file conversion selected. Skipping main process...")
single_csv_to_excel(args.single_csv)
exit(0)

print(
"Extracting references used for analysis and the samples associated with each reference\n"
)
with open(args.reference_file, "r") as file:
references = [line.rstrip() for line in file]
print(f"\nFound {len(references)} references: {str(references).strip('[]')}")

reference_folders = {ref: str("excel_files_" + ref) for ref in references}
samples_ref_files = {
ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references
}

if args.merge_lineage_files:
# Merge pangolin and nextclade csv files separately and create excel files for them
merge_lineage_tables(reference_folders, samples_ref_files)
for reference, folder in reference_folders.items():
print(f"Creating excel files for reference {reference}")
csv_files = [
file.path for file in os.scandir(folder) if file.path.endswith(".csv")
]
excel_generator(csv_files)

# Merge all the variant long tables into one and convert to excel format
variants_tables = [
table.path for table in os.scandir(".") if "variants_long_table" in table.path
]
try:
concat_tables_and_write(
csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv"
)
except FileNotFoundError as e:
print("No variants_long_table found:", str(e))
# Create excel files for individual tables
valid_extensions = [".csv", ".tsv", ".tab"]
rest_of_csvs = [
file.path
for file in os.scandir(".")
if any(file.path.endswith(ext) for ext in valid_extensions)
]
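# skip csvs that are symlinks pointing at files that no longer exist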
link_csvs = [file for file in rest_of_csvs if os.path.islink(file)]
broken_links = [file for file in link_csvs if not os.path.exists(os.readlink(file))]
valid_csvs = [file for file in rest_of_csvs if file not in broken_links]
excel_generator(valid_csvs)


if __name__ == "__main__":
main(args)
5 changes: 3 additions & 2 deletions bu_isciii/templates/viralrecon/RESULTS/viralrecon_results
@@ -27,13 +27,14 @@ ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv
ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv

#conda activate viralrecon_report
echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh
echo "python ./excel_generator.py -r ./references.tmp" > _01_generate_excel_files.sh
#Cleaning temp files and broken symbolic links
echo "find . -xtype l -delete" > _02_clean_folders.sh
echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh
echo "find . -type d -empty -delete" >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do rm ${in}_variants_long_table.xlsx; done' >> _02_clean_folders.sh
echo "rm references.tmp" >> _02_clean_folders.sh
echo "rm -rf ref_samples/" >> _02_clean_folders.sh
echo "rm ./*.csv" >> _02_clean_folders.sh
@@ -45,4 +46,4 @@ cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.consensus.fa ./${arr[0]}_${arr[1]}.consensus.fa; done; cd -
cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd -
cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd -
cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd -
cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd -
cd blast; ln -s ../../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx . ; cd -