Introduced handling of Influenza-C in IRMA template and small fixes in viralrecon and exometrio #177

Merged (11 commits) on Jan 16, 2024
34 changes: 33 additions & 1 deletion bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh
100644 → 100755
@@ -1 +1,33 @@
echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt; cat ../samples_id.txt | while read in; do paste <(echo ${in}) <(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2) <(paste <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f1,3 | cut -d '-' -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3) | tr '\t' '_') <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_MP' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NP' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_NS' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PA' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PB1' ${in}/tables/READ_COUNTS.txt | cut -f2) <(grep '4-[A-B]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2); done >> irma_stats.txt

echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt

cat ../samples_id.txt | while read in
do
SAMPLE_ID=$(echo ${in})
TOTAL_READS=$(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2)
MAPPEDREADS=$(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2)
FLU_TYPE=$(paste <(grep '4-[A-C]_MP' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f1 | cut -d '-' -f2) <(grep '4-[A-B]_HA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3 | cut -d '-' -f2) <(grep '4-[A-B]_NA' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f3) | tr '\t' '_')
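# e.g. MP typing yields "A", HA yields "H3", NA yields "N2" -> Flu_type "A_H3_N2".
# For Influenza C the two [A-B] greps match nothing, so the value becomes "C__"
# (the trailing underscores are stripped later by the seds in the lablog).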
HA=$(grep '4-[A-C]_HA' ${in}/tables/READ_COUNTS.txt | cut -f2)
MP=$(grep '4-[A-C]_MP' ${in}/tables/READ_COUNTS.txt | cut -f2)
NA=$(grep '4-[A-C]_NA' ${in}/tables/READ_COUNTS.txt | cut -f2)
NP=$(grep '4-[A-C]_NP' ${in}/tables/READ_COUNTS.txt | cut -f2)
NS=$(grep '4-[A-C]_NS' ${in}/tables/READ_COUNTS.txt | cut -f2)
PA=$(grep '4-[A-C]_PA' ${in}/tables/READ_COUNTS.txt | cut -f2)
PB1=$(grep '4-[A-C]_PB1' ${in}/tables/READ_COUNTS.txt | cut -f2)
PB2=$(grep '4-[A-C]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2)
# In case Influenza C is present in the samples:
HE=$(grep '4-C_HE' ${in}/tables/READ_COUNTS.txt | cut -f2)
if [[ -n "$HE" ]]; then
LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2) <(echo $HE))
else
LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2))
fi

echo "$LINE" >> irma_stats.txt

done

ANY_C=$(grep "C_" irma_stats.txt)
if [[ -n "$ANY_C" ]]; then
sed -i 's/Reads_PB2/Reads_PB2\tReads_HE/g' irma_stats.txt
fi
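
# A rough sketch of the resulting irma_stats.txt (tab-separated; read counts
# invented for illustration, middle gene columns elided; the Reads_HE column
# only appears when an Influenza C sample was typed):
#
#   sample_ID   TotalReads  MappedReads  Flu_type  ...  Reads_PB2  Reads_HE
#   sampleA01   150000      122000       A_H3_N2   ...  11800
#   sampleC01   98000       81000        C__       ...  7400       8600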
15 changes: 12 additions & 3 deletions bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog
100644 → 100755
@@ -15,12 +15,21 @@ echo "cat HA_types.txt | while read in; do mkdir \${in}; done" >> _03_post_processing.sh

echo "mkdir B" >> _03_post_processing.sh

echo "mkdir C" >> _03_post_processing.sh

echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'A_' > A_fragment_list.txt" >> _03_post_processing.sh

echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'B_' > B_fragment_list.txt" >> _03_post_processing.sh

echo 'cat HA_types.txt | while read type; do grep ${type} irma_stats.txt | cut -f1 | while read sample; do cat A_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g'; fi >> ${type}/${fragment}.txt; done; done; done' >> _03_post_processing.sh
echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'C_' > C_fragment_list.txt" >> _03_post_processing.sh

echo 'cat HA_types.txt | while read type; do grep ${type} irma_stats.txt | cut -f1 | while read sample; do cat A_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed s@-@/@g | sed s/_A_/_/g ; fi >> ${type}/${fragment}.txt; done; done; done' >> _03_post_processing.sh

echo 'grep -w 'B__' irma_stats.txt | cut -f1 | while read sample; do cat B_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_B_/_/g ; fi >> B/${fragment}.txt; done; done' >> _03_post_processing.sh

echo 'grep -w 'C__' irma_stats.txt | cut -f1 | while read sample; do cat C_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_C_/_/g ; fi >> C/${fragment}.txt; done; done' >> _03_post_processing.sh

echo 'grep -w 'B_' irma_stats.txt | cut -f1 | while read sample; do cat B_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g; fi >> B/${fragment}.txt; done; done' >> _03_post_processing.sh
echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed 's@-@/@g' | sed 's/_A_/_/g' | sed 's/_B_/_/g' | sed 's/_C_/_/g' >> all_samples_completo.txt; done' >> _03_post_processing.sh

echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' >> all_samples_completo.txt; done' >> _03_post_processing.sh
echo 'sed -i "s/__//g" irma_stats.txt' >> _03_post_processing.sh
echo 'sed -i "s/_\t/\t/g" irma_stats.txt' >> _03_post_processing.sh
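
# Sanity check of what the consolidation sed chains above do to a hypothetical
# IRMA FASTA header (sample name invented):
#   echo ">A_HA_H3" | sed "s/^>/\>sample01_/g" | sed 's/_H3//g' | sed 's@-@/@g' | sed 's/_A_/_/g'
#   # prints: >sample01_HA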
1 change: 1 addition & 0 deletions bu_isciii/templates/IRMA/RESULTS/irma_results
100644 → 100755
@@ -7,3 +7,4 @@ ln -s ../../ANALYSIS/*_MET/99-stats/multiqc_report.html ./krona_results.html
ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/all_samples_completo.txt .
ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/A_H* .
ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/B .
ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/C .
@@ -24,7 +24,7 @@ analysis:
MITOCHONDRIAL: 100.0
}
#FULL or PASS_ONLY
analysisMode: PASS_ONLY
#analysisMode: PASS_ONLY
#Possible frequencySources:
#Thousand Genomes project http://www.1000genomes.org/
# THOUSAND_GENOMES,
@@ -76,8 +76,6 @@ echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/bi/pipe
#---------------------------------------------------------------------------------------------------------


echo "srun --chdir /tmp --partition short_idx --nodelist ${EXOMISER_NODE} rm spring.log &" > _05_filter_heritance.sh

## Lablog to modify the output reported by exomiser and create a final file with a personalized format.
# Grep variant id for each inheritance model
cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_heritance.sh
@@ -91,4 +89,4 @@ echo "rm id_*" >> _05_filter_heritance.sh
cat inheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_heritance.sh

# annot_all table is huge, let's shrink it a little bit
echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_heritance.sh
131 changes: 91 additions & 40 deletions bu_isciii/templates/viralrecon/RESULTS/excel_generator.py
@@ -5,28 +5,32 @@

# conda activate viralrecon_report
"""Usage: python excel_generator.py ./reference.tmp"""
"""Single csv to excel Usage: python excel_generator.py -s csv_file.csv"""
parser = argparse.ArgumentParser(
description="Generate excel files from viralrecon results"
)
parser.add_argument(
"reference_file",
"-r",
"--reference_file",
type=str,
help="File containing the references used in the analysis",
)

args = parser.parse_args()

print(
"Extracting references used for analysis and the samples associated with each reference\n"
parser.add_argument(
"-s",
"--single_csv",
type=str,
default="",
help="Transform a single csv file to excel format. Omit rest of processes",
)
parser.add_argument(
"-l",
"--merge_lineage_files",
type=str,
default="",
help="Merge pangolin and nextclade lineage tables",
)
with open(args.reference_file, "r") as file:
references = [line.rstrip() for line in file]
print(f"\nFound {len(references)} references: {str(references).strip('[]')}")

reference_folders = {ref: str("excel_files_" + ref) for ref in references}
samples_ref_files = {
ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references
}
args = parser.parse_args()
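
# For reference, the invocation modes implied by these arguments (file names are placeholders):
#   python excel_generator.py -r reference.tmp          # full report mode
#   python excel_generator.py -r reference.tmp -l yes   # also merge pangolin/nextclade lineage tables (any non-empty value enables it)
#   python excel_generator.py -s mapping_illumina.csv   # convert a single csv and exit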


def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str):
@@ -91,39 +95,86 @@ def excel_generator(csv_files: List[str]):
print(f"File {file} does not exist, omitting...")
continue
print(f"Generating excel file for {file}")
output_name = str(file.split(".csv")[0] + ".xlsx")
output_name = os.path.splitext(os.path.basename(file))[0] + ".xlsx"
# workbook = openpyxl.Workbook(output_name)
if "nextclade" in str(file):
pd.read_csv(file, sep=";", header=0).to_excel(output_name, index=False)
table = pd.read_csv(file, sep=";", header=0)
elif "illumina" in str(file):
table = pd.read_csv(file, sep="\t", header=0)
table["analysis_date"] = pd.to_datetime(
table["analysis_date"].astype(str), format="%Y%m%d"
)
table.to_excel(output_name, index=False)
elif "assembly" in str(file):
pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False)
elif "assembly" in str(file) or "tsv" in str(file) or "tab" in str(file):
table = pd.read_csv(file, sep="\t", header=0)
else:
pd.read_csv(file).to_excel(output_name, index=False)
return file


# Merge pangolin and nextclade csv files separately and create excel files for them
merge_lineage_tables(reference_folders, samples_ref_files)
for reference, folder in reference_folders.items():
print(f"Creating excel files for reference {reference}")
csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")]
excel_generator(csv_files)

# Merge all the variant long tables into one and convert to excel format
variants_tables = [
table.path for table in os.scandir(".") if "variants_long_table" in table.path
]
concat_tables_and_write(
csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv"
)
pd.read_csv("variants_long_table.csv").to_excel("variants_long_table.xlsx", index=False)
try:
table = pd.read_csv(file)
except pd.errors.EmptyDataError:
print("Could not parse table from ", str(file))
continue
table = table.drop(["index"], axis=1, errors="ignore")  # drop() returns a new frame, so keep it
table.to_excel(output_name, index=False)
return


# Create excel files for individual tables
result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"]
excel_generator(result_tables)
def single_csv_to_excel(csv_file: str):
try:
excel_generator([csv_file])
except FileNotFoundError as e:
print(f"Could not find file {e}")


def main(args):
if args.single_csv:
# If single_csv is called, just convert target csv to excel and skip the rest
print("Single file conversion selected. Skipping main process...")
single_csv_to_excel(args.single_csv)
exit(0)

print(
"Extracting references used for analysis and the samples associated with each reference\n"
)
with open(args.reference_file, "r") as file:
references = [line.rstrip() for line in file]
print(f"\nFound {len(references)} references: {str(references).strip('[]')}")

reference_folders = {ref: str("excel_files_" + ref) for ref in references}
samples_ref_files = {
ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references
}

if args.merge_lineage_files:
# Merge pangolin and nextclade csv files separately and create excel files for them
merge_lineage_tables(reference_folders, samples_ref_files)
for reference, folder in reference_folders.items():
print(f"Creating excel files for reference {reference}")
csv_files = [
file.path for file in os.scandir(folder) if file.path.endswith(".csv")
]
excel_generator(csv_files)

# Merge all the variant long tables into one and convert to excel format
variants_tables = [
table.path for table in os.scandir(".") if "variants_long_table" in table.path
]
try:
concat_tables_and_write(
csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv"
)
except FileNotFoundError as e:
print("No variants_long_table found:", str(e))
# Create excel files for individual tables
valid_extensions = [".csv", ".tsv", ".tab"]
rest_of_csvs = [
file.path
for file in os.scandir(".")
if any(file.path.endswith(ext) for ext in valid_extensions)
]
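# skip csvs that are symlinks pointing at files that no longer exist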
link_csvs = [file for file in rest_of_csvs if os.path.islink(file)]
broken_links = [file for file in link_csvs if not os.path.exists(os.readlink(file))]
valid_csvs = [file for file in rest_of_csvs if file not in broken_links]
excel_generator(valid_csvs)


if __name__ == "__main__":
main(args)
5 changes: 3 additions & 2 deletions bu_isciii/templates/viralrecon/RESULTS/viralrecon_results
@@ -27,13 +27,14 @@ ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv
ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv

#conda activate viralrecon_report
echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh
echo "python ./excel_generator.py -r ./references.tmp" > _01_generate_excel_files.sh
#Cleaning temp files and broken symbolic links
echo "find . -xtype l -delete" > _02_clean_folders.sh
echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh
echo "find . -type d -empty -delete" >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do rm ${in}_variants_long_table.xlsx; done' >> _02_clean_folders.sh
echo "rm references.tmp" >> _02_clean_folders.sh
echo "rm -rf ref_samples/" >> _02_clean_folders.sh
echo "rm ./*.csv" >> _02_clean_folders.sh
@@ -45,4 +46,4 @@ cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.consensus.fa ./${arr[0]}_${arr[1]}.consensus.fa; done; cd -
cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd -
cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd -
cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd -
cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd -
cd blast; ln -s ../../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx . ; cd -