diff --git a/CHANGELOG.md b/CHANGELOG.md
index 75499ea0b..a2ce31375 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,12 +13,8 @@ Code contributions to the new version:
### Template fixes and updates
- Updated documentation and results markdown for viralrecon, pikavirus and MAG [#247](https://github.com/BU-ISCIII/buisciii-tools/pull/247)
-
-### Modules
-
-#### Added enhancements
-
- Added documentation and results markdown for RNAseq [#248](https://github.com/BU-ISCIII/buisciii-tools/pull/248)
+- Added output and results documentation for plasmidID [#258](https://github.com/BU-ISCIII/buisciii-tools/pull/258)
- Added markdown of assembly analysis procedure [#244](https://github.com/BU-ISCIII/buisciii-tools/pull/244)
- Added output and results markdowns for ExomeEB, ExomeTrio and WGStrio [#249](https://github.com/BU-ISCIII/buisciii-tools/pull/249)
- Added markdown of assembly results folder [#250](https://github.com/BU-ISCIII/buisciii-tools/pull/250)
@@ -26,8 +22,18 @@ Code contributions to the new version:
- Added output and results markdowns for cgMLST/wgMLST [#255](https://github.com/BU-ISCIII/buisciii-tools/pull/255)
- Added markdown for IRMA [#256](https://github.com/BU-ISCIII/buisciii-tools/pull/256)
+### Modules
+
+#### Added enhancements
+
+### Modules
+
+#### Added enhancements
+
#### Fixes
+- Fixed archive module. Updated correct header for scout tsv [#258](https://github.com/BU-ISCIII/buisciii-tools/pull/258).
+
#### Changed
#### Removed
diff --git a/bu_isciii/archive.py b/bu_isciii/archive.py
index 09b65f246..9a83efc83 100644
--- a/bu_isciii/archive.py
+++ b/bu_isciii/archive.py
@@ -1037,7 +1037,7 @@ def generate_tsv_table(self, filename):
if self.services[service]["found_in_system"]
else "NOT found on iSkyLIMS"
)
- csv_dict["Delivery date"] = ""
+ csv_dict["Delivery date"] = self.services[service]["delivery_date"]
# Fields for archive
csv_dict["Path in archive"] = (
@@ -1045,7 +1045,7 @@ def generate_tsv_table(self, filename):
if self.services[service]["archived_path"] is not None
else "Archived path could not be generated"
)
- csv_dict["Found in archive"] = (
+ csv_dict["Found on archive"] = (
"Yes"
if "Archive" in self.services[service]["found"]
else "Not found in archive"
@@ -1077,12 +1077,12 @@ def generate_tsv_table(self, filename):
if "Data dir" in self.services[service]["found"]
else "Not found in data dir"
)
- csv_dict["Compressed size in data directory"] = (
+ csv_dict["Uncompressed size in data directory"] = (
self.services[service]["non_archived_size"]
if self.services[service]["non_archived_size"] != 0
else "Not calculated"
)
- csv_dict["Uncompressed size in data directory"] = (
+ csv_dict["Compressed size in data directory"] = (
self.services[service]["non_archived_compressed_size"]
if self.services[service]["non_archived_compressed_size"] != 0
else "Not calculated"
diff --git a/bu_isciii/assets/reports/md/images/KPN30_000240185_summary.png b/bu_isciii/assets/reports/md/images/KPN30_000240185_summary.png
new file mode 100644
index 000000000..e49bc9fcd
Binary files /dev/null and b/bu_isciii/assets/reports/md/images/KPN30_000240185_summary.png differ
diff --git a/bu_isciii/assets/reports/md/images/SEN30_000195995_NC_013365.1.png b/bu_isciii/assets/reports/md/images/SEN30_000195995_NC_013365.1.png
new file mode 100644
index 000000000..86a24bd26
Binary files /dev/null and b/bu_isciii/assets/reports/md/images/SEN30_000195995_NC_013365.1.png differ
diff --git a/bu_isciii/assets/reports/md/plasmidid.md b/bu_isciii/assets/reports/md/plasmidid.md
new file mode 100755
index 000000000..90e371d72
--- /dev/null
+++ b/bu_isciii/assets/reports/md/plasmidid.md
@@ -0,0 +1,169 @@
+# PlasmidID
+
+This document describes the output produced by the pipeline.
+
+The directories listed below will be created in the analysis directory after the pipeline has finished. All paths are relative to the top-level results directory.
+
+- [PlasmidID](#plasmidid)
+ - [Preprocessing](#preprocessing)
+ - [Assembly](#assembly)
+ - [PlasmidID](#plasmidid-1)
+ - [Clustering (mash) results](#clustering-mash-results)
+ - [Prokka annotation](#prokka-annotation)
+ - [Mapping against found plasmids](#mapping-againts-found-plasmids)
+ - [PlasmidID data for circos](#plasmidid-data-for-circos)
+ - [Circos images](#circos-images)
+ - [Reconstructed plasmid sequences](#reconstructed-plasmid-sequences)
+ - [Summary report](#summary-report)
+
+## Preprocessing
+
+Preprocessing is performed using the assembly template. Check that doc for reference.
+
+## Assembly
+
+Assembly steps are done using the assembly template. Check that doc for reference.
+
+## PlasmidID
+
+[PlasmidID](https://github.com/BU-ISCIII/plasmidID) v1.6.5 is a mapping-based, assembly-assisted plasmid identification tool that analyzes and provides a graphical solution for plasmid identification.
+
+PlasmidID is a computational pipeline that maps Illumina reads over plasmid database sequences. The k-mer filtered, most covered sequences are clustered by identity to avoid redundancy and the longest are used as scaffold for plasmid reconstruction. Reads are assembled and annotated by automatic and specific annotation. All information generated from mapping, assembly, annotation and local alignment analyses is gathered and accurately represented in a circular image which allows the user to determine the plasmid composition of any bacterial sample.
+
+### Clustering (mash) results
+
+Mash is employed to pinpoint the plasmids present in the sample using a specific database. Subsequently, Mash calculates the genetic distances between these identified plasmids. These distances are then used to group similar plasmids into clusters. From each cluster, the longest plasmid is selected to represent the group in subsequent analyses.
+
+
+Output files description
+
+`NO_GROUP/kmer`
+ - database.filtered_XX: Contains identifiers for plasmids identified by Mash that exhibit a distance value greater than a specified threshold (e.g., 0.95).
+ - database.filtered_XX_term.XX.clusters.tab: A tabulated file listing the clusters of plasmids grouped based on their genetic similarities.
+ - database.filtered_XX_term.XX.representative.fasta: The FASTA formatted sequence file of the longest plasmids selected as representatives for each cluster.
+ - database.filtered_XX_term.XX.representative.fasta.*.bt2: Bowtie2 index files for the representative FASTA sequences.
+ - database.filtered_XX_term.fasta: the FASTA formatted sequences from the plasmids in database.filtered_XX.
+ - database.filtered_XX_term.mash.distances.tab: Tabulated data of Mash-calculated distances between the filtered plasmid sequences, used for clustering.
+ - database.msh: The Mash sketch file of the database, which is a compact binary representation of the set of plasmids used for quick distance estimation.
+ - database.screen.tab: Output file listing the results of the Mash screen operation, which compares the sample against the database to find matching plasmids.
+
+
+### Prokka annotation
+
+Provided assemblies are automatically annotated using Prokka.
+
+
+Output files description
+
+`NO_GROUP/database`
+Prokka output files can be found [here](https://github.com/tseemann/prokka?tab=readme-ov-file#output-files)
+ - SAMPLE_NAME.err
+ - SAMPLE_NAME.fna
+ - SAMPLE_NAME.gff
+ - SAMPLE_NAME.gff.renamed
+ - SAMPLE_NAME.gff.bed: gff in bed format
+ - SAMPLE_NAME.gff.reverse.bed: only reverse genes
+ - SAMPLE_NAME.gff.forward.bed: only forward genes
+ - SAMPLE_NAME.sqn
+ - SAMPLE_NAME.txt
+ - SAMPLE_NAME.faa
+ - SAMPLE_NAME.fsa
+ - SAMPLE_NAME.tbl
+ - SAMPLE_NAME.ffn
+ - SAMPLE_NAME.gbk
+ - SAMPLE_NAME.log
+ - SAMPLE_NAME.tsv
+
+
+### Mapping againts found plasmids
+
+Once we have selected the representative plasmids that may be present in the sample, Bowtie2 is employed to map the raw sequencing reads against these plasmid sequences in FASTA format. Plasmids that achieve more than 80% coverage are retained for further analysis. Coverage metrics are calculated and recorded in temporary output files.
+
+
+Output files description
+
+`NO_GROUP/mapping`
+ - SAMPLE_NAME.coverage: Contains initial coverage bedgraph data for each plasmid
+ - SAMPLE_NAME.coverage_adapted: Adjusted coverage mean for each plasmid
+ - SAMPLE_NAME.coverage_adapted_clustered: adjusted coverage mean filtered with more than 80% coverage.
+ - SAMPLE_NAME.coverage_adapted_clustered_ac: identifiers of the filtered plasmids
+ - SAMPLE_NAME.coverage_adapted_clustered_percentage: coverage data for each plasmid as a percentage (1-value)
+ - SAMPLE_NAME.coverage_adapted_filtered_80: Lists plasmids with coverage exceeding 80%, selected for subsequent analysis.
+ - SAMPLE_NAME.coverage_adapted_filtered_80_term.fasta: FASTA formatted file containing sequences of plasmids with more than 80% coverage.
+ - SAMPLE_NAME.coverage_adapted_filtered_80_term.fasta.blast.tmp.*: Temporary BLAST files for sequences that have passed the 80% coverage threshold, used for further comparative analysis.
+ - SAMPLE_NAME.sorted.bam: BAM file of aligned reads sorted by coordinates.
+ - SAMPLE_NAME.sorted.bam.bai: Index file for the sorted BAM file, facilitating faster data retrieval.
+
+
+### PlasmidID data for circos
+
+Blast is employed to annotate selected databases by comparing them against the assemblies. Additionally, each contig within the assembly is aligned with the identified plasmids. The necessary files for Circos visualization are created in the data folder.
+
+
+Output files description
+
+`NO_GROUP/SAMPLE_NAME/data`
+ - pID_highlights.conf: genes highlights for circos.
+ - pID_text_annotation.coordinates: text annotation coordinates for circos.
+ - SAMPLE_NAME.bedgraph: bedgraph coverage for each plasmid
+ - SAMPLE_NAME.bedgraph_term: filtered bedgraph coverage for each plasmid
+ - SAMPLE_NAME.DB.bed: blast result in bed format
+ - SAMPLE_NAME.DB.blast: for each annotation database blast result against the assembly
+ - SAMPLE_NAME.DB.coordinates: blast result with the coordinates needed for the circos image
+ - SAMPLE_NAME.fna.blast.tmp.*: blast tmp database files
+ - SAMPLE_NAME.gff.forward.coordinates: gff coordinates for forward genes for annotation track
+ - SAMPLE_NAME.gff.reverse.coordinates: gff coordinates for reverse genes for annotation track
+ - SAMPLE_NAME.karyotype_individual.txt: karyotype template for each plasmid individual image
+ - SAMPLE_NAME.karyotype_summary.txt: karyotype circos file for summary image
+ - SAMPLE_NAME.plasmids.bed: blast result plasmids in bed format.
+ - SAMPLE_NAME.plasmids.blast: blast result contigs against identified plasmids.
+ - SAMPLE_NAME.plasmids.blast.links: blast result for links for contigs that match different plasmids.
+ - SAMPLE_NAME.plasmids.complete: complete track information for circos
+ - SAMPLE_NAME.plasmids.links: links for contigs that match different plasmids.
+
+
+### Circos images
+
+Circos is used for creating one image for each identified plasmid and a summary image with all the plasmids identified in one figure. A manual for image interpretation can be found [here](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track) and a manual about how to select the correct plasmid can be found [here](https://github.com/BU-ISCIII/plasmidID/wiki/How-to-chose-the-right-plasmids).
+
+
+
+
+
+Output files description
+
+`NO_GROUP/images`
+
+- SAMPLE_NAME_PLASMID_individual.circos.conf: circos conf file used for generating the individual image
+- SAMPLE_NAME_PLASMID.png: circos image for individual plasmidID
+- SAMPLE_NAME_summary.circos.conf: circos conf file used for generating the summary image
+- SAMPLE_NAME_summary.png: summary image
+
+
+
+### Reconstructed plasmid sequences
+
+A multifasta file is created for each plasmid, including all the contig sequences that have matched the identified plasmid.
+
+
+Output files description
+
+`NO_GROUP/fasta_files`
+
+- PLASMID_term.fasta: multifasta file for each plasmid identified in the sample.
+
+
+
+### Summary report
+
+A summary report consolidating all samples in the analysis is created.
+
+
+Output files description
+
+`NO_GROUP`
+
+- `NO_GROUP_final_results.html`: report with the same information as the table below, viewable in a web browser (e.g. Chrome).
+- `NO_GROUP_final_results.tab`: plasmid info for each sample.
+
+
diff --git a/bu_isciii/assets/reports/results/plasmidid.md b/bu_isciii/assets/reports/results/plasmidid.md
new file mode 100644
index 000000000..047dddc5e
--- /dev/null
+++ b/bu_isciii/assets/reports/results/plasmidid.md
@@ -0,0 +1,27 @@
+# PlasmidID
+
+Here we describe the results from the PlasmidID pipeline for plasmid identification and reconstruction.
+
+> [!WARNING]
+> Some of the files listed here may not be in your `RESULTS` folder. It will depend on the analysis you requested.
+
+## Summary report
+
+A summary report consolidating all samples in the analysis is created.
+
+- `NO_GROUP_final_results.html`: report with the same information as the table below, viewable in a web browser (e.g. Chrome).
+- `NO_GROUP_final_results.tab`: plasmid info for each sample. Header columns are described here:
+ - id: plasmid unique identifier for each entry.
+ - length: The length of the plasmid sequence.
+ - species description: A description of the species from which the sequence originates.
+ - fraction_covered: The fraction of the sequence that is covered by alignments.
+ - contig_name: The name of the contigs associated with the sequence.
+ - percentage: The percentage of the genome or sequence that is covered.
+ - images: Links or references to related images or visual data.
+
+## Circos images
+
+Circos is used for creating one image for each identified plasmid and a summary image with all the plasmids identified in one figure. A manual for image interpretation can be found [here](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track) and a manual about how to select the correct plasmid can be found [here](https://github.com/BU-ISCIII/plasmidID/wiki/How-to-chose-the-right-plasmids).
+
+- `images/SAMPLE_NAME_PLASMID_individual.png`: circos image for individual plasmidID
+- `images/SAMPLE_NAME_summary_image.png`: summary image
\ No newline at end of file
diff --git a/bu_isciii/assets/reports/results/viralrecon.md b/bu_isciii/assets/reports/results/viralrecon.md
index f58232d11..48533c3de 100644
--- a/bu_isciii/assets/reports/results/viralrecon.md
+++ b/bu_isciii/assets/reports/results/viralrecon.md
@@ -1,70 +1,70 @@
-## Viralrecon
+# Viralrecon
Here we describe the results from the Viralrecon pipeline for viral genome reconstruction.
> [!WARNING]
> Some of the files listed here may not be in your `RESULTS` folder. It will depend on the analysis you requested.
-### Mapping approach results
+## Mapping approach results
* mapping_illumina.xlsx: statistics for mapped reads against viral and host genomes.
- - run: Run name
- - user: User name
- - host: Host name
- - Virussequence: Reference virus used
- - sample: Sample name
- - totalreads: Total reads after trimming
- - readshostR1: Total reads of host genome in R1
- - readshost: Total reads of host genome in R1 and R2
- - %readshost: Percentage of reads that correspond to the host genome
- - readsvirus: Number of reference viral genome reads
- - %readsvirus: Percentage of reference viral genome reads
- - unmappedreads: number of reads that did not correspond to viral reference or host genome.
- - %unmapedreads: Percentage of reads that did not correspond to viral reference or host genome.
- - medianDPcoveragevirus: Median depth of coverage of the reference viral genome
- - Coverage>10x(%): Percentage of viral reference genome coverage to more than 10X
- - Variantsinconsensusx10: Number of variants included in the consensus after filtering: more than 10X and 0.75 AF
- - %Ns10x: Percentage of consesus genome masked due to having less than 10X depth
- - Lineage: Pangolin assigned lineage. *Warning: Only for SARS-CoV-2 sequencing data*
- - Date: Analysis date. *Warning: Only for SARS-CoV-2 sequencing data*
+ * run: Run name
+ * user: User name
+ * host: Host name
+ * Virussequence: Reference virus used
+ * sample: Sample name
+ * totalreads: Total reads after trimming
+ * readshostR1: Total reads of host genome in R1
+ * readshost: Total reads of host genome in R1 and R2
+ * %readshost: Percentage of reads that correspond to the host genome
+ * readsvirus: Number of reference viral genome reads
+ * %readsvirus: Percentage of reference viral genome reads
+ * unmappedreads: number of reads that did not correspond to viral reference or host genome.
+ * %unmapedreads: Percentage of reads that did not correspond to viral reference or host genome.
+ * medianDPcoveragevirus: Median depth of coverage of the reference viral genome
+ * Coverage>10x(%): Percentage of viral reference genome coverage to more than 10X
+ * Variantsinconsensusx10: Number of variants included in the consensus after filtering: more than 10X and 0.75 AF
+ * %Ns10x: Percentage of consensus genome masked due to having less than 10X depth
+ * Lineage: Pangolin assigned lineage. *Warning: Only for SARS-CoV-2 sequencing data*
+ * Date: Analysis date. *Warning: Only for SARS-CoV-2 sequencing data*
* mapping_consensus: this folder contains the masked (<10x) genomes obtained with consensus sequences using mapping and majority variant calling
* variants_annot: table with all annotated variants. *Warning: Only when annotation .gff file was provided*
* variants_long_table.xlsx: Table with variants for all the samples in long format. *Warning: Only when annotation .gff file was provided*
- - SAMPLE: sample name
- - CHROM: Reference ID
- - POS: Position of the variant
- - REF: Ref allele
- - ALT: Alt allele
- - FILTER: Column indicating if the variant passed the filters. If PASS the variant passed all the filters. If not, the name of the filter that wasn't passed will appear
- - DP: Position depth
- - REF_DP: Ref allele depth
- - ALT_DP: Alt allele depth
- - AF: Allele frequency
- - GENE: Gene name in annotation file
- - EFFECT: Effect of the variant
- - HGVS_C: Position annotation at CDS level
- - HGVS_P: Position annotation at protein level
- - HGVS_P_1LETTER: Position annotation at protein level with the aminoacid annotation in 1 letter format
- - Caller: Variant caller used
+ * SAMPLE: sample name
+ * CHROM: Reference ID
+ * POS: Position of the variant
+ * REF: Ref allele
+ * ALT: Alt allele
+ * FILTER: Column indicating if the variant passed the filters. If PASS the variant passed all the filters. If not, the name of the filter that wasn't passed will appear
+ * DP: Position depth
+ * REF_DP: Ref allele depth
+ * ALT_DP: Alt allele depth
+ * AF: Allele frequency
+ * GENE: Gene name in annotation file
+ * EFFECT: Effect of the variant
+ * HGVS_C: Position annotation at CDS level
+ * HGVS_P: Position annotation at protein level
+ * HGVS_P_1LETTER: Position annotation at protein level with the aminoacid annotation in 1 letter format
+ * Caller: Variant caller used
* pangolin.xlsx: Pangolin complete results *Warning: Only for SARS-CoV-2 sequencing data*
* nextclade.xlsx: Results from Nextclade *Warning: Only for SARS-CoV-2 sequencing data*
-### *de novo* assembly approach results
+## *de novo* assembly approach results
* assembly_stats.xlsx: Stats of the *de novo* assembly steps. This table contains the following columns:
- - run: Run name
- - user: User name
- - host: Host name
- - Virussequence: Reference virus used
- - sample: Sample name
- - totalreads: Total reads after trimming
- - readshostR1: Total reads of host genome in R1
- - readshost: Total reads of host genome in R1 and R2
- - %readshost: Percentage of reads that correspond to the host genome
- - Non-host-reedas: Number of reads remaining after host removal
- - \#Contigs: Number of contigs in the assembly
- - Largest contig: Size in nucleotides of the larges contig in the assembly
- - % Genome fraction: Percentage of the reference genome covered by the assembly. *Warning: Only when reference genome was provided*
+ * run: Run name
+ * user: User name
+ * host: Host name
+ * Virussequence: Reference virus used
+ * sample: Sample name
+ * totalreads: Total reads after trimming
+ * readshostR1: Total reads of host genome in R1
+ * readshost: Total reads of host genome in R1 and R2
+ * %readshost: Percentage of reads that correspond to the host genome
+ * Non-host-reedas: Number of reads remaining after host removal
+ * \#Contigs: Number of contigs in the assembly
+ * Largest contig: Size in nucleotides of the largest contig in the assembly
+ * % Genome fraction: Percentage of the reference genome covered by the assembly. *Warning: Only when reference genome was provided*
* assembly_spades: Scaffolds fasta files with the spades de novo assembly. *Warning: Only when NO reference genome was provided, or reference genome didn't match*
* abacas_assembly: spades de novo assembly where contigs were contiguated using ABACAS and the reference genome. *Warning: Only when reference genome was provided*
diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json
index 794c3d0e2..b7b8eefb9 100755
--- a/bu_isciii/templates/services.json
+++ b/bu_isciii/templates/services.json
@@ -80,8 +80,8 @@
"files":[]
},
"no_copy": ["RAW", "TMP"],
- "delivery_md": "",
- "results_md": ""
+ "delivery_md": "assets/reports/md/plasmidid.md",
+ "results_md": "assets/reports/results/plasmidid.md"
},
"wgmlst_taranis": {
"label": "",
diff --git a/bu_isciii/utils.py b/bu_isciii/utils.py
index 67541434a..6bcab2c98 100755
--- a/bu_isciii/utils.py
+++ b/bu_isciii/utils.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+import logging
import calendar
import datetime
import hashlib
@@ -28,6 +29,7 @@ def rich_force_colors():
return None
+log = logging.getLogger(__name__)
stderr = rich.console.Console(
stderr=True, style="dim", highlight=False, force_terminal=rich_force_colors()
)
@@ -267,10 +269,13 @@ def get_dir_size(path):
for path, dirs, files in os.walk(path):
for file in files:
- if os.path.islink(os.path.join(path, file)):
- size += os.lstat(os.path.join(path, file)).st_size
- else:
- size += os.path.getsize(os.path.join(path, file))
+ try:
+ if os.path.islink(os.path.join(path, file)):
+ size += os.lstat(os.path.join(path, file)).st_size
+ else:
+ size += os.path.getsize(os.path.join(path, file))
+ except FileNotFoundError as e:
+ log.warning(f"File not found error while scouting size: {e}")
return size