Merge pull request #140 from phac-nml/inx_id
Added IRIDA Next sample field option
apetkau authored Nov 26, 2024
2 parents 9ed27b8 + ae296a3 commit f853ce8
Showing 19 changed files with 479 additions and 44 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.md
@@ -5,10 +5,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### `Changed`
### `Added`

- Added RASUSA for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)

- Added a new `sample_name` field to the `schema_input.json` file: [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

- Incorporated a `--skip_read_merging` parameter to prevent read merging [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

### `Changed`

- Added a `sample_name` field; `sample` still exists but is now used to incorporate additional names/identifiers in IRIDA Next [PR 140](https://github.com/phac-nml/mikrokondo/pull/140)

- RASUSA now used for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125)


### `Updated`

- Documentation and workflow diagram has been updated. [PR 123](https://github.com/phac-nml/mikrokondo/pull/123)
7 changes: 6 additions & 1 deletion assets/schema_input.json
@@ -10,7 +10,12 @@
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["external_id"],
"errorMessage": "Sample name to be used in report generation. Valid characters include alphanumeric and -. All other characters will be replaced by underscores."
},
"sample_name": {
"type": "string",
"errorMessage": "Optional. Used to override sample when used in tools like IRIDA-Next. Valid characters include alphanumeric and -. All other characters will be replaced by underscores.",
"meta": ["id"]
},
"fastq_1": {
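With these schema changes, a minimal sample sheet might look like the sketch below (hypothetical sample names and paths). Per the `meta` tags above, the `sample` column is carried as the IRIDA Next `external_id`, while the optional `sample_name` column becomes the internal `id`:

```csv
sample,sample_name,fastq_1,fastq_2
ABC-123,isolate-1,/data/isolate-1_R1.fastq.gz,/data/isolate-1_R2.fastq.gz
ABC-124,isolate-2,/data/isolate-2_R1.fastq.gz,/data/isolate-2_R2.fastq.gz
```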
33 changes: 23 additions & 10 deletions bin/report_summaries.py
@@ -37,9 +37,10 @@ class JsonImport:
__keep_keys = frozenset(__key_order.keys())
__delimiter = "\t"
__key_delimiter = "."
__inx_irida_key = "meta.external_id"

def __init__(self, report_fp, output_name, sample_suffix):
self.tool_data = None # TODO set this in output of group tool fields
self.tool_data = None
self.output_name = output_name
self.output_transposed = os.path.splitext(os.path.basename(self.output_name))[0] + "_transposed.tsv"
self.output_dir = os.path.dirname(self.output_name)
@@ -49,7 +50,7 @@ def __init__(self, report_fp, output_name, sample_suffix):
self.flat_sample_string = sample_suffix
self.data = self.ingest_report(self.report_fp)
self.flat_data, self.common_fields, self.tool_fields, self.table = self.flatten_json(self.data)
self.output_indv_json(self.flat_data)
self.flat_data = self.output_indv_json(self.flat_data)
self.output_flat_json(self.flat_data)
self.write_table(self.table)

@@ -64,7 +65,6 @@ def write_table(self, table_data: Dict[str, Dict[str, str]]):
"""
keys = set([k for k in table_data])
ordered_keys = []

# Get the wanted information to the top of the page
poisoned_keys = set()
for option in self.__key_order:
@@ -79,7 +79,6 @@ def write_table(self, table_data: Dict[str, Dict[str, str]]):
ordered_keys.extend(scalar_keys)
ordered_keys.extend(sorted([i for i in keys if i not in ordered_keys and i not in poisoned_keys]))
row_labels = sorted([i for i in next(iter(table_data.values()))])

self.write_tsv(table_data, row_labels, ordered_keys)
self.write_transposed_tsv(table_data, row_labels, ordered_keys)

@@ -233,7 +232,6 @@ def remove_prefix_id_fields(self, flattened_dict):
top_level_keys.add(item_key)
temp[item_key] = v

#self.tool_data = tool_data
return reformatted_data, top_level_keys, tool_keys


@@ -242,7 +240,7 @@ def ingest_report(self, report_fp):
report_fp: File path to the json report to be read in
"""
data = None
with open(report_fp, "r", encoding="utf8") as report:
with open(report_fp, "r") as report:
data = json.load(report)
return data

@@ -262,11 +260,27 @@ def output_indv_json(self, flattened_data):
Args:
flattened_data (json: Dict[sample_id: Dict[tool_info: value]]):
"""
updated_items = dict()
for k, v in flattened_data.items():
with open(os.path.join(self.output_dir, k + self.flat_sample_string), "w") as output:
out_key = k
sample_dir = k
dir_name = v.get(self.__inx_irida_key)
if k != dir_name:
sample_dir = dir_name
#! this field affects the identification of the irida next id being passed out of the pipeline
out_key = sample_dir # this field must be overwritten for iridanext to identify the correct metadata field
out_dir = os.path.join(self.output_dir, sample_dir)
out_path = os.path.join(out_dir, k + self.flat_sample_string)
if not os.path.isdir(out_dir): # Check for directory existence, as it will still exist on pipeline resumes
os.mkdir(out_dir)

with open(out_path, "w") as output:
json_data = json.dumps({k: v}, indent=2)
output.write(json_data)
updated_items[out_key] = v

flattened_data = updated_items
return flattened_data

def to_file(self):
with open(self.output_name, "w") as out_file:
@@ -282,7 +296,6 @@ def to_file(self):
out_file.write(f'"{val_write}"')
else:
out_file.write(val_write)
# out_file.write(str(ii[1][i]).replace('\n', ' \\'))
out_file.write(self.__delimiter)
out_file.write("\n")

@@ -291,7 +304,7 @@ def to_file(self):



def main_(args_in):
def main(args_in):
default_samp_suffix = "_flat_sample.json"
parser = argparse.ArgumentParser("Table Summary")
parser.add_argument("-f", "--file-in", help="Path to the mikrokondo json summary")
@@ -307,4 +320,4 @@ def main_(args_in):

if __name__ == "__main__":
# pass json file to program to parse it
main_(sys.argv[1:])
main(sys.argv[1:])
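In short, `output_indv_json` now writes each sample's flattened JSON into a per-sample directory named after its IRIDA Next external ID, and re-keys the returned dictionary by that ID. A self-contained Python sketch of that logic (simplified from the diff above; the `meta.external_id` key and file suffix match the constants in the script):

```python
import json
import os


def write_indv_json(output_dir, sample_key, fields, suffix="_flat_sample.json"):
    """Sketch: nest each flattened sample JSON in a directory named after the
    IRIDA Next external id, falling back to the sample key when absent."""
    external_id = fields.get("meta.external_id") or sample_key
    out_dir = os.path.join(output_dir, external_id)
    os.makedirs(out_dir, exist_ok=True)  # may already exist on pipeline resume
    with open(os.path.join(out_dir, sample_key + suffix), "w") as output:
        output.write(json.dumps({sample_key: fields}, indent=2))
    return external_id  # key under which IRIDA Next picks up this sample's metadata
```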
2 changes: 1 addition & 1 deletion conf/irida_next.config
@@ -11,7 +11,7 @@ iridanext {
overwrite = true
validate = false
files {
idkey = "sample"
idkey = 'external_id' // Previously sample
global = [
"**/FinalReports/Aggregated/Json/final_report.json",
"**/FinalReports/Aggregated/Tables/final_report.tsv"
11 changes: 7 additions & 4 deletions docs/usage/usage.md
@@ -23,32 +23,33 @@ Mikrokondo requires a sample sheet to be run. This FOFN (file of file names) con
- long_reads
- assembly

> **Note:** Illegal characters (i.e. characters matching the expression `[^A-Za-z0-9_\-]`) in the sample name will be replaced with underscores.
Example layouts for different sample-sheets include:

_Illumina paired-end data_

|sample|fastq_1|fastq_2|
|------|-------|-------|
|sample_name|path_to_forward_reads|path_to_reversed_reads|
|sample|path_to_forward_reads|path_to_reversed_reads|

_Nanopore_

|sample|long_reads|
|------|----------|
|sample_name|path_to_reads|
|sample|path_to_reads|

_Hybrid Assembly_

|sample|fastq_1|fastq_2|long_reads|
|-------|-------|------|----------|
|sample_name|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|
|sample|path_to_forward_reads|path_to_reversed_reads|path_to_long_reads|

_Starting with assembly only_

|sample|assembly|
|------|--------|
|sample_name|path_to_assembly|
|sample|path_to_assembly|

_Example merging paired-end data_

@@ -96,6 +97,8 @@ _Example merging paired-end data_
Numerous steps within mikrokondo can be turned off without compromising the stability of the pipeline. These skip options can reduce the run time of the pipeline or allow it to complete despite errors.
**All of the following options can be turned on by adding `--{skip_option} true` to the pipeline's command-line arguments; see the example after this list.**


- `--skip_read_merging`: Do not merge reads; if duplicate sample names are present, the names will be made unique.
- `--skip_abricate`: turn off Abricate AMR detection.
- `--skip_bakta`: turn off the Bakta annotation step (generally slow, and requires a database to be specified).
- `--skip_checkm`: turn off CheckM, which is used as part of contamination detection within mikrokondo; its run time and resource usage can be substantial.
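For example, a run that opts out of read merging and CheckM might look like the following (hypothetical input/output paths and profile):

```bash
nextflow run phac-nml/mikrokondo \
    --input samplesheet.csv \
    --outdir results \
    -profile docker \
    --skip_read_merging true \
    --skip_checkm true
```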
13 changes: 6 additions & 7 deletions main.nf
@@ -42,9 +42,6 @@ if (params.help) {
if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }





/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
NAMED WORKFLOW FOR PIPELINE
@@ -111,15 +108,17 @@ workflow MIKROKONDO {
REPORT_AGGREGATE(REPORT.out.final_report)
ch_versions = ch_versions.mix(REPORT_AGGREGATE.out.versions)


updated_samples = REPORT_AGGREGATE.out.flat_samples.flatten().map{
sample ->
def name_trim = sample.getName()
def trimmed_name = name_trim.substring(0, name_trim.length() - params.report_aggregate.sample_flat_suffix.length())
tuple([
def external_id_name = sample.getParent().getBaseName()
def output_map = [
"id": trimmed_name,
"sample": trimmed_name],
sample)
"sample": trimmed_name,
"external_id": external_id_name]

tuple(output_map, sample)
}

GZIP_FILES(updated_samples)
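As a worked example of the mapping above, using a hypothetical flattened-sample path and plain Groovy (`File` standing in for Nextflow's `Path`):

```groovy
// REPORT_AGGREGATE now emits each flattened sample JSON inside a directory
// named after the IRIDA Next external id, e.g. Reports/ABC-123/sample1_flat_sample.json
def sample = new File("Reports/ABC-123/sample1_flat_sample.json")
def suffix = "_flat_sample.json"

def name = sample.getName()                                          // sample1_flat_sample.json
def trimmedName = name.substring(0, name.length() - suffix.length()) // sample1
def externalId = sample.getParentFile().getName()                    // ABC-123

assert [id: trimmedName, sample: trimmedName, external_id: externalId] ==
       [id: 'sample1', sample: 'sample1', external_id: 'ABC-123']
```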
8 changes: 4 additions & 4 deletions modules/local/combine_data.nf
@@ -20,16 +20,16 @@ process COMBINE_DATA{
def fields_merge = meta.fields_merge

if(fastq_1){
cmd_ << "cat ${meta.fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;"
cmd_ << "cat ${fastq_1.join(' ')} > out/${prefix}_R1.merged.fastq.gz;"
}
if(fastq_2){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;"
cmd_ << "cat ${fastq_2.join(' ')} > out/${prefix}_R2.merged.fastq.gz;"
}
if(long_reads){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;"
cmd_ << "cat ${long_reads.join(' ')} > out/${prefix}.merged.fastq.gz;"
}
if(assembly){
cmd_ << "cat ${meta.fastq_2.join(' ')} > out/${prefix}.merged.fastq.gz;"
cmd_ << "cat ${assembly.join(' ')} > out/${prefix}.merged.fastq.gz;"
}
def cmd = cmd_.join("\n")
// creating dummy outputs so that all outputs exist for any scenario
5 changes: 3 additions & 2 deletions modules/local/report.nf
@@ -43,11 +43,13 @@ process REPORT{

if(!sample_data.containsKey(meta_data.sample)){
sample_data[meta_data.sample] = [:]
// TODO add strings to constants file
sample_data[meta_data.sample]["meta"] = [:]
}

update_map_values(sample_data, meta_data, "metagenomic")
update_map_values(sample_data, meta_data, "id")
update_map_values(sample_data, meta_data, "sample")
update_map_values(sample_data, meta_data, "external_id")
update_map_values(sample_data, meta_data, "assembly")
update_map_values(sample_data, meta_data, "hybrid")
update_map_values(sample_data, meta_data, "single_end")
@@ -63,7 +65,6 @@
if(!check_file_params(report_tag, extension)){
continue
}
// TODO pass in report metadata
def output_data = parse_data(report_value, extension, report_tag, headers_list)
if(output_data){
report_value = output_data
2 changes: 1 addition & 1 deletion modules/local/report_aggregate.nf
@@ -14,7 +14,7 @@ process REPORT_AGGREGATE{
path("final_report.tsv"), emit: final_report
path("final_report_transposed.tsv"), emit: final_report_transposed
path("final_report_flattened.json"), emit: flattened_files
path("*${sample_flat_suffix}"), emit: flat_samples
path("*/*${sample_flat_suffix}"), emit: flat_samples
path "versions.yml", emit: versions

script:
9 changes: 4 additions & 5 deletions nextflow.config
@@ -7,6 +7,7 @@
*/



// Global default params, used in configs
params {
// Input options
@@ -43,12 +44,11 @@ params {
show_hidden_params = false
validationS3PathCheck = true
validationShowHiddenParams = false
validationSchemaIgnoreParams = 'locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
validationSchemaIgnoreParams = 'rasusa,locidex_summary,allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
validationFailUnrecognisedParams = false // for the qcreport fields

// SKIP options
// TODO need to add constants section
// TODO investigate usage of template scripts to replace mash modules
skip_read_merging = true
skip_report = false
skip_raw_read_metrics = false
skip_version_gathering = false
Expand All @@ -58,7 +58,7 @@ params {
skip_checkm = false
skip_depth_sampling = false // TODO have it mentioned that this should be turned off for metagenomic runs
skip_ont_header_cleaning = true // TODO an awk script can likely replace this and be much faster at what it does...
skip_polishing = false // TODO make it clear this does not apply to Hybrid assembly
skip_polishing = false
skip_species_classification = false
skip_mlst = false
skip_mobrecon = false
@@ -146,7 +146,6 @@ params {
version = false



// If a param in camel case is present nextflow automatically creates a kebab case parameter as well

stage_in_mode = 'symlink'
6 changes: 6 additions & 0 deletions nextflow_schema.json
@@ -380,6 +380,12 @@
"type": "boolean",
"description": "Do not enter the subtyping workflow, e.g. ECTyper, SISTR etc will not be ran."
},
"skip_read_merging": {
"type": "boolean",
"default": true,
"description": "Do not merge reads",
"hidden": true
},
"skip_bakta": {
"type": "boolean",
"default": true,
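Note the `default: true` above: read merging stays off unless explicitly requested. Re-enabling merging through a Nextflow params file (passed with `-params-file`) would presumably look like:

```json
{
    "skip_read_merging": false
}
```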