--use-id output filename stems

ian-small · Oct 17, 2024 · 332c95b · 332c95b
1 parent 50362f2
commit 332c95b
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 53 deletions.
diff --git a/README.md b/README.md
@@ -47,30 +47,32 @@ You can run Chloë from the terminal. To access the annotator help manual use:
 julia --project=. chloe.jl annotate --help
 ```
 
+Equivalently you can invoke Chloe with:
+
+```bash
+julia --project=. -e 'using Chloe; chloe_main()' annotate --help
+```
+
+
 For annotating single sequences (e.g. the test genome `NC_020019.1.fa` available in the folder `testfa`) with the default output in `.gff` format:
 ```bash
 julia --project=. chloe.jl annotate testfa/NC_020019.1.fa
 ```
 
-For annotating all fasta file in a directory ending with `.fa` specifying the `.gff` output format: 
+For annotating all fasta files in a directory ending with `.fa`, specifying the `.sff` output format:
 
 ```bash
-julia --project=. chloe.jl annotate -g testfa/*.fa
+julia --project=. chloe.jl annotate --sff testfa/*.fa
 ```
 
-This will create `.gff` files for each fasta file and write them back into the directory where the annotated fasta files are located.
+This will create `.chloe.sff` files for each fasta file and write them back into the directory where the annotated fasta files are located.
 
 To see what other commands are available:
 
 ```bash
 julia --project=. chloe.jl --help
 ```
 
-Annotate fasta files from command line specifying the location of your Chloë references
-```bash
-julia --project=. -e 'using Chloe; chloe_main()' -- annotate -r cp *.fa
-```
-
 ## Julia Projects
 You can install Chloë as a Julia package and environment from within the Julia REPL. To create a project in your directory `myproject` initiate a Julia project and add Chloë as a package:
 
@@ -94,27 +96,6 @@ outfile, uid = Chloe.annotate(references,  "NC_011032.1.fa") #run annotation on
 println(outfile) #print output in REPL
 ```
 
-Write to buffer instead of to a file.
-
-```julia
-import Chloe
-references = Chloe.ReferenceDb("cp")
-io, uid = Chloe.annotate(references, "NC_011032.1.fa", nothing, IOBuffer())
-# show .sff content
-println(String(take!(io)))
-```
-
-Read from an already open fasta file.
-
-
-```julia
-import Chloe
-references = Chloe.ReferenceDb("cp")
-outfile, uid = open("NC_011032.1.fa", "r") do io
-    Chloe.annotate(references, io)
-end
-```
-
 Or if you prefer you can use the commandline interface from the REPL to invoke Chloe:
 
 ```julia
@@ -128,7 +109,7 @@ For more recipes using Chloë see our [Recipes](https://github.com/ian-small/chl
 
 ## Output formats
 
-Internally, Chloë numbers each strand independently from its 5' end, and tracks features by (start, length) rather then by (start, stop). This avoids most of the issues with features crossing the arbitrary end of a circular genome. The default output of Chloë (`.sff` files) uses these conventions. For example, here's the start of a typical `.sff` output file:
+Internally, Chloë numbers each strand independently from its 5' end, and tracks features by (start, length) rather than by (start, stop). This avoids most of the issues with features crossing the arbitrary end of a circular genome. The `--sff` output of Chloë (`.sff` files) uses these conventions. For example, here's the start of a typical `.sff` output file:
 
 
 <img src="assets/sff.png" width="600">
@@ -141,14 +122,13 @@ gene name/gene copy (so if 2 or higher is a duplicate of another gene)/feature t
 Subsequent columns are: strand, start, length, phase;
 Then 5 columns of interest if you want to understand why Chloë has predicted this particular feature: length relative to feature template, proportion of references that match, mean coverage of aligned genomes (out of 100), feature probability (from XGBoost model), coding probability (from XGBoost model)
 
-Most users will probably want to use `chloe.jl annotate -g` to obtain the output in standard `.gff` format: 
-
+The default output is GFF:
 
 <img src="assets/gff.png" width="700">
 
 
 By default, Chloë filters out features which are detected to have one of a set of problematic issues, or which have a feature probability of < 0.5.
-You can retain these putative features by lowering the sensitivity threshold and asking for no filtering. For example, `chloe.jl annotate -s 0 --no-filter` will retain all the features that Chloë was able to detect, including those that fail the checks. Features with issues will be flagged as warnings during the annotation:
+You can retain these putative features by lowering the sensitivity threshold and asking for no filtering. For example, `chloe.jl annotate --sff --sensitivity 0 --no-filter` will retain all the features that Chloë was able to detect, including those that fail the checks. Features with issues will be flagged as warnings during the annotation:
 ```[ Warning: rps16/1 lacks a start codon
 [ Warning: rps16/1 has a premature stop codon
 [ Warning: rps16/1 CDS is not divisible by 3

diff --git a/RECIPES.md b/RECIPES.md
@@ -28,7 +28,7 @@ fastas = readdir(fasta_directory) |> filter(f -> endswith(f, r"\.fa")) |> pmap(f
 # outputs is the list of output files
 outputs = @distributed (vcat) for fasta = fastas
     # note that `references` is on the worker process
-    output, uid = annotate(references, fasta, nothing, fasta * ".sff")
+    output, uid = annotate(references, fasta, nothing, ".")
     [output]
 end
 ```

diff --git a/src/annotate_genomes.jl b/src/annotate_genomes.jl
@@ -521,17 +521,13 @@ end
 
 function write_result(result::ChloeAnnotation, asgff3::Bool, filestem::String)::Tuple{Union{String,IO},String}
     if !asgff3
-        writeSFF(
-            filestem * ".sff",
-            result.target_id,
-            result.target_length,
-            geomean(values(result.coverages)),
-            result.annotation
-        )
+        out = filestem * ".chloe.sff"
+        writeSFF(out, result.target_id, result.target_length, geomean(values(result.coverages)), result.annotation)
     else
-        writeGFF3(filestem * ".gff", result.target_id, result.target_length, result.annotation)
+        out = filestem * ".chloe.gff"
+        writeGFF3(out, result.target_id, result.target_length, result.annotation)
     end
-    return filestem, result.target_id
+    return out, result.target_id
 end
 
 function fasta_reader(infile::IO)::Tuple{String,FwdRev{CircularSequence}}
@@ -567,45 +563,62 @@ function annotate(
 )::Tuple{Union{String,IO},String}
     config = isnothing(config) ? ChloeConfig() : config
     result = annotate_one_worker(db, target_id, target, config)
-    filestem = joinpath(output, result.target_id * ".chloe")
     if ~config.no_transform
         target, result = transform!(target, result, db.templates)
-        FASTAWriter(open(filestem * ".fa", "w")) do outfile
+        FASTAWriter(open(output * ".chloe.fa", "w")) do outfile
             write(outfile, FASTARecord(result.target_id, target.forward[1:length(target.forward)]))
         end
     end
-    write_result(result, config.asgff3, filestem)
+    write_result(result, config.asgff3, output)
 end
 
 function annotate(
     db::AbstractReferenceDb,
     infile::String,
     config::Union{ChloeConfig,Nothing}=nothing,
-    output::MayBeString="."
+    output::MayBeString=".",
+    stem::MayBeString=nothing
 )
     if isnothing(output)
         output = dirname(infile)
     end
     maybe_gzread(infile) do io
-        annotate(db, io, config, output)
+        annotate(db, io, config, output, stem)
     end
 end
 
 function annotate(
     db::AbstractReferenceDb,
     infile::IO,
     config::Union{ChloeConfig,Nothing}=nothing,
-    output::MayBeString="."
+    output::MayBeString=".",
+    stem::MayBeString=nothing
 )
     target_id, seqs = fasta_reader(infile)
+    output = isnothing(stem) ? joinpath(output, target_id) : joinpath(output, stem)
     annotate(db, target_id, seqs, config, output)
 end
 
-function annotate_batch(db::AbstractReferenceDb, fa_files::Vector{String}, config::ChloeConfig, output::MayBeString=".")
+function filestem(fname)
+    fname = splitdir(fname)[2]
+    if endswith(fname, r"\.gz")
+        fname, _ = splitext(fname)
+    end
+    splitext(fname)[1]
+end
+
+function annotate_batch(
+    db::AbstractReferenceDb,
+    fa_files::Vector{String},
+    config::ChloeConfig,
+    output::MayBeString=".",
+    use_id::Bool=false
+)
     odir = isnothing(output) ? fname -> dirname(fname) : _ -> output
     for infile in fa_files
+        stem = use_id ? nothing : filestem(infile)
         maybe_gzread(infile) do io
-            annotate(db, io, config, odir(infile))
+            annotate(db, io, config, odir(infile), stem)
         end
     end
 end

diff --git a/src/chloe_cmd.jl b/src/chloe_cmd.jl
@@ -25,7 +25,8 @@ function chloe(;
     output::Union{String,Nothing}=nothing,
     no_transform::Bool=false,
     sff::Bool=false,
-    no_filter::Bool=false
+    no_filter::Bool=false,
+    use_id::Bool=false
 )
     if ~isnothing(output)
         if ~isdir(output)
@@ -41,7 +42,7 @@ function chloe(;
         no_filter=no_filter,
         reference=reference_dir
     )
-    Annotator.annotate_batch(db, fasta_files, config, output)
+    Annotator.annotate_batch(db, fasta_files, config, output, use_id)
 end
 
 function getargs(args::Vector{String}=ARGS)
@@ -118,6 +119,9 @@ function getargs(args::Vector{String}=ARGS)
         "--sff"
         action = :store_true
         help = "save output in sff format instead of gff3"
+        "--use-id"
+        action = :store_true
+        help = "Use the target_id found in the fasta file as the output filename"
     end
 
     parse_args(args, cmd_args; as_symbols=true)