diff --git a/README.md b/README.md
index 1155be0..934c0ce 100644
--- a/README.md
+++ b/README.md
@@ -47,18 +47,25 @@ You can run Chloë from the terminal. To access the annotator help manual use:
julia --project=. chloe.jl annotate --help
```
+Equivalently, you can invoke Chloë with:
+
+```bash
+julia --project=. -e 'using Chloe; chloe_main()' annotate --help
+```
+
+
For annotating single sequences (e.g. the test genome `NC_020019.1.fa` available in the folder `testfa` with the default output in `.sff` format:
```bash
julia --project=. chloe.jl annotate testfa/NC_020019.1.fa
```
-For annotating all fasta file in a directory ending with `.fa` specifying the `.gff` output format:
+To annotate all fasta files ending with `.fa` in a directory, specifying the `.sff` output format:
```bash
-julia --project=. chloe.jl annotate -g testfa/*.fa
+julia --project=. chloe.jl annotate --sff testfa/*.fa
```
-This will create `.gff` files for each fasta file and write them back into the directory where the annotated fasta files are located.
+This will create a `.chloe.sff` file for each fasta file and write it back into the directory where that fasta file is located.
To see what other commands are available:
@@ -66,11 +73,6 @@ To see what other commands are available:
julia --project=. chloe.jl --help
```
-Annotate fasta files from command line specifying the location of your Chloë references
-```bash
-julia --project=. -e 'using Chloe; chloe_main()' -- annotate -r cp *.fa
-```
-
## Julia Projects
You can install Chloë as a Julia package and environment from within the Julia REPL. To create a project in your directory `myproject` initiate a Julia project and add Chloë as a package:
@@ -94,27 +96,6 @@ outfile, uid = Chloe.annotate(references, "NC_011032.1.fa") #run annotation on
println(outfile) #print output in REPL
```
-Write to buffer instead of to a file.
-
-```julia
-import Chloe
-references = Chloe.ReferenceDb("cp")
-io, uid = Chloe.annotate(references, "NC_011032.1.fa", nothing, IOBuffer())
-# show .sff content
-println(String(take!(io)))
-```
-
-Read from an already open fasta file.
-
-
-```julia
-import Chloe
-references = Chloe.ReferenceDb("cp")
-outfile, uid = open("NC_011032.1.fa", "r") do io
- Chloe.annotate(references, io)
-end
-```
-
Or if you prefer you can use the commandline interface from the REPL to invoke Chloe:
```julia
@@ -128,7 +109,7 @@ For more recipes using Chloë see our [Recipes](https://github.com/ian-small/chl
## Output formats
-Internally, Chloë numbers each strand independently from its 5' end, and tracks features by (start, length) rather then by (start, stop). This avoids most of the issues with features crossing the arbitrary end of a circular genome. The default output of Chloë (`.sff` files) uses these conventions. For example, here's the start of a typical `.sff` output file:
+Internally, Chloë numbers each strand independently from its 5' end, and tracks features by (start, length) rather than by (start, stop). This avoids most of the issues with features crossing the arbitrary end of a circular genome. The `--sff` output of Chloë (`.sff` files) uses these conventions. For example, here's the start of a typical `.sff` output file:
@@ -141,14 +122,13 @@ gene name/gene copy (so if 2 or higher is a duplicate of another gene)/feature t
Subsequent columns are: strand, start, length, phase;
Then 5 columns of interest if you want to understand why Chloë has predicted this particular feature: length relative to feature template, proportion of references that match, mean coverage of aligned genomes (out of 100), feature probability (from XGBoost model), coding probability (from XGBoost model)
-Most users will probably want to use `chloe.jl annotate -g` to obtain the output in standard `.gff` format:
-
+The default output is standard GFF3, written to `.chloe.gff` files.
By default, Chloë filters out features which are detected to have one of a set of problematic issues, or which have a feature probability of < 0.5.
-You can retain these putative features by lowering the sensitivity threshold and asking for no filtering. For example, `chloe.jl annotate -s 0 --no-filter` will retain all the features that Chloë was able to detect, including those that fail the checks. Features with issues will be flagged as warnings during the annotation:
+You can retain these putative features by lowering the sensitivity threshold and asking for no filtering. For example, `chloe.jl annotate --sff --sensitivity 0 --no-filter` will retain all the features that Chloë was able to detect, including those that fail the checks. Features with issues will be flagged as warnings during the annotation:
```[ Warning: rps16/1 lacks a start codon
[ Warning: rps16/1 has a premature stop codon
[ Warning: rps16/1 CDS is not divisible by 3
diff --git a/RECIPES.md b/RECIPES.md
index a5b0a13..7361dff 100644
--- a/RECIPES.md
+++ b/RECIPES.md
@@ -28,7 +28,7 @@ fastas = readdir(fasta_directory) |> filter(f -> endswith(f, r"\.fa")) |> pmap(f
# outputs is the list of output files
outputs = @distributed (vcat) for fasta = fastas
# note that `references` in on the worker process
- output, uid = annotate(references, fasta, nothing, fasta * ".sff")
+ output, uid = annotate(references, fasta, nothing, ".")
[output]
end
```
diff --git a/src/annotate_genomes.jl b/src/annotate_genomes.jl
index 7c9c8f7..1a959e1 100644
--- a/src/annotate_genomes.jl
+++ b/src/annotate_genomes.jl
@@ -521,17 +521,13 @@ end
function write_result(result::ChloeAnnotation, asgff3::Bool, filestem::String)::Tuple{Union{String,IO},String}
if !asgff3
- writeSFF(
- filestem * ".sff",
- result.target_id,
- result.target_length,
- geomean(values(result.coverages)),
- result.annotation
- )
+ out = filestem * ".chloe.sff"
+ writeSFF(out, result.target_id, result.target_length, geomean(values(result.coverages)), result.annotation)
else
- writeGFF3(filestem * ".gff", result.target_id, result.target_length, result.annotation)
+ out = filestem * ".chloe.gff"
+ writeGFF3(out, result.target_id, result.target_length, result.annotation)
end
- return filestem, result.target_id
+ return out, result.target_id
end
function fasta_reader(infile::IO)::Tuple{String,FwdRev{CircularSequence}}
@@ -567,27 +563,27 @@ function annotate(
)::Tuple{Union{String,IO},String}
config = isnothing(config) ? ChloeConfig() : config
result = annotate_one_worker(db, target_id, target, config)
- filestem = joinpath(output, result.target_id * ".chloe")
if ~config.no_transform
target, result = transform!(target, result, db.templates)
- FASTAWriter(open(filestem * ".fa", "w")) do outfile
+ FASTAWriter(open(output * ".chloe.fa", "w")) do outfile
write(outfile, FASTARecord(result.target_id, target.forward[1:length(target.forward)]))
end
end
- write_result(result, config.asgff3, filestem)
+ write_result(result, config.asgff3, output)
end
function annotate(
db::AbstractReferenceDb,
infile::String,
config::Union{ChloeConfig,Nothing}=nothing,
- output::MayBeString="."
+ output::MayBeString=".",
+ stem::MayBeString=nothing
)
if isnothing(output)
output = dirname(infile)
end
maybe_gzread(infile) do io
- annotate(db, io, config, output)
+ annotate(db, io, config, output, stem)
end
end
@@ -595,17 +591,34 @@ function annotate(
db::AbstractReferenceDb,
infile::IO,
config::Union{ChloeConfig,Nothing}=nothing,
- output::MayBeString="."
+ output::MayBeString=".",
+ stem::MayBeString=nothing
)
target_id, seqs = fasta_reader(infile)
+ output = isnothing(stem) ? joinpath(output, target_id) : joinpath(output, stem)
annotate(db, target_id, seqs, config, output)
end
-function annotate_batch(db::AbstractReferenceDb, fa_files::Vector{String}, config::ChloeConfig, output::MayBeString=".")
+function filestem(fname)
+ fname = splitdir(fname)[2]
+ if endswith(fname, r"\.gz")
+ fname, _ = splitext(fname)
+ end
+ splitext(fname)[1]
+end
+
+function annotate_batch(
+ db::AbstractReferenceDb,
+ fa_files::Vector{String},
+ config::ChloeConfig,
+ output::MayBeString=".",
+ use_id::Bool=false
+)
odir = isnothing(output) ? fname -> dirname(fname) : _ -> output
for infile in fa_files
+ stem = use_id ? nothing : filestem(infile)
maybe_gzread(infile) do io
- annotate(db, io, config, odir(infile))
+ annotate(db, io, config, odir(infile), stem)
end
end
end
diff --git a/src/chloe_cmd.jl b/src/chloe_cmd.jl
index 48bd942..b53374f 100644
--- a/src/chloe_cmd.jl
+++ b/src/chloe_cmd.jl
@@ -25,7 +25,8 @@ function chloe(;
output::Union{String,Nothing}=nothing,
no_transform::Bool=false,
sff::Bool=false,
- no_filter::Bool=false
+ no_filter::Bool=false,
+ use_id::Bool=false
)
if ~isnothing(output)
if ~isdir(output)
@@ -41,7 +42,7 @@ function chloe(;
no_filter=no_filter,
reference=reference_dir
)
- Annotator.annotate_batch(db, fasta_files, config, output)
+ Annotator.annotate_batch(db, fasta_files, config, output, use_id)
end
function getargs(args::Vector{String}=ARGS)
@@ -118,6 +119,9 @@ function getargs(args::Vector{String}=ARGS)
"--sff"
action = :store_true
help = "save output in sff format instead of gff3"
+ "--use-id"
+ action = :store_true
+ help = "Use the target_id found in the fasta file as the output filename"
end
parse_args(args, cmd_args; as_symbols=true)