Skip to content

Commit

Permalink
--use-id output filename stems
Browse files Browse the repository at this point in the history
  • Loading branch information
arabidopsis committed Oct 17, 2024
1 parent 50362f2 commit 332c95b
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 53 deletions.
46 changes: 13 additions & 33 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,30 +47,32 @@ You can run Chloë from the terminal. To access the annotator help manual use:
julia --project=. chloe.jl annotate --help
```

Equivalently you can invoke Chloe with:

```bash
julia --project=. -e 'using Chloe; chloe_main()' annotate --help
```


For annotating single sequences (e.g. the test genome `NC_020019.1.fa` available in the folder `testfa`) with the default output in `.gff` format:
```bash
julia --project=. chloe.jl annotate testfa/NC_020019.1.fa
```

For annotating all fasta file in a directory ending with `.fa` specifying the `.gff` output format:
For annotating all fasta files in a directory ending with `.fa`, specifying the `.sff` output format:

```bash
julia --project=. chloe.jl annotate -g testfa/*.fa
julia --project=. chloe.jl annotate --sff testfa/*.fa
```

This will create `.gff` files for each fasta file and write them back into the directory where the annotated fasta files are located.
This will create `.chloe.sff` files for each fasta file and write them back into the directory where the annotated fasta files are located.

To see what other commands are available:

```bash
julia --project=. chloe.jl --help
```

Annotate fasta files from command line specifying the location of your Chloë references
```bash
julia --project=. -e 'using Chloe; chloe_main()' -- annotate -r cp *.fa
```

## Julia Projects
You can install Chloë as a Julia package and environment from within the Julia REPL. To create a project in your directory `myproject` initiate a Julia project and add Chloë as a package:

Expand All @@ -94,27 +96,6 @@ outfile, uid = Chloe.annotate(references, "NC_011032.1.fa") #run annotation on
println(outfile) #print output in REPL
```

Write to buffer instead of to a file.

```julia
import Chloe
references = Chloe.ReferenceDb("cp")
io, uid = Chloe.annotate(references, "NC_011032.1.fa", nothing, IOBuffer())
# show .sff content
println(String(take!(io)))
```

Read from an already open fasta file.


```julia
import Chloe
references = Chloe.ReferenceDb("cp")
outfile, uid = open("NC_011032.1.fa", "r") do io
Chloe.annotate(references, io)
end
```

Or if you prefer you can use the commandline interface from the REPL to invoke Chloe:

```julia
Expand All @@ -128,7 +109,7 @@ For more recipes using Chloë see our [Recipes](https://github.com/ian-small/chl

## Output formats

Internally, Chloë numbers each strand independently from its 5' end, and tracks features by (start, length) rather then by (start, stop). This avoids most of the issues with features crossing the arbitrary end of a circular genome. The default output of Chloë (`.sff` files) uses these conventions. For example, here's the start of a typical `.sff` output file:
Internally, Chloë numbers each strand independently from its 5' end, and tracks features by (start, length) rather than by (start, stop). This avoids most of the issues with features crossing the arbitrary end of a circular genome. The `--sff` output of Chloë (`.sff` files) uses these conventions. For example, here's the start of a typical `.sff` output file:


<img src="assets/sff.png" width="600">
Expand All @@ -141,14 +122,13 @@ gene name/gene copy (so if 2 or higher is a duplicate of another gene)/feature t
Subsequent columns are: strand, start, length, phase;
Then 5 columns of interest if you want to understand why Chloë has predicted this particular feature: length relative to feature template, proportion of references that match, mean coverage of aligned genomes (out of 100), feature probability (from XGBoost model), coding probability (from XGBoost model)

Most users will probably want to use `chloe.jl annotate -g` to obtain the output in standard `.gff` format:

The default output is GFF:

<img src="assets/gff.png" width="700">


By default, Chloë filters out features which are detected to have one of a set of problematic issues, or which have a feature probability of < 0.5.
You can retain these putative features by lowering the sensitivity threshold and asking for no filtering. For example, `chloe.jl annotate -s 0 --no-filter` will retain all the features that Chloë was able to detect, including those that fail the checks. Features with issues will be flagged as warnings during the annotation:
You can retain these putative features by lowering the sensitivity threshold and asking for no filtering. For example, `chloe.jl annotate --sff --sensitivity 0 --no-filter` will retain all the features that Chloë was able to detect, including those that fail the checks. Features with issues will be flagged as warnings during the annotation:
```
[ Warning: rps16/1 lacks a start codon
[ Warning: rps16/1 has a premature stop codon
[ Warning: rps16/1 CDS is not divisible by 3
Expand Down
2 changes: 1 addition & 1 deletion RECIPES.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ fastas = readdir(fasta_directory) |> filter(f -> endswith(f, r"\.fa")) |> pmap(f
# outputs is the list of output files
outputs = @distributed (vcat) for fasta = fastas
# note that `references` in on the worker process
output, uid = annotate(references, fasta, nothing, fasta * ".sff")
output, uid = annotate(references, fasta, nothing, ".")
[output]
end
```
Expand Down
47 changes: 30 additions & 17 deletions src/annotate_genomes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -521,17 +521,13 @@ end

# Write the annotation held in `result` to a file whose name is derived from
# `filestem`, and return a tuple of the output path and `result.target_id`.
# `asgff3` selects the writer: `writeGFF3` when true, `writeSFF` otherwise.
# NOTE(review): this span is a rendered diff hunk without +/- markers, so the
# replaced statements appear to sit directly before their replacements
# (presumably the multi-line `writeSFF(...)` call, the `filestem * ".gff"` call
# and `return filestem, ...` are the removed versions) — confirm against the
# checked-out file.
function write_result(result::ChloeAnnotation, asgff3::Bool, filestem::String)::Tuple{Union{String,IO},String}
if !asgff3
# writes to `filestem * ".sff"`; the SFF writer also receives the geometric
# mean of the coverage values
writeSFF(
filestem * ".sff",
result.target_id,
result.target_length,
geomean(values(result.coverages)),
result.annotation
)
# the full output path (with a ".chloe" infix) is kept in `out` so it can be returned
out = filestem * ".chloe.sff"
writeSFF(out, result.target_id, result.target_length, geomean(values(result.coverages)), result.annotation)
else
writeGFF3(filestem * ".gff", result.target_id, result.target_length, result.annotation)
out = filestem * ".chloe.gff"
writeGFF3(out, result.target_id, result.target_length, result.annotation)
end
# returns the bare stem rather than the path that was written
return filestem, result.target_id
# returns the path that was actually written
return out, result.target_id
end

function fasta_reader(infile::IO)::Tuple{String,FwdRev{CircularSequence}}
Expand Down Expand Up @@ -567,45 +563,62 @@ function annotate(
)::Tuple{Union{String,IO},String}
config = isnothing(config) ? ChloeConfig() : config
result = annotate_one_worker(db, target_id, target, config)
filestem = joinpath(output, result.target_id * ".chloe")
if ~config.no_transform
target, result = transform!(target, result, db.templates)
FASTAWriter(open(filestem * ".fa", "w")) do outfile
FASTAWriter(open(output * ".chloe.fa", "w")) do outfile
write(outfile, FASTARecord(result.target_id, target.forward[1:length(target.forward)]))
end
end
write_result(result, config.asgff3, filestem)
write_result(result, config.asgff3, output)
end

# Annotate the fasta file at path `infile`.
# When `output` is `nothing`, the directory containing `infile` is used instead.
# The file is opened via `maybe_gzread` and the resulting stream is passed on to
# another `annotate` method together with `config`, `output` and `stem`.
# NOTE(review): rendered diff hunk without +/- markers — the `output` parameter
# line without a trailing comma and the 4-argument `annotate(...)` call look
# like the replaced versions of the lines that follow them; confirm against the
# checked-out file.
function annotate(
db::AbstractReferenceDb,
infile::String,
config::Union{ChloeConfig,Nothing}=nothing,
output::MayBeString="."
output::MayBeString=".",
stem::MayBeString=nothing
)
if isnothing(output)
# no output location given: fall back to the input file's directory
output = dirname(infile)
end
maybe_gzread(infile) do io
annotate(db, io, config, output)
annotate(db, io, config, output, stem)
end
end

# Annotate a sequence read from the already-open stream `infile`.
# `fasta_reader` supplies the target id and its sequences. The output path
# prefix is `joinpath(output, stem)`, or `joinpath(output, target_id)` when
# `stem` is `nothing`; it is then passed to the `annotate` method that takes the
# id and sequences.
# NOTE(review): rendered diff hunk without +/- markers — the `output` parameter
# line without a trailing comma looks like the replaced version of the line
# after it; confirm against the checked-out file.
function annotate(
db::AbstractReferenceDb,
infile::IO,
config::Union{ChloeConfig,Nothing}=nothing,
output::MayBeString="."
output::MayBeString=".",
stem::MayBeString=nothing
)
target_id, seqs = fasta_reader(infile)
# an explicit `stem` takes precedence over the id found in the fasta file
output = isnothing(stem) ? joinpath(output, target_id) : joinpath(output, stem)
annotate(db, target_id, seqs, config, output)
end

function annotate_batch(db::AbstractReferenceDb, fa_files::Vector{String}, config::ChloeConfig, output::MayBeString=".")
"""
    filestem(fname)

Return the file name of `fname` with its directory part and its last extension
removed. A trailing `.gz` is dropped first, so `dir/genome.fa.gz` and
`dir/genome.fa` both give `genome`.
"""
function filestem(fname)
    base = basename(fname)
    # peel off the compression suffix before removing the real extension
    uncompressed = endswith(base, r"\.gz") ? first(splitext(base)) : base
    return first(splitext(uncompressed))
end

# Annotate every fasta file in `fa_files`, one after the other.
# `output` is the directory handed on for each file; when it is `nothing`, each
# input file's own directory (`dirname`) is used instead.
# With `use_id` false the output stem comes from `filestem(infile)`; with
# `use_id` true `nothing` is passed as the stem.
# NOTE(review): rendered diff hunk without +/- markers — the 4-argument
# `annotate(...)` call looks like the replaced version of the 5-argument call
# after it; confirm against the checked-out file.
function annotate_batch(
db::AbstractReferenceDb,
fa_files::Vector{String},
config::ChloeConfig,
output::MayBeString=".",
use_id::Bool=false
)
# odir maps an input path to the directory that is passed on as `output`
odir = isnothing(output) ? fname -> dirname(fname) : _ -> output
for infile in fa_files
stem = use_id ? nothing : filestem(infile)
maybe_gzread(infile) do io
annotate(db, io, config, odir(infile))
annotate(db, io, config, odir(infile), stem)
end
end
end
Expand Down
8 changes: 6 additions & 2 deletions src/chloe_cmd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ function chloe(;
output::Union{String,Nothing}=nothing,
no_transform::Bool=false,
sff::Bool=false,
no_filter::Bool=false
no_filter::Bool=false,
use_id::Bool=false
)
if ~isnothing(output)
if ~isdir(output)
Expand All @@ -41,7 +42,7 @@ function chloe(;
no_filter=no_filter,
reference=reference_dir
)
Annotator.annotate_batch(db, fasta_files, config, output)
Annotator.annotate_batch(db, fasta_files, config, output, use_id)
end

function getargs(args::Vector{String}=ARGS)
Expand Down Expand Up @@ -118,6 +119,9 @@ function getargs(args::Vector{String}=ARGS)
"--sff"
action = :store_true
help = "save output in sff format instead of gff3"
"--use-id"
action = :store_true
help = "Use the target_id found in the fasta file as the output filename"
end

parse_args(args, cmd_args; as_symbols=true)
Expand Down

0 comments on commit 332c95b

Please sign in to comment.