diff --git a/bin/regression.sh b/bin/regression.sh index 740926d..008e0f3 100755 --- a/bin/regression.sh +++ b/bin/regression.sh @@ -10,7 +10,7 @@ G='\e[1;32m' R='\e[1;31m' echo "start annotations..." -time -p julia --project=. --threads=8 "$@" chloe.jl -l info annotate -o testo --numgsrefs 16 --numchloerefs 0 testfa/*.fa +time -p julia --project=. --threads=8 "$@" chloe.jl -l info annotate -o testo --numgsrefs 16 testfa/*.fa for f in $(ls testo) do echo "diffing $f" diff --git a/bin/rregression.sh b/bin/rregression.sh index bf28492..ff918ab 100755 --- a/bin/rregression.sh +++ b/bin/rregression.sh @@ -49,7 +49,7 @@ done echo -e "index: ${index[@]}: $TOTAL/${C}$n${O}" echo "start annotations with: ${todo[@]}" -time -p julia --threads=8 --project=. "$@" chloe.jl -l warn annotate -o testo --numgsrefs 16 --numchloerefs 0 "${todo[@]}" +time -p julia --threads=8 --project=. "$@" chloe.jl -l warn annotate -o testo --numgsrefs 16 "${todo[@]}" for idx in ${index[@]} do diff --git a/bin/tregression.sh b/bin/tregression.sh index 8d1c38f..42a4152 100755 --- a/bin/tregression.sh +++ b/bin/tregression.sh @@ -11,7 +11,7 @@ C='\e[1;36m' # bold cyan A='\e[1;30m' # grey echo -e "testing ${C}${#}${O} files" -time -p julia --threads=8 --project=. chloe.jl -l warn annotate -o testo --numgsrefs 16 --numchloerefs 0 "$@" +time -p julia --threads=8 --project=. chloe.jl -l warn annotate -o testo --numgsrefs 16 "$@" for f in "$@" do diff --git a/src/annotate_genomes.jl b/src/annotate_genomes.jl index da1c795..8b3d30b 100644 --- a/src/annotate_genomes.jl +++ b/src/annotate_genomes.jl @@ -346,12 +346,7 @@ function annotate_one_worker(db::AbstractReferenceDb, append!(refpicks, searchhashes(hash, refhashes)[1:numrefs]) end - # find closest chloe references - refhashes = get_chloeminhashes(db, config) - if !isnothing(refhashes) - numrefs = min(config.numchloerefs, length(refhashes)) - append!(refpicks, searchhashes(hash, refhashes)[1:numrefs]) - end + numrefs = length(refpicks) t2 = time_ns() diff --git a/src/dist/chloe_cmd.jl b/src/dist/chloe_cmd.jl index d06a9e7..e060a27 100644 --- a/src/dist/chloe_cmd.jl +++ b/src/dist/chloe_cmd.jl @@ -16,11 +16,11 @@ function quiet_metafmt(level, _module, group, id, file, line) return color, prefix, "" end -function chloe(; gsrefsdir="default", chloerefsdir="default", numgsrefs=DEFAULT_NUMGSREFS, numchloerefs=DEFAULT_NUMGSREFS, fasta_files=String[], +function chloe(; gsrefsdir="default", numgsrefs=DEFAULT_NUMGSREFS, fasta_files=String[], template="default", sensitivity=DEFAULT_SENSITIVITY, output::Union{Nothing,String}=nothing, gff::Bool=false, nofilter::Bool=false) - db = Annotator.ReferenceDb(; gsrefsdir=gsrefsdir, chloerefsdir=chloerefsdir, template=template) - config = Annotator.ChloeConfig(; numgsrefs=numgsrefs, numchloerefs=numchloerefs, sensitivity=sensitivity, to_gff3=gff, nofilter=nofilter) + db = Annotator.ReferenceDb(; gsrefsdir=gsrefsdir, template=template) + config = Annotator.ChloeConfig(; numgsrefs=numgsrefs, sensitivity=sensitivity, to_gff3=gff, nofilter=nofilter) Annotator.annotate(db, fasta_files, config, output) end @@ -86,19 +86,14 @@ function getargs() "--reference", "-r" arg_type = String default = "default" - dest_name = "chloerefsdir" + dest_name = "gsrefsdir" metavar = "DIRECTORY" - help = "reference directory [default: $(DEFAULT_CHLOEREFS)]" + help = "reference directory [default: $(DEFAULT_GSREFS)]" "--numgsrefs" arg_type = Int default = DEFAULT_NUMGSREFS dest_name = "numgsrefs" help = "number of references to compare to [default: $(DEFAULT_NUMGSREFS)]" - "--numchloerefs" - arg_type = Int - default = DEFAULT_NUMCHLOEREFS - dest_name = "numchloerefs" - help = "number of references to compare to [default: $(DEFAULT_NUMCHLOEREFS)]" "--template", "-t" arg_type = String default = "default" diff --git a/src/dist/chloe_distributed.jl b/src/dist/chloe_distributed.jl index 5dd3852..d76d421 100644 --- a/src/dist/chloe_distributed.jl +++ b/src/dist/chloe_distributed.jl @@ -75,7 +75,7 @@ function create_responder(apispecs::Vector{Function}, addr::String, ctx::ZMQ.Con end function arm_procs_full(procs, backend::MayBeString=nothing, level::String="info"; - gsrefsdir="default", chloerefsdir="default", template="default") + gsrefsdir="default", template="default") @everywhere procs begin include(joinpath($REPO_DIR, "annotate_genomes.jl")) @@ -83,7 +83,7 @@ function arm_procs_full(procs, backend::MayBeString=nothing, level::String="info include(joinpath($REPO_DIR, "dist/tasks.jl")) set_global_logger($level, $backend; topic="annotator") - global REFERENCE = Annotator.ReferenceDb(; gsrefsdir=$gsrefsdir, chloerefsdir=$chloerefsdir, template=$template) + global REFERENCE = Annotator.ReferenceDb(; gsrefsdir=$gsrefsdir, template=$template) end # [ @spawnat p begin # include(joinpath(REPO_DIR, "annotate_genomes.jl")) @@ -96,13 +96,13 @@ function arm_procs_full(procs, backend::MayBeString=nothing, level::String="info end function arm_procs(procs, backend::MayBeString=nothing, level::String="info"; - gsrefsdir="default", chloerefsdir="default", template="default") + gsrefsdir="default", template="default") # use when toplevel has already done # @everywhere using Chloe @everywhere procs begin set_global_logger($level, $backend; topic="annotator") - global REFERENCE = Annotator.ReferenceDb(; gsrefsdir=$gsrefsdir, chloerefsdir=$chloerefsdir, template=$template) + global REFERENCE = Annotator.ReferenceDb(; gsrefsdir=$gsrefsdir, template=$template) end # [ @spawnat p begin # set_global_logger(level, backend; topic="annotator") @@ -111,7 +111,7 @@ function arm_procs(procs, backend::MayBeString=nothing, level::String="info"; end function chloe_distributed(full::Bool=true; gsrefsdir="default", address=ZMQ_WORKER, - chloerefsdir="default", template="default", level="warn", workers=3, + template="default", level="warn", workers=3, backend::MayBeString=nothing, broker::MayBeString=nothing) if !isnothing(backend) @@ -124,14 +124,11 @@ function chloe_distributed(full::Bool=true; gsrefsdir="default", address=ZMQ_WOR if gsrefsdir == "default" gsrefsdir = normpath(joinpath(REPO_DIR, "..", "..", DEFAULT_GSREFS)) end - if chloerefsdir == "default" - chloerefsdir = normpath(joinpath(REPO_DIR, "..", "..", DEFAULT_CHLOEREFS)) - end if template == "default" template = normpath(joinpath(REPO_DIR, "..", "..", DEFAULT_TEMPLATE)) end # don't wait for workers to find the wrong directory - verify_refs(gsrefsdir, chloerefsdir, template) + verify_refs(gsrefsdir, template) # user may have added run with # julia command -p2 etc. @@ -148,13 +145,13 @@ function chloe_distributed(full::Bool=true; gsrefsdir="default", address=ZMQ_WOR # arm_procs(procs, reference, backend, level) if full - arm_procs_full(procs, backend, level; gsrefsdir=gsrefsdir, chloerefsdir=chloerefsdir, template=template) + arm_procs_full(procs, backend, level; gsrefsdir=gsrefsdir, template=template) else - arm_procs(procs, backend, level; gsrefsdir=gsrefsdir, chloerefsdir=chloerefsdir, template=template) + arm_procs(procs, backend, level; gsrefsdir=gsrefsdir, template=template) end function arm(new_procs) - arm_procs_full(new_procs, backend, level; gsrefsdir=gsrefsdir, chloerefsdir=chloerefsdir, template=template) + arm_procs_full(new_procs, backend, level; gsrefsdir=gsrefsdir, template=template) end # it seems impossible to add new workers after the fact @@ -402,11 +399,6 @@ function get_distributed_args() dest_name = "gsrefsdir" metavar = "DIRECTORY" help = "reference directory [default: $(DEFAULT_GSREFS)]" - "--chloerefsdir", "-c" - arg_type = String - default = "default" - dest_name = "chloerefsdir" - help = "reference minhashes [default: $(DEFAULT_CHLOEREFS)]" "--template", "-t" arg_type = String default = "default" diff --git a/src/globals.jl b/src/globals.jl index 5dd961a..67b1c0d 100644 --- a/src/globals.jl +++ b/src/globals.jl @@ -1,8 +1,6 @@ const REPO_DIR = dirname(@__FILE__) const DEFAULT_GSREFS = "chloe_references/gsrefs" const DEFAULT_NUMGSREFS = 16 -const DEFAULT_CHLOEREFS = "chloe_references/chloerefs" -const DEFAULT_NUMCHLOEREFS = 0 const DEFAULT_TEMPLATE = "chloe_references/templates.tsv" const DEFAULT_SENSITIVITY = 0.5 diff --git a/src/reference.jl b/src/reference.jl index c5a1b3b..0bc31a7 100644 --- a/src/reference.jl +++ b/src/reference.jl @@ -4,32 +4,26 @@ abstract type AbstractReferenceDb end mutable struct ReferenceDb <: AbstractReferenceDb lock::ReentrantLock gsrefsdir::String - chloerefsdir::String template_file::String templates::Union{Nothing,Dict{String,FeatureTemplate}} gsrefhashes::Union{Nothing,Dict{String,Vector{Int64}}} - chloerefhashes::Union{Nothing,Dict{String,Vector{Int64}}} end struct ChloeConfig numgsrefs::Int - numchloerefs::Int sensitivity::Real to_gff3::Bool nofilter::Bool end -function ReferenceDb(; gsrefsdir="default", chloerefsdir="default", template="default")::ReferenceDb +function ReferenceDb(; gsrefsdir="default", template="default")::ReferenceDb if gsrefsdir == "default" gsrefsdir = normpath(joinpath(REPO_DIR, "..", "..", DEFAULT_GSREFS)) end - if chloerefsdir == "default" - chloerefsdir = normpath(joinpath(REPO_DIR, "..", "..", DEFAULT_CHLOEREFS)) - end if template == "default" template = normpath(joinpath(REPO_DIR, "..", "..", DEFAULT_TEMPLATE)) end - return ReferenceDb(ReentrantLock(), gsrefsdir, chloerefsdir, template, nothing, nothing, nothing) + return ReferenceDb(ReentrantLock(), gsrefsdir, template, nothing, nothing) end function get_templates(db::ReferenceDb) @@ -51,23 +45,11 @@ function get_gsminhashes(db::ReferenceDb, config::ChloeConfig) end end -function get_chloeminhashes(db::ReferenceDb, config::ChloeConfig) - config.numchloerefs < 1 && return nothing - lock(db.lock) do - if isnothing(db.chloerefhashes) - db.chloerefhashes = readminhashes(normpath(joinpath(db.chloerefsdir, "reference_minhashes.hash"))) - end - return db.chloerefhashes - end -end function get_single_reference!(db::ReferenceDb, refID::AbstractString, reference_feature_counts::Dict{String,Int})::SingleReference path = findfastafile(db.gsrefsdir, refID) - if isnothing(path) - path = findfastafile(db.chloerefsdir, refID) - end if isnothing(path) || !isfile(path) - msg = "unable to find $(refID) fasta file in $(db.gsrefsdir) or in $(db.chloerefsdir)!" + msg = "unable to find $(refID) fasta file in $(db.gsrefsdir)!" @error msg throw(ArgumentError(msg)) end @@ -77,7 +59,7 @@ function get_single_reference!(db::ReferenceDb, refID::AbstractString, reference read!(reader, ref) sffpath = path[1:findlast('.', path)] * "sff" #assumes fasta files and sff files differ only by the file name extension if !isfile(sffpath) - msg = "unable to find $(refID) sff file in $(db.gsrefsdir) or in $(db.chloerefsdir)!" + msg = "unable to find $(refID) sff file in $(db.gsrefsdir)!" @error msg throw(ArgumentError(msg)) end @@ -87,11 +69,11 @@ function get_single_reference!(db::ReferenceDb, refID::AbstractString, reference end end -const KWARGS = ["numgsrefs", "numchloerefs", "sensitivity", "to_gff3", "nofilter"] +const KWARGS = ["numgsrefs", "sensitivity", "to_gff3", "nofilter"] -function ChloeConfig(; numgsrefs=DEFAULT_NUMGSREFS, numchloerefs=DEFAULT_NUMCHLOEREFS, sensitivity=DEFAULT_SENSITIVITY, +function ChloeConfig(; numgsrefs=DEFAULT_NUMGSREFS, sensitivity=DEFAULT_SENSITIVITY, to_gff3::Bool=false, nofilter::Bool=false) - return ChloeConfig(numgsrefs, numchloerefs, sensitivity, to_gff3, nofilter) + return ChloeConfig(numgsrefs, sensitivity, to_gff3, nofilter) end # needs to be V <: Any since this is comming from a JSON blob @@ -108,10 +90,10 @@ function ChloeConfig(dict::Dict{String,V} where {V<:Any}) return ChloeConfig(; Dict(Symbol(k) => cvt(k, v) for (k, v) in dict if k in KWARGS)...) end function Base.show(io::IO, c::ChloeConfig) - print(io, "ChloeConfig[numgsrefs=$(c.numgsrefs), numchloerefs=$(c.numchloerefs) sensitivity=$(c.sensitivity), nofilter=$(c.nofilter)]") + print(io, "ChloeConfig[numgsrefs=$(c.numgsrefs), sensitivity=$(c.sensitivity), nofilter=$(c.nofilter)]") end -function verify_refs(gsrefsdir, chloerefsdir, template) +function verify_refs(gsrefsdir, template) # used by master process to check reference directory # *before* starting worker processes... if !isdir(gsrefsdir) @@ -119,11 +101,6 @@ function verify_refs(gsrefsdir, chloerefsdir, template) @error msg throw(ArgumentError(msg)) end - # if !isdir(chloerefsdir) - # msg = "Reference directory $(chloerefsdir) is not a directory!" - # @error msg - # throw(ArgumentError(msg)) - # end end function read_single_reference!(refdir::String, refID::AbstractString, reference_feature_counts::Dict{String,Int})::SingleReference