From 528d8a444cb039ef7d1c5d0ec890b5829a29adcf Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 15 Oct 2021 22:04:35 +0200 Subject: [PATCH 1/2] Faster computation of quantiles in `describe` Computing all quantiles when we only need the median is significantly slower. Also avoid trying to compute quantiles for string columns since the failure only happens after sorting the vector, which is almost all of the work. --- src/abstractdataframe/abstractdataframe.jl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 41539befe7..7853b4e198 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -702,11 +702,19 @@ function get_stats(@nospecialize(col::Union{AbstractVector, Base.SkipMissing}), stats::AbstractVector{Symbol}) d = Dict{Symbol, Any}() - if :q25 in stats || :median in stats || :q75 in stats - q = try quantile(col, [.25, .5, .75]) catch; (nothing, nothing, nothing) end - d[:q25] = q[1] - d[:median] = q[2] - d[:q75] = q[3] + if eltype(col) <: Union{Missing, AbstractString} + d[:q25] = d[:median] = d[:q75] = nothing + elseif :q25 in stats || :median in stats || :q75 in stats + mcol = Base.copymutable(col) + if :q25 in stats + d[:q25] = try quantile!(mcol, 0.25) catch; nothing; end + end + if :median in stats + d[:median] = try quantile!(mcol, 0.50) catch; nothing; end + end + if :q75 in stats + d[:q75] = try quantile!(mcol, 0.75) catch; nothing; end + end end if :min in stats || :max in stats From c9c249bae51007b21353e1730a6163fdab25adc7 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Fri, 15 Oct 2021 22:45:56 +0200 Subject: [PATCH 2/2] More general approach --- src/abstractdataframe/abstractdataframe.jl | 29 +++++++++++++--------- test/dataframe.jl | 2 +- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl 
b/src/abstractdataframe/abstractdataframe.jl index 7853b4e198..6821cbd700 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -702,18 +702,23 @@ function get_stats(@nospecialize(col::Union{AbstractVector, Base.SkipMissing}), stats::AbstractVector{Symbol}) d = Dict{Symbol, Any}() - if eltype(col) <: Union{Missing, AbstractString} - d[:q25] = d[:median] = d[:q75] = nothing - elseif :q25 in stats || :median in stats || :q75 in stats - mcol = Base.copymutable(col) - if :q25 in stats - d[:q25] = try quantile!(mcol, 0.25) catch; nothing; end - end - if :median in stats - d[:median] = try quantile!(mcol, 0.50) catch; nothing; end - end - if :q75 in stats - d[:q75] = try quantile!(mcol, 0.75) catch; nothing; end + if :q25 in stats || :median in stats || :q75 in stats + # types that do not support basic arithmetic (like strings) will only fail + # after sorting the data, so check this beforehand to fail early + T = eltype(col) + if isconcretetype(T) && !hasmethod(-, Tuple{T, T}) + d[:q25] = d[:median] = d[:q75] = nothing + else + mcol = Base.copymutable(col) + if :q25 in stats + d[:q25] = try quantile!(mcol, 0.25) catch; nothing; end + end + if :median in stats + d[:median] = try quantile!(mcol, 0.50) catch; nothing; end + end + if :q75 in stats + d[:q75] = try quantile!(mcol, 0.75) catch; nothing; end + end end end diff --git a/test/dataframe.jl b/test/dataframe.jl index a00e432650..e69392710c 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -668,7 +668,7 @@ end nothing, nothing, nothing], min = [1.0, 1.0, "a", "a", Date(2000), 1], q25 = [1.75, 1.5, nothing, nothing, nothing, nothing], - median = [2.5, 2.0, nothing, nothing, nothing, nothing], + median = [2.5, 2.0, nothing, nothing, VERSION >= v"1.7.0-beta1.2" ? Date(2002) : nothing, nothing], q75 = [3.25, 2.5, nothing, nothing, nothing, nothing], max = [4.0, 3.0, "d", "c", Date(2004), 2], nunique = [nothing, nothing, 4, 3, 4, 2],