Merge pull request #13 from piever/dev

Parallel docs
JuliaData · Mar 17, 2018 · a1bb240 · a1bb240
2 parents 7cbdda5 + 8e5900b
commit a1bb240
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 9 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,9 @@
+# JuliaDBMeta.jl NEWS
+
+## 0.2 current version
+
+### 0.2
+
+- add `cols` to select columns programmatically
+- add out-of-core support
+- **breaking** `@groupby` no longer flattens by default
diff --git a/docs/src/column_macros.md b/docs/src/column_macros.md
@@ -2,6 +2,15 @@
 
 Column-wise macros allow using symbols instead of columns. The order of the arguments is always the same: the first argument is the table and the last argument is the expression (can be a `begin ... end` block). If the table is omitted, the macro is automatically curried (useful for piping).
 
+Shared features across all column-wise macros:
+
+ - Symbols refer to columns of the table.
+ - `_` refers to the whole table.
+ - To use actual symbols, escape them with `^`, as in `^(:a)`.
+ - Use `cols(c)` to refer to column `c` where `c` is a variable that evaluates to a symbol. `c` must be available in the scope where the macro is called.
+ - An optional grouping argument is allowed: see [Column-wise macros with grouping argument](@ref)
+ - Out-of-core tables are not supported out of the box, except when grouping
+
 ## Replace symbols with columns
 
 ```@docs

diff --git a/docs/src/out_of_core.md b/docs/src/out_of_core.md
@@ -1,9 +1,58 @@
 # Out-of-core support
 
+JuliaDBMeta supports out-of-core operations in several different ways. The following examples assume that the REPL was started with `julia -p 4`.
+
+## Row-wise macros parallelize out of the box
+
 [Row-wise macros](@ref) can be trivially implemented in parallel and will work out of the box with out-of-core tables.
 
+```jldoctest distributed
+julia> iris = loadtable(Pkg.dir("JuliaDBMeta", "test", "tables", "iris.csv"));
+
+julia> iris5 = table(iris, chunks = 5);
+
+julia> @where iris5 :SepalLength == 4.9 && :Species == "setosa"
+Distributed Table with 4 rows in 2 chunks:
+SepalLength  SepalWidth  PetalLength  PetalWidth  Species
+──────────────────────────────────────────────────────────
+4.9          3.0         1.4          0.2         "setosa"
+4.9          3.1         1.5          0.1         "setosa"
+4.9          3.1         1.5          0.2         "setosa"
+4.9          3.6         1.4          0.1         "setosa"
+```
+
+## Grouping operations parallelize with some data shuffling
+
 [Grouping operations](@ref) will work on out-of-core data tables, but may involve some data shuffling, as they require data belonging to the same group to be on the same processor.
 
+```jldoctest distributed
+julia> @groupby iris5 :Species {mean(:SepalLength)}
+Distributed Table with 3 rows in 3 chunks:
+Species       mean(SepalLength)
+───────────────────────────────
+"setosa"      5.006
+"versicolor"  5.936
+"virginica"   6.588
+```
+
+## Apply a pipeline to your data in chunks
+
 [`@applychunked`](@ref) will apply the analysis pipeline separately to each chunk of data in parallel and collect the result as a distributed table.
 
-[Column-wise macros](@ref) do not have a parallel implementation yet (they require working on the whole column at the same time which makes it difficult to parallelize them).
+```jldoctest distributed
+julia> @applychunked iris5 begin
+           @where :Species == "setosa" && :SepalLength == 4.9
+           @transform {Ratio = :SepalLength / :SepalWidth}
+       end
+Distributed Table with 4 rows in 2 chunks:
+SepalLength  SepalWidth  PetalLength  PetalWidth  Species   Ratio
+───────────────────────────────────────────────────────────────────
+4.9          3.0         1.4          0.2         "setosa"  1.63333
+4.9          3.1         1.5          0.1         "setosa"  1.58065
+4.9          3.1         1.5          0.2         "setosa"  1.58065
+4.9          3.6         1.4          0.1         "setosa"  1.36111
+```
+
+## Column-wise macros do not parallelize yet
+
+[Column-wise macros](@ref) do not have a parallel implementation yet, except when grouping: they require working on the whole column at the same time, which makes it difficult to parallelize them.
diff --git a/docs/src/row_macros.md b/docs/src/row_macros.md
@@ -2,6 +2,14 @@
 
 Row-wise macros allow using symbols to refer to fields of a row. The order of the arguments is always the same: the first argument is the table and the last argument is the expression (can be a `begin ... end` block). If the table is omitted, the macro is automatically curried (useful for piping).
 
+Shared features across all row-wise macros:
+
+ - Symbols refer to fields of the row.
+ - `_` refers to the whole row.
+ - To use actual symbols, escape them with `^`, as in `^(:a)`.
+ - Use `cols(c)` to refer to field `c` where `c` is a variable that evaluates to a symbol. `c` must be available in the scope where the macro is called.
+ - Out-of-core tables are supported out of the box
+
 ## Modify data in place
 
 ```@docs

diff --git a/src/groupby.jl b/src/groupby.jl
@@ -1,15 +1,15 @@
-_groupby(f, d::AbstractDataset, args...; kwargs...) = 
-    IndexedTables.groupby(f, d, args...; flatten = true, usekey = true, kwargs...)
+_groupby(f, d::AbstractDataset, args...; kwargs...) =
+    IndexedTables.groupby(f, d, args...; usekey = true, kwargs...)
 
 _groupby(f, args...; kwargs...) = d::AbstractDataset -> _groupby(f, d, args...; kwargs...)
 
 function groupby_helper(args...)
     anon_func, syms = extract_anonymous_function(last(args), replace_column, usekey = true)
     if !isempty(syms) && !(:(_) in syms)
         fields = Expr(:call, :(JuliaDBMeta.All), syms...)
-        Expr(:call, :(JuliaDBMeta._groupby), anon_func, args[1:end-1]..., Expr(:kw, :select, fields))
+        Expr(:call, :(JuliaDBMeta._groupby), anon_func, Expr(:kw, :select, fields), replace_keywords(args[1:end-1])...)
     else
-        Expr(:call, :(JuliaDBMeta._groupby), anon_func, args[1:end-1]...)
+        Expr(:call, :(JuliaDBMeta._groupby), anon_func, replace_keywords(args[1:end-1])...)
     end
 end
 
@@ -45,6 +45,19 @@ x  m
 1  5.7
 2  3.3
 ```
+
+When the summary function returns an iterable, use `flatten=true` to flatten the result:
+
+```jldoctest groupby
+julia> @groupby(t, :x, flatten = true, select = {:y+1})
+Table with 4 rows, 2 columns:
+x  y + 1
+────────
+1  5
+1  7
+2  6
+2  8
+```
 """
 macro groupby(args...)
     esc(groupby_helper(args...))

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,7 +1,10 @@
-using JuliaDBMeta, Compat, NamedTuples
-using Compat.Test
+addprocs(4)
 
-iris1 = loadtable(joinpath(@__DIR__, "tables", "iris.csv"))
+@everywhere using JuliaDBMeta, Compat, NamedTuples
+@everywhere using JuliaDB, Dagger
+@everywhere using Compat.Test
+
+iris1 = collect(loadtable(joinpath(@__DIR__, "tables", "iris.csv")))
 iris2 = table(iris1, chunks = 5)
 
 @testset "utils" begin
@@ -76,14 +79,15 @@ end
     @test (@where_vec t (:x .< 3) .& (:z .== 0.2)) == view(t, [2])
     @test @where_vec(t, 1:2) == view(t, 1:2)
     @test @where_vec(rows(t), 1:2) == view(t, 1:2)
+    @test JuliaDBMeta._view(rows(t), 1:2) == view(rows(t), 1:2)
     @test @where_vec((:x .< 3) .& (:z .== 0.2))(t) == view(t, [2])
     @test (@where t (:x < 3) .& (:z == 0.2)) == view(t, [2])
     @test @where((:x < 3) .& (:z == 0.2))(t) == view(t, [2])
 
     t = table([1,1,3], [4,5,6], [0.1, 0.2, 0.3], names = [:x, :y, :z])
     grp = groupby(@map(@NT(z = :z))∘@where(:y != 5), t, :x, flatten = true)
     @test grp == table([1, 3], [0.1, 0.3], names = [:x, :z], pkey = :x)
-    collect(@where iris2 :SepalLength > 4) == @where iris1 :SepalLength > 4
+    @test collect(@where iris2 :SepalLength > 4) == @where iris1 :SepalLength > 4
 end
 
 @testset "apply" begin
@@ -145,4 +149,6 @@ end
     @test @groupby({m = maximum(:y - :z) / _.key.x})(reindex(t, :x)) == outcome
     @test @groupby(t, :x, {l = length(_)}) == table([1,2], [2,2], names = [:x, :l], pkey = :l)
     @test @groupby(t, :x, {l = length(_)}) == t |> @groupby(:x, {l = length(_)})
+    @test @groupby(t, :x, flatten = true, _) == reindex(t, :x)
+    @test @groupby(t, :x, {identity = _}) == groupby(identity, t, :x)
 end