From fd541e542c608d266c1ed8cbed3470c6b342dec3 Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Sun, 21 Nov 2021 15:26:30 +0800 Subject: [PATCH 01/33] change the location of log file --- src/main/clojure/aggregate/aggre_onyx_comps.clj | 6 ++++-- src/main/clojure/clojask/onyx_comps.clj | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/clojure/aggregate/aggre_onyx_comps.clj b/src/main/clojure/aggregate/aggre_onyx_comps.clj index 5179f06..4cbfb48 100644 --- a/src/main/clojure/aggregate/aggre_onyx_comps.clj +++ b/src/main/clojure/aggregate/aggre_onyx_comps.clj @@ -223,7 +223,8 @@ {:zookeeper/address "127.0.0.1:2188" :zookeeper/server? true :zookeeper.server/port 2188 - :onyx/tenancy-id id}) + :onyx/tenancy-id id + :onyx.log/file "_clojask/clojask.log"}) (def peer-config {:zookeeper/address "127.0.0.1:2188" @@ -231,7 +232,8 @@ :onyx.peer/job-scheduler :onyx.job-scheduler/balanced :onyx.messaging/impl :aeron :onyx.messaging/peer-port 40200 - :onyx.messaging/bind-addr "localhost"}) + :onyx.messaging/bind-addr "localhost" + :onyx.log/file "_clojask/clojask.log"}) (def env (onyx.api/start-env env-config)) diff --git a/src/main/clojure/clojask/onyx_comps.clj b/src/main/clojure/clojask/onyx_comps.clj index 47898b4..964ae2a 100644 --- a/src/main/clojure/clojask/onyx_comps.clj +++ b/src/main/clojure/clojask/onyx_comps.clj @@ -446,7 +446,8 @@ {:zookeeper/address "127.0.0.1:2188" :zookeeper/server? true :zookeeper.server/port 2188 - :onyx/tenancy-id id}) + :onyx/tenancy-id id + :onyx.log/file "_clojask/clojask.log"}) (def peer-config {:zookeeper/address "127.0.0.1:2188" @@ -454,7 +455,8 @@ :onyx.peer/job-scheduler :onyx.job-scheduler/balanced :onyx.messaging/impl :aeron :onyx.messaging/peer-port 40200 - :onyx.messaging/bind-addr "localhost"}) + :onyx.messaging/bind-addr "localhost" + :onyx.log/file "_clojask/clojask.log"}) (def env (onyx.api/start-env env-config)) From a767454b6b8da6aa45767e764739dfda69d8d735 Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Sun, 21 Nov 2021 22:00:46 +0800 Subject: [PATCH 02/33] Update README.md --- README.md | 55 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 0de624c..297c3d7 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,56 @@ -# clojask +# Clojask Clojure data frame with parallel computing on larger-than-memory datasets -#### Run the main function in `core`: +### Features -``` -lein run -``` +- **Unlimited size** + + Theoretically speaking, it supports dataset larger than memory, even to infinity! + +- **Fast** + + Faster than Dask in most operations, and the larger the dataframe is, the bigger the advantage + +- **All native types** + + All the datatypes used to store data is native Clojure (or Java) types! + +- **From file to file** + + Integrate IO inside the dataframe. No need to write your own read-in and output functions! + +- **Parallel** -#### Run the tests in `test`: + Most operations could be executed into multiple threads or even machines. See the principle in [Onyx](http://www.onyxplatform.org/). + +- **Lazy operations** + + Most operations will not be executed immediately. Dataframe will intelligently pipeline the operations altogether in computation. + +### Installation + +Available on [Clojars](https://clojars.org/com.github.clojure-finance/clojask). + +Insert this line into your `project.clj` if using Leiningen. ``` -lein test +[com.github.clojure-finance/clojask "1.0.0"] ``` +Insert this line into your `deps.edn` if using CLI. -To run a particular test defined in the namespace: ``` -lein test :only core-test/df-api-test +com.github.clojure-finance/clojask {:mvn/version "1.0.0"} ``` -#### Requirements for the input file: -- the first row should contain the column names +### Documentation + +The detailed doc for every API can be found [here](https://clojure-finance.github.io/clojask-website/posts-output/API/). + +### Examples + +A separate repository for some typical usage of Clojask can be found [here](https://github.com/clojure-finance/clojask-examples). + +### Problem Feedback + +If your question is not answered in existing [issues](https://github.com/clojure-finance/clojask/issues). Feel free to create a new one. From e4b98fccbe90c3a021b9650ae23e51b8780f4d5f Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Mon, 22 Nov 2021 21:50:30 +0800 Subject: [PATCH 03/33] Remove old example file --- examples/multi-threading.clj | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 examples/multi-threading.clj diff --git a/examples/multi-threading.clj b/examples/multi-threading.clj deleted file mode 100644 index baff528..0000000 --- a/examples/multi-threading.clj +++ /dev/null @@ -1,15 +0,0 @@ -(ns examples.timezone - (:require [clojask.dataframe :refer :all] - [clojure.core.async :as async])) - - (defn main - [] - (def x (dataframe "resources/Employees-large.csv")) - (def y (dataframe "resources/Employees.csv")) - - ;; create a thread for each operation - (async/thread (set-type x "double" "Department")) - (async/thread (set-type y "double" "Department")) - - (time (left-join x y ["Employee"] ["Employee"] 4 "resources/test.csv" :exception false)) - ) \ No newline at end of file From e887f381fdf4ebbc8c987ab396e2abc3c15b9f78 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Fri, 10 Dec 2021 18:14:41 +0800 Subject: [PATCH 04/33] Amend printJoinCol to allow input for col-prefix as optional arg --- src/main/clojure/clojask/dataframe.clj | 34 ++++++++++++++------------ 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 8f93903..4d91891 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -31,7 +31,7 @@ (getColNames []) (printCol [output-path] "print column names to output file") (printAggreCol [output-path] "print column names to output file for aggregate") - (printJoinCol [b-df a-keys b-keys output-path] "print column names to output file for join") + (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join") (delCol [col-to-del] "delete column(s) in the dataframe") (reorderCol [new-col-order] "reorder columns in the dataframe") (renameCol [new-col-names] "reorder columns in the dataframe") @@ -156,13 +156,15 @@ (printJoinCol ;; print column names, called by join APIs - [this b-df this-keys b-keys output-path] + [this b-df this-keys b-keys output-path col-prefix] (cond (not (= java.lang.String (type output-path))) (throw (Clojask_TypeException. "Output path should be a string."))) - (let [a-col-set (.getColNames this) + (let [a-col-prefix (first col-prefix) + b-col-prefix (last col-prefix) + a-col-set (.getColNames this) b-col-set (.getColNames b-df) - a-col-header (map #(str "1_" %) a-col-set) - b-col-header (map #(str "2_" %) b-col-set)] + a-col-header (map #(str a-col-prefix "_" %) a-col-set) + b-col-header (map #(str b-col-prefix "_" %) b-col-set)] (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," (concat a-col-header b-col-header)) "\n"))))) @@ -467,7 +469,7 @@ result)) (defn inner-join - [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}] + [a b a-keys b-keys num-worker dist & {:keys [col-prefix exception] :or {col-prefix ["1" "2"] exception false}}] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -480,13 +482,13 @@ (throw (Clojask_TypeException. "Input includes non-existent column name(s)."))) (u/init-file dist) ;; print column names - (.printJoinCol a b a-keys b-keys dist) + (.printJoinCol a b a-keys b-keys dist col-prefix) ;; first group b by keys (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) (start-onyx-join num-worker 10 a b dist exception a-keys b-keys nil nil 1))) (defn left-join - [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}] + [a b a-keys b-keys num-worker dist & {:keys [col-prefix exception] :or {col-prefix ["1" "2"] exception false}}] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -495,17 +497,19 @@ (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes."))) (cond (not (= (count a-keys) (count b-keys))) (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) + (cond (not (= (count col-prefix) 2)) + (throw (Clojask_TypeException. "The length of col-prefix should be equal to 2."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) (throw (Clojask_TypeException. "Input includes non-existent column name(s)."))) (u/init-file dist) ;; print column names - (.printJoinCol a b a-keys b-keys dist) + (.printJoinCol a b a-keys b-keys dist col-prefix) ;; first group b by keys (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) (start-onyx-join num-worker 10 a b dist exception a-keys b-keys nil nil 2))) (defn right-join - [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}] + [a b a-keys b-keys num-worker dist & {:keys [col-prefix exception] :or {col-prefix ["1" "2"] exception false}}] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -518,13 +522,13 @@ (throw (Clojask_TypeException. "Input includes non-existent column name(s)."))) (u/init-file dist) ;; print column names - (.printJoinCol b a a-keys b-keys dist) + (.printJoinCol b a a-keys b-keys dist col-prefix) ;; first group b by keys (start-onyx-groupby num-worker 10 a "./_clojask/join/b/" a-keys exception) (start-onyx-join num-worker 10 b a dist exception b-keys a-keys nil nil 2))) (defn rolling-join-forward - [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [exception limit] :or {exception false limit nil}}] + [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [col-prefix exception limit] :or {col-prefix ["1" "2"] exception false limit nil}}] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -543,13 +547,13 @@ (throw (Clojask_TypeException. "Rolling keys include non-existent column name(s)."))) (u/init-file dist) ;; print column names - (.printJoinCol a b a-keys b-keys dist) + (.printJoinCol a b a-keys b-keys dist col-prefix) (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) (start-onyx-join num-worker 10 a b dist exception a-keys b-keys a-roll b-roll 4 limit))))) ;; all of the code is the same as above except for the last line (defn rolling-join-backward - [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [exception limit] :or {exception false limit nil}}] + [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [col-prefix exception limit] :or {col-prefix ["1" "2"] exception false limit nil}}] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -568,6 +572,6 @@ (throw (Clojask_TypeException. "Rolling keys include non-existent column name(s)."))) (u/init-file dist) ;; print column names - (.printJoinCol a b a-keys b-keys dist) + (.printJoinCol a b a-keys b-keys dist col-prefix) (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) (start-onyx-join num-worker 10 a b dist exception a-keys b-keys a-roll b-roll 5 nil))))) From e5e9ff9c04da868bf0b590d8ea34b02f6608e0cc Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Sat, 18 Dec 2021 14:03:33 +0800 Subject: [PATCH 05/33] Refactor code in dataframe.clj --- src/main/clojure/clojask/dataframe.clj | 55 ++++++++++++++------------ 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 4d91891..a571093 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -20,25 +20,27 @@ '[com.clojask.exception Clojask_OperationException]) (definterface DFIntf - (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order]) + (checkOutputPath [output-path] "check if output path is of string type") (operate [operation colName] "operate an operation to column and replace in place") (operate [operation colName newCol] "operate an operation to column and add the result as new column") (setType [type colName] "types supported: int double string date") (setParser [parser col] "add the parser for a col which acts like setType") - (colDesc []) - (colTypes []) - (getColIndex []) - (getColNames []) + (colDesc [] "get column description in ColInfo") + (colTypes [] "get column type in ColInfo") + (getColIndex [] "get column indices, excluding deleted columns") + (getColNames [] "get column names") (printCol [output-path] "print column names to output file") (printAggreCol [output-path] "print column names to output file for aggregate") (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join") - (delCol [col-to-del] "delete column(s) in the dataframe") + (delCol [col-to-del] "delete one or more columns in the dataframe") (reorderCol [new-col-order] "reorder columns in the dataframe") - (renameCol [new-col-names] "reorder columns in the dataframe") + (renameCol [new-col-names] "rename columns in the dataframe") (groupby [a] "group the dataframe by the key(s)") (aggregate [a c b] "aggregate the group-by result by the function") - (head [n]) + (head [n] "return first n lines in dataframe") (filter [cols predicate]) + (computeTypeCheck [num-worker output-dir]) + (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order]) (computeGroupAggre [^int num-worker ^String output-dir ^boolean exception]) (computeAggre [^int num-worker ^String output-dir ^boolean exception]) (sort [a b] "sort the dataframe based on columns") @@ -58,6 +60,11 @@ ^Boolean have-col] DFIntf + (checkOutputPath + [this output-path] + (cond (not (= java.lang.String (type output-path))) + (throw (Clojask_TypeException. "Output path should be a string.")))) + (operate ;; has assert [this operation colName] (if (nil? (.operate col-info operation colName)) @@ -136,8 +143,7 @@ (printCol ;; print column names, called by compute [this output-path] - (cond (not (= java.lang.String (type output-path))) - (throw (Clojask_TypeException. "Output path should be a string."))) + (.checkOutputPath this output-path) (let [col-set (.getColNames this)] (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," col-set) "\n"))))) @@ -145,20 +151,17 @@ (printAggreCol ;; print column names, called by computeAggre [this output-path] - (cond (not (= java.lang.String (type output-path))) - (throw (Clojask_TypeException. "Output path should be a string."))) + (.checkOutputPath this output-path) (let [groupby-key-index (.getGroupbyKeys (:row-info this)) groupby-keys (vec (map (.getIndexKey (.col-info this)) (vec (map #(last %) groupby-key-index)))) aggre-new-keys (.getAggreNewKeys (:row-info this))] - ;(println (vec (map #(last %) groupby-key-index))) (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," (concat groupby-keys aggre-new-keys)) "\n"))))) (printJoinCol ;; print column names, called by join APIs [this b-df this-keys b-keys output-path col-prefix] - (cond (not (= java.lang.String (type output-path))) - (throw (Clojask_TypeException. "Output path should be a string."))) + (.checkOutputPath this output-path) (let [a-col-prefix (first col-prefix) b-col-prefix (last col-prefix) a-col-set (.getColNames this) @@ -233,7 +236,6 @@ [this] (doseq [tmp (.getFormatter (:col-info this))] (.operate this (nth tmp 1) (get (.getIndexKey col-info) (nth tmp 0))))) - ;; currently put read file here (preview [this sample-size return-size format] @@ -241,9 +243,18 @@ (throw (Clojask_TypeException. "Arguments passed to preview must be integers."))) (preview/preview this sample-size return-size format)) + (computeTypeCheck + [this num-worker output-dir] + (cond (not (= java.lang.String (type output-dir))) + (throw (Clojask_TypeException. "Output directory should be a string."))) + (cond (not (integer? num-worker)) + (throw (Clojask_TypeException. "Number of workers should be an integer."))) + (if (> num-worker 8) + (throw (Clojask_OperationException. "Max number of worker nodes is 8.")))) + (compute [this ^int num-worker ^String output-dir ^boolean exception ^boolean order] - ;(assert (= java.lang.String (type output-dir)) "output path should be a string") + (.computeTypeCheck this num-worker output-dir) (if (<= num-worker 8) (try (.final this) @@ -257,10 +268,7 @@ (computeAggre [this ^int num-worker ^String output-dir ^boolean exception] - (cond (not (= java.lang.String (type output-dir))) - (throw (Clojask_TypeException. "Output-dir should be a string."))) - (if (> num-worker 8) - (throw (Clojask_OperationException. "Max number of worker nodes is 8."))) + (.computeTypeCheck this num-worker output-dir) (.printAggreCol this output-dir) ;; print column names to output-dir (let [res (start-onyx-aggre-only num-worker batch-size this output-dir exception)] (if (= res "success") @@ -269,8 +277,7 @@ (computeGroupAggre [this ^int num-worker ^String output-dir ^boolean exception] - (cond (not (= java.lang.String (type output-dir))) - (throw (Clojask_TypeException. "Output-dir should be a string."))) + (.computeTypeCheck this num-worker output-dir) (if (<= num-worker 8) (try (let [res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" (.getGroupbyKeys (:row-info this)) exception)] @@ -371,8 +378,6 @@ (DataFrame. path 300 col-info row-info have-col)) (catch Exception e (do - ;; (println "No such file or directory") - ;; (throw e) (throw (Clojask_OperationException. "no such file or directory")) nil)))) From cf47c0857132ce39b2083bf98eca528acb0cca58 Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Sat, 18 Dec 2021 21:05:55 +0800 Subject: [PATCH 06/33] add select to compute and make join lazy --- src/main/clojure/clojask/dataframe.clj | 145 ++++++++++++++---------- src/main/clojure/clojask/onyx_comps.clj | 18 ++- 2 files changed, 96 insertions(+), 67 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 8f93903..7486fb1 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -12,15 +12,15 @@ [clojask.preview :as preview] [clojure.pprint :as pprint]) (:import [clojask.ColInfo ColInfo] - [clojask.RowInfo RowInfo]) + [clojask.RowInfo RowInfo] + [com.clojask.exception Clojask_TypeException] + [com.clojask.exception Clojask_OperationException]) (:refer-clojure :exclude [filter group-by sort])) "The clojask lazy dataframe" -(import '[com.clojask.exception Clojask_TypeException] - '[com.clojask.exception Clojask_OperationException]) (definterface DFIntf - (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order]) + (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select] "final evaluatation") (operate [operation colName] "operate an operation to column and replace in place") (operate [operation colName newCol] "operate an operation to column and add the result as new column") (setType [type colName] "types supported: int double string date") @@ -240,18 +240,22 @@ (preview/preview this sample-size return-size format)) (compute - [this ^int num-worker ^String output-dir ^boolean exception ^boolean order] + [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select] ;(assert (= java.lang.String (type output-dir)) "output path should be a string") - (if (<= num-worker 8) - (try - (.final this) - (.printCol this output-dir) ;; print column names to output-dir - (let [res (start-onyx num-worker batch-size this output-dir exception order)] - (if (= res "success") - "success" - "failed")) - (catch Exception e e)) - (throw (Clojask_OperationException. "Max number of worker nodes is 8.")))) + (let [key-index (.getKeyIndex (:col-info this)) + select (if (coll? select) select [select]) + index (if (= select [nil]) (take (count key-index) (iterate inc 0)) (vals (select-keys key-index select)))] + (assert (= (count select) (count index)) (Clojask_OperationException. "Must select existing columns. You may check it using")) + (if (<= num-worker 8) + (try + (.final this) + (.printCol this output-dir) ;; to-do: based on the index + (let [res (start-onyx num-worker batch-size this output-dir exception order index)] + (if (= res "success") + "success" + "failed")) + (catch Exception e e)) + (throw (Clojask_OperationException. "Max number of worker nodes is 8."))))) (computeAggre [this ^int num-worker ^String output-dir ^boolean exception] @@ -390,15 +394,6 @@ (.errorPredetect this "this function cannot be appended into the current pipeline") result))) -(defn compute - [this num-worker output-dir & {:keys [exception order] :or {exception false order true}}] - (u/init-file output-dir) - (if (= (.getAggreFunc (:row-info this)) []) - (.compute this num-worker output-dir exception order) - (if (not= (.getGroupbyKeys (:row-info this)) []) - (.computeGroupAggre this num-worker output-dir exception) - (.computeAggre this num-worker output-dir exception)))) - (defn group-by [this key] (let [result (.groupby this key)] @@ -466,8 +461,35 @@ (.errorPredetect this "invalid arguments passed to rename-col function") result)) +;; ============= Below is the definition for the joineddataframe ================ +(definterface JDFIntf + (getColNames [] "get the names of all the columns") + (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude])) + +(defrecord JoinedDataFrame + [^DataFrame a + ^DataFrame b + a-keys + b-keys + a-roll + b-roll + type + limit] + JDFIntf + (getColNames + [this]) + + (compute + [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude] + (let [] + (u/init-file output-dir) + ;; print column names + ;; (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames + (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) + (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit)))) + (defn inner-join - [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}] + [a b a-keys b-keys] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -478,15 +500,15 @@ (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) (throw (Clojask_TypeException. "Input includes non-existent column name(s)."))) - (u/init-file dist) - ;; print column names - (.printJoinCol a b a-keys b-keys dist) - ;; first group b by keys - (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) - (start-onyx-join num-worker 10 a b dist exception a-keys b-keys nil nil 1))) + (let [a-file (io/file (:path a)) + b-file (io/file (:path b))] + (if (<= (.length a-file) (.length b-file)) + (JoinedDataFrame. a b a-keys b-keys nil nil 1 nil) + (JoinedDataFrame. b a b-keys a-keys nil nil 1 nil))) + )) (defn left-join - [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}] + [a b a-keys b-keys] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -497,15 +519,10 @@ (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) (throw (Clojask_TypeException. "Input includes non-existent column name(s)."))) - (u/init-file dist) - ;; print column names - (.printJoinCol a b a-keys b-keys dist) - ;; first group b by keys - (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) - (start-onyx-join num-worker 10 a b dist exception a-keys b-keys nil nil 2))) + (JoinedDataFrame. a b a-keys b-keys nil nil 2 nil))) (defn right-join - [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}] + [a b a-keys b-keys] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -516,15 +533,10 @@ (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) (throw (Clojask_TypeException. "Input includes non-existent column name(s)."))) - (u/init-file dist) - ;; print column names - (.printJoinCol b a a-keys b-keys dist) - ;; first group b by keys - (start-onyx-groupby num-worker 10 a "./_clojask/join/b/" a-keys exception) - (start-onyx-join num-worker 10 b a dist exception b-keys a-keys nil nil 2))) + (JoinedDataFrame. b a b-keys a-keys nil nil 2 nil))) (defn rolling-join-forward - [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [exception limit] :or {exception false limit nil}}] + [a b a-keys b-keys a-roll b-roll & {:keys [limit] :or {limit nil}}] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -541,15 +553,11 @@ (do (cond (not (and (not= a-roll nil) (not= b-roll nil))) (throw (Clojask_TypeException. "Rolling keys include non-existent column name(s)."))) - (u/init-file dist) - ;; print column names - (.printJoinCol a b a-keys b-keys dist) - (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) - (start-onyx-join num-worker 10 a b dist exception a-keys b-keys a-roll b-roll 4 limit))))) + (JoinedDataFrame. a b a-keys b-keys a-roll b-roll 4 limit))))) ;; all of the code is the same as above except for the last line (defn rolling-join-backward - [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [exception limit] :or {exception false limit nil}}] + [a b a-keys b-keys a-roll b-roll & {:keys [limit] :or {limit nil}}] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) @@ -566,8 +574,31 @@ (do (cond (not (and (not= a-roll nil) (not= b-roll nil))) (throw (Clojask_TypeException. "Rolling keys include non-existent column name(s)."))) - (u/init-file dist) - ;; print column names - (.printJoinCol a b a-keys b-keys dist) - (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) - (start-onyx-join num-worker 10 a b dist exception a-keys b-keys a-roll b-roll 5 nil))))) + (JoinedDataFrame. a b a-keys b-keys a-roll b-roll 5 limit))))) + +(defn compute + [this num-worker output-dir & {:keys [exception order select exclude] :or {exception false order true select nil exclude nil}}] + (assert (or (nil? select) (nil? exclude)) "can only specify either of them") + (u/init-file output-dir) + ;; check which type of dataframe this is + (if (= (type this) clojask.dataframe.DataFrame) + (if (= (.getAggreFunc (:row-info this)) []) + (let [exclude (if (coll? exclude) exclude [exclude]) + select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (keys (.getKeyIndex (:col-info this))))) nil))] + (.compute this num-worker output-dir exception order select)) + (if (not= (.getGroupbyKeys (:row-info this)) []) + (let [exclude (if (coll? exclude) exclude [exclude]) + select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (keys (.getKeyIndex (:col-info this))))) nil))] + (.computeGroupAggre this num-worker output-dir exception)) + (.computeAggre this num-worker output-dir exception))) + (if (= (type this) clojask.dataframe.JoinedDataFrame) + (.compute this num-worker output-dir exception order select exclude) + (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe")) + ))) + +(defn get-col-names + "Get the names for the columns in sequence" + [this] + ;; to-do: should implement both for the DataFrame and JoinedDataFrame + (.getColNames this) + ) diff --git a/src/main/clojure/clojask/onyx_comps.clj b/src/main/clojure/clojask/onyx_comps.clj index 964ae2a..50aef35 100644 --- a/src/main/clojure/clojask/onyx_comps.clj +++ b/src/main/clojure/clojask/onyx_comps.clj @@ -61,17 +61,15 @@ ;; (zipmap keys ))) (defn worker-func-gen - [df exception] + [df exception index] (reset! dataframe df) (let [operations (.getDesc (:col-info (deref dataframe))) types (.getType (:col-info (deref dataframe))) filters (.getFilters (:row-info df)) indices-deleted (.getDeletedCol (:col-info (deref dataframe))) indices-wo-del (vec (take (count operations) (iterate inc 0))) - indices (if (empty? indices-deleted) - indices-wo-del ; no columns deleted - (vec (set/difference (set indices-wo-del) (set indices-deleted))) ; minus column indices deleted - )] + indices index] + ;; (println indices) (if exception (defn worker-func [seg] @@ -475,11 +473,11 @@ (defn start-onyx "start the onyx cluster with the specification inside dataframe" - [num-work batch-size dataframe dist exception order] + [num-work batch-size dataframe dist exception order index] (try (workflow-gen num-work) (config-env) - (worker-func-gen dataframe exception) ;;need some work + (worker-func-gen dataframe exception index) ;;need some work (catalog-gen num-work batch-size) (lifecycle-gen (.path dataframe) dist order) (flow-cond-gen num-work) @@ -510,7 +508,7 @@ (try (workflow-gen num-work) (config-env) - (worker-func-gen dataframe exception) ;;need some work + (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work (catalog-aggre-gen num-work batch-size) (lifecycle-aggre-gen (.path dataframe) dist) (flow-cond-gen num-work) @@ -542,7 +540,7 @@ (try (workflow-gen num-work) (config-env) - (worker-func-gen dataframe exception) ;;need some work + (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work (catalog-groupby-gen num-work batch-size) (lifecycle-groupby-gen (.path dataframe) dist groupby-keys (.getKeyIndex (.col-info dataframe))) (flow-cond-gen num-work) @@ -575,7 +573,7 @@ (try (workflow-gen num-work) (config-env) - (worker-func-gen dataframe exception) ;;need some work + (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work (catalog-join-gen num-work batch-size) (lifecycle-join-gen (.path dataframe) dist dataframe b a-keys b-keys a-roll b-roll join-type) (flow-cond-gen num-work) From 8f57b0e5a05f4eef4cd8025508c7a6bd32a713f3 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Mon, 20 Dec 2021 14:41:16 +0800 Subject: [PATCH 07/33] Implemented getColNames for Join DF --- src/main/clojure/clojask/dataframe.clj | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 27f6d73..b7b15c1 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -485,7 +485,14 @@ prefix] JDFIntf (getColNames - [this]) + [this] + (let [a-col-prefix (first prefix) + b-col-prefix (last prefix) + a-col-set (.getColNames a) + b-col-set (.getColNames b) + a-col-header (mapv #(str a-col-prefix "_" %) a-col-set) + b-col-header (mapv #(str b-col-prefix "_" %) b-col-set)] + (conj a-col-header b-col-header))) (compute [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude] From d18a8ba5cd05d9a0707b9078ec6f9f10157f3d18 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Mon, 20 Dec 2021 20:20:05 +0800 Subject: [PATCH 08/33] Amend test file for new Join API syntax --- test/clojask/core_test.clj | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj index 158146f..c4b1cef 100644 --- a/test/clojask/core_test.clj +++ b/test/clojask/core_test.clj @@ -66,30 +66,30 @@ (testing "Join dataframes APIs" (def x (dataframe "test/clojask/Employees-example.csv")) (def y (dataframe "test/clojask/Employees-example.csv")) - (is (= "success" (left-join x y ["Employee"] ["Employee"] 8 "resources/test.csv" :exception false))) - (is (= "success" (right-join x y ["Employee"] ["Employee"] 8 "resources/test.csv" :exception false))) - (is (= "success" (inner-join x y ["Employee"] ["Employee"] 8 "resources/test.csv" :exception false))) - (is (= "success" (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary" 8 "resources/test.csv" :exception false))) + (is (= "success" (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) + ;; (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) + ;; (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) + ;; (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"]) "Salary" "Salary" 8 "resources/test.csv" :exception false))) )) (deftest join-api-output-test (testing "Join dataframes APIs" (def x (dataframe "test/clojask/Employees-example.csv")) (def y (dataframe "test/clojask/Employees-info-example.csv")) - (left-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-4.csv" :exception false) + (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false) (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-4.csv)" "<(sort test/clojask/correct_outputs/1-4.csv)")] (is (= "" (:out result)))) - (right-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-5.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")] - (is (= "" (:out result)))) - (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")] - (is (= "" (:out result)))) - (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate" 8 "test/clojask/test_outputs/1-7.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")] - (is (= "" (:out result)))) - (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate" 8 "test/clojask/test_outputs/1-8.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")] - (is (= "" (:out result)))) + ;; (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false) + ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")] + ;; (is (= "" (:out result)))) + ;; (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false) + ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")] + ;; (is (= "" (:out result)))) + ;; (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false) + ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")] + ;; (is (= "" (:out result)))) + ;; (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false) + ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")] + ;; (is (= "" (:out result)))) )) From d36dc7fd92dd0673a5654a8c765c44711fbbb902 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Mon, 20 Dec 2021 22:14:40 +0800 Subject: [PATCH 09/33] Implement Join DF printCol, amend bug in getColNames --- src/main/clojure/clojask/dataframe.clj | 30 ++++++++++++++++++-------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index b7b15c1..ec6f542 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -158,6 +158,7 @@ (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," (concat groupby-keys aggre-new-keys)) "\n"))))) + ;; !! deprecated (printJoinCol ;; print column names, called by join APIs [this b-df this-keys b-keys output-path col-prefix] @@ -470,7 +471,9 @@ ;; ============= Below is the definition for the joineddataframe ================ (definterface JDFIntf + (checkOutputPath [output-path] "check if output path is of string type") (getColNames [] "get the names of all the columns") + (printCol [output-path] "print column names to output file") (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude])) (defrecord JoinedDataFrame @@ -484,6 +487,7 @@ limit prefix] JDFIntf + (getColNames [this] (let [a-col-prefix (first prefix) @@ -492,16 +496,24 @@ b-col-set (.getColNames b) a-col-header (mapv #(str a-col-prefix "_" %) a-col-set) b-col-header (mapv #(str b-col-prefix "_" %) b-col-set)] - (conj a-col-header b-col-header))) + (concat a-col-header b-col-header))) + (printCol + ;; print column names, called by compute + [this output-path] + (let [col-set (.getColNames this)] + (with-open [wrtr (io/writer output-path)] + (.write wrtr (str (str/join "," col-set) "\n"))))) + (compute - [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude] - (let [] - (u/init-file output-dir) - ;; print column names - ;; (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames - (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) - (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit)))) + [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude] + (let [] + (u/init-file output-dir) + ;; print column names + ;; (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done + (.printCol this output-dir) + (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) + (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit)))) (defn inner-join [a b a-keys b-keys & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}] @@ -620,6 +632,6 @@ (defn get-col-names "Get the names for the columns in sequence" [this] - ;; to-do: should implement both for the DataFrame and JoinedDataFrame + ;; to-do: should implement both for the DataFrame and JoinedDataFrame => Done (.getColNames this) ) From 69faacb152d6f35aae4e20a11391d3de4c6e54b1 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Mon, 20 Dec 2021 22:50:50 +0800 Subject: [PATCH 10/33] Amend getColNames logic to retrieve columns following the order of indices --- src/main/clojure/clojask/dataframe.clj | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index ec6f542..39eeca2 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -136,9 +136,11 @@ (getColNames [this] (let [index-key (.getIndexKey (:col-info this)) - index (.getColIndex this) - header (mapv index-key index)] - header)) + index (.getColIndex this)] + ;header (mapv index-key index)] + ;(mapv (fn [i] (get {0 "Employee", 1 "EmployeeName", 2 "Department", 3 "Salary"} i)) [0 2 2 2]) + (mapv (fn [i] (get index-key i)) index) + )) (printCol ;; print column names, called by compute @@ -263,7 +265,7 @@ (if (<= num-worker 8) (try (.final this) - (.printCol this output-dir) ;; to-do: based on the index + (.printCol this output-dir) ;; to-do: based on the index => Done (let [res (start-onyx num-worker batch-size this output-dir exception order index)] (if (= res "success") "success" From 88b549bcd042d1aac4e13cfeed3b2d356e6176a2 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Wed, 22 Dec 2021 14:13:36 +0800 Subject: [PATCH 11/33] Update debug code --- src/main/clojure/clojask/debug.clj | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/clojure/clojask/debug.clj b/src/main/clojure/clojask/debug.clj index 331d428..c00a19f 100644 --- a/src/main/clojure/clojask/debug.clj +++ b/src/main/clojure/clojask/debug.clj @@ -13,15 +13,16 @@ ;(def x "Hello world") ;(-> (clojure.core/format "Expression '%s' not defined." x)(MyOwnException.)(throw)) - (def y (dataframe "resources/Employees.csv" :have-col true)) - ;(def y (dataframe "resources/Employees-info.csv" :have-col true)) - ;(time (left-join x y ["Employee"] ["Employee"] 8 "resources/test.csv" :exception false)) + (def x (dataframe "resources/Employees.csv" :have-col true)) + (def y (dataframe "resources/Employees-info.csv" :have-col true)) + (def z (left-join x y ["Employee"] ["Employee"])) + (time (compute x 8 "resources/test.csv" :exception true)) + ;(time (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "Employee" "Employee" 8 "resources/test.csv" :exception false)) - (select-col y ["Salary" "EmployeeName"]) + ;(select-col y ["Salary" "EmployeeName"]) ;(delete-col y ["Salary" "EmployeeName"]) - (print-df y) - (time (compute y 8 "resources/test.csv" :exception true)) + ;(print-df y) ;(println (.getKeys (.col-info y))) ;(set-type y "Salary" "double") From 7276719030e04ebd66323dbbcc2d054b0c5acda5 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Wed, 22 Dec 2021 14:51:23 +0800 Subject: [PATCH 12/33] Change getColNames to incorporate aggregated/grouped-by dataframes --- src/main/clojure/clojask/dataframe.clj | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 39eeca2..ffa8eeb 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -135,12 +135,17 @@ (getColNames [this] - (let [index-key (.getIndexKey (:col-info this)) - index (.getColIndex this)] - ;header (mapv index-key index)] - ;(mapv (fn [i] (get {0 "Employee", 1 "EmployeeName", 2 "Department", 3 "Salary"} i)) [0 2 2 2]) - (mapv (fn [i] (get index-key i)) index) - )) + (if (and (= 0 (count (.getGroupbyKeys (:row-info this)))) (= 0 (count (.getAggreNewKeys (:row-info this))))) + ;; not aggregate + (let [index-key (.getIndexKey (:col-info this)) + index (.getColIndex this)] + ;(mapv (fn [i] (get {0 "Employee", 1 "EmployeeName", 2 "Department", 3 "Salary"} i)) [0 2 2 2]) + (mapv (fn [i] (get index-key i)) index)) + ;; if aggregate + (let [groupby-key-index (.getGroupbyKeys (:row-info this)) + groupby-keys (vec (map (.getIndexKey (.col-info this)) (vec (map #(last %) groupby-key-index)))) + aggre-new-keys (.getAggreNewKeys (:row-info this))] + (concat groupby-keys aggre-new-keys)))) (printCol ;; print column names, called by compute @@ -150,6 +155,7 @@ (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," col-set) "\n"))))) + ;; !! deprecated (printAggreCol ;; print column names, called by computeAggre [this output-path] @@ -276,7 +282,8 @@ (computeAggre [this ^int num-worker ^String output-dir ^boolean exception] (.computeTypeCheck this num-worker output-dir) - (.printAggreCol this output-dir) ;; print column names to output-dir + ;(.printAggreCol this output-dir) ;; print column names to output-dir + (.printCol this output-dir) (let [res (start-onyx-aggre-only num-worker batch-size this output-dir exception)] (if (= res "success") "success" @@ -288,7 +295,8 @@ (if (<= num-worker 8) (try (let [res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" (.getGroupbyKeys (:row-info this)) exception)] - (.printAggreCol this output-dir) ;; print column names to output-dir + ;(.printAggreCol this output-dir) ;; print column names to output-dir + (.printCol this output-dir) (if (= res "success") ;; (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception)) (if From 46f91d4b445aa5b992be20a59b1a2011c8da42e6 Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Thu, 23 Dec 2021 18:27:57 +0800 Subject: [PATCH 13/33] select on DataFrame --- .../clojure/aggregate/aggre_onyx_comps.clj | 22 +++-- src/main/clojure/clojask/ColInfo.clj | 2 +- src/main/clojure/clojask/clojask_aggre.clj | 25 ++++-- src/main/clojure/clojask/dataframe.clj | 82 +++++++++++++------ src/main/clojure/clojask/onyx_comps.clj | 13 ++- src/main/clojure/clojask/utils.clj | 8 +- 6 files changed, 102 insertions(+), 50 deletions(-) diff --git a/src/main/clojure/aggregate/aggre_onyx_comps.clj b/src/main/clojure/aggregate/aggre_onyx_comps.clj index 4cbfb48..fd6c0c1 100644 --- a/src/main/clojure/aggregate/aggre_onyx_comps.clj +++ b/src/main/clojure/aggregate/aggre_onyx_comps.clj @@ -8,7 +8,7 @@ [onyx.test-helper :refer [with-test-env feedback-exception!]] [tech.v3.dataset :as ds] [clojure.data.csv :as csv] - [clojask.utils :refer [eval-res eval-res-ne filter-check]] + [clojask.utils :as u] [clojure.set :as set] [clojask.groupby :refer [read-csv-seq]]) (:import (java.io BufferedReader FileReader BufferedWriter FileWriter))) @@ -38,10 +38,11 @@ (defn worker-func-gen - [df exception] + [df exception aggre-funcs index formatter] (reset! dataframe df) - (let [aggre-funcs (.getAggreFunc (.row-info (deref dataframe))) - formatters (.getFormatter (.col-info (deref dataframe))) + (let [ + ;; aggre-funcs (.getAggreFunc (.row-info (deref dataframe))) + formatters formatter ;; key-index (.getKeyIndex (.col-info (deref dataframe))) ;; formatters (set/rename-keys formatters key-index) ] @@ -52,7 +53,10 @@ (let [data (read-csv-seq (:file seq)) pre (:d seq) data-map (-> (iterate inc 0) - (zipmap (apply map vector data)))] + (zipmap (apply map vector data))) + reorder (fn [a b] + ;; (println [a b]) + (u/gets (concat a b) index))] ;; (mapv (fn [_] ;; (let [func (first _) ;; index (nth _ 1)] @@ -62,7 +66,9 @@ res []] (if (= aggre-funcs []) ;; {:d (vec (concat pre res))} - {:d (mapv concat (repeat pre) (apply map vector res))} + (if (= res []) + {:d (u/gets [pre] index)} + {:d (mapv reorder (repeat pre) (apply map vector res))}) (let [func (first (first aggre-funcs)) index (nth (first aggre-funcs) 1) res-funcs (rest aggre-funcs) @@ -252,11 +258,11 @@ (defn start-onyx-aggre "start the onyx cluster with the specification inside dataframe" - [num-work batch-size dataframe dist exception] + [num-work batch-size dataframe dist exception aggre-func index formatter] (try (workflow-gen num-work) (config-env) - (worker-func-gen dataframe exception) ;;need some work + (worker-func-gen dataframe exception aggre-func index formatter) ;;need some work (catalog-gen num-work batch-size) (lifecycle-gen "./_clojask/grouped" dist) (flow-cond-gen num-work) diff --git a/src/main/clojure/clojask/ColInfo.clj b/src/main/clojure/clojask/ColInfo.clj index 8015ad0..0be2d10 100644 --- a/src/main/clojure/clojask/ColInfo.clj +++ b/src/main/clojure/clojask/ColInfo.clj @@ -107,7 +107,7 @@ (getKeys [this] - col-keys) + (mapv (fn [index] (get index-key index)) (take (count index-key) (iterate inc 0)))) (getKeyIndex [this] diff --git a/src/main/clojure/clojask/clojask_aggre.clj b/src/main/clojure/clojask/clojask_aggre.clj index d4f28ed..456cc58 100644 --- a/src/main/clojure/clojask/clojask_aggre.clj +++ b/src/main/clojure/clojask/clojask_aggre.clj @@ -4,14 +4,19 @@ [clojure.java.io :as io] [taoensso.timbre :refer [debug info] :as timbre] [clojure.string :as string] - [clojask.api.aggregate :refer [start]]) + [clojask.api.aggregate :refer [start]] + [clojask.utils :as u]) (:import (java.io BufferedReader FileReader BufferedWriter FileWriter))) (def df (atom nil)) +(def aggre-func (atom nil)) +(def select (atom nil)) (defn inject-dataframe - [dataframe] + [dataframe a b] (reset! df dataframe) + (reset! aggre-func a) + (reset! select b) ) (defn c-count @@ -39,7 +44,9 @@ :lifecycle/after-task-stop close-writer}) (defrecord ClojaskOutput - [memo] + [memo + aggre-func + select] p/Plugin (start [this event] ;; Initialize the plugin, generally by assoc'ing any initial state. @@ -52,7 +59,7 @@ (let [data (mapv (fn [_] (if (coll? _) _ [_])) (deref memo))] ;; (.write (:clojask/wtr event) (str data "\n")) (if (apply = (map count data)) - (mapv #(.write (:clojask/wtr event) (str (string/join "," %) "\n")) (apply map vector data)) + (mapv #(.write (:clojask/wtr event) (str (string/join "," (u/gets % select)) "\n")) (apply map vector data)) (throw (Exception. "aggregation result is not of the same length")))) this) @@ -86,7 +93,7 @@ ;; before write-batch is called repeatedly. true) - (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/aggre-func]} replica messenger] + (write-batch [this {:keys [onyx.core/write-batch clojask/wtr]} replica messenger] ;; keys [:Departement] ;; Write the batch to your datasink. ;; In this case we are conjoining elements onto a collection. @@ -111,6 +118,8 @@ ;; from your task-map here, in order to improve the performance of your plugin ;; Extending the function below is likely good for most use cases. (defn output [pipeline-data] - (let [aggre-func (.getAggreFunc (:row-info (deref df)))] - (->ClojaskOutput (volatile! (doall (take (count aggre-func) - (repeat start))))))) \ No newline at end of file + (let [] + (->ClojaskOutput (volatile! (doall (take (count (deref aggre-func)) + (repeat start)))) + (deref aggre-func) + (deref select)))) \ No newline at end of file diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index ffa8eeb..88abdd5 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -41,8 +41,8 @@ (head [n] "return first n lines in dataframe") (filter [cols predicate]) (computeTypeCheck [num-worker output-dir]) - (computeGroupAggre [^int num-worker ^String output-dir ^boolean exception]) - (computeAggre [^int num-worker ^String output-dir ^boolean exception]) + (computeGroupAggre [^int num-worker ^String output-dir ^boolean exception select]) + (computeAggre [^int num-worker ^String output-dir ^boolean exception select]) (sort [a b] "sort the dataframe based on columns") (addFormatter [a b] "format the column as the last step of the computation") (preview [sample-size output-size format] "quickly return a vector of maps about the resultant dataframe") @@ -280,31 +280,60 @@ (throw (Clojask_OperationException. "Max number of worker nodes is 8."))))) (computeAggre - [this ^int num-worker ^String output-dir ^boolean exception] + [this ^int num-worker ^String output-dir ^boolean exception select] (.computeTypeCheck this num-worker output-dir) ;(.printAggreCol this output-dir) ;; print column names to output-dir (.printCol this output-dir) - (let [res (start-onyx-aggre-only num-worker batch-size this output-dir exception)] + (let [aggre-keys (.getAggreFunc row-info) + select (if (coll? select) select [select]) + select (if (= select [nil]) + (vec (take (count aggre-keys) (iterate inc 0))) + (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) + aggre-func (u/gets aggre-keys (vec (apply sorted-set select))) + select (mapv (fn [num] (count (remove #(>= % num) select))) select) + index (vec (apply sorted-set (mapv #(nth % 1) aggre-func))) + shift-func (fn [pair] + [(first pair) (let [num (nth pair 1)] + (.indexOf index num))]) + aggre-func (mapv shift-func aggre-func) + ;; test (println [select index aggre-func]) + res (start-onyx-aggre-only num-worker batch-size this output-dir exception aggre-func index select)] (if (= res "success") "success" "failed"))) (computeGroupAggre - [this ^int num-worker ^String output-dir ^boolean exception] + [this ^int num-worker ^String output-dir ^boolean exception select] (.computeTypeCheck this num-worker output-dir) (if (<= num-worker 8) (try - (let [res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" (.getGroupbyKeys (:row-info this)) exception)] + (let [groupby-keys (.getGroupbyKeys row-info) + aggre-keys (.getAggreFunc row-info) + select (if (coll? select) select [select]) + select (if (= select [nil]) + (vec (take (+ (count groupby-keys) (count aggre-keys)) (iterate inc 0))) + (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) + ;; pre-index (remove #(>= % (count groupby-keys)) select) + data-index (mapv #(- % (count groupby-keys)) (remove #(< % (count groupby-keys)) select)) + groupby-index (vec (apply sorted-set (mapv #(nth % 1) (concat groupby-keys (u/gets aggre-keys data-index))))) + ;; test (println [groupby-keys aggre-keys select pre-index data-index]) + res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)] ;(.printAggreCol this output-dir) ;; print column names to output-dir (.printCol this output-dir) (if (= res "success") ;; (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception)) - (if + (let [shift-func (fn [pair] + [(first pair) (let [index (nth pair 1)] + (.indexOf groupby-index index))]) + aggre-func (mapv shift-func (u/gets aggre-keys data-index)) + formatter (.getFormatter (.col-info this)) + formatter (set/rename-keys formatter (zipmap groupby-index (iterate inc 0)))] + (if ;; (internal-aggregate (.getAggreFunc (:row-info this)) output-dir (.getKeyIndex col-info) (.getGroupbyKeys (:row-info this)) (.getAggreOldKeys (:row-info this)) (.getAggreNewKeys (:row-info this))) - (start-onyx-aggre num-worker batch-size this output-dir exception) + (start-onyx-aggre num-worker batch-size this output-dir exception aggre-func select formatter) "success" - (throw (Clojask_OperationException. "Error in running start-onyx-aggre."))) - (throw (Clojask_OperationException. "Error in running start-onyx-groupby.")))) + (throw (Clojask_OperationException. "Error when aggregating.")))) + (throw (Clojask_OperationException. "Error when grouping by.")))) (catch Exception e e)) (throw (Clojask_OperationException. "Max number of worker nodes is 8.")))) @@ -516,13 +545,17 @@ (.write wrtr (str (str/join "," col-set) "\n"))))) (compute - [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude] - (let [] + [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select] + (let [select (if (coll? select) select [select]) + select (if (= select [nil]) + (vec (take (+ (count (.getKeyIndex (.col-info a))) (count (.getKeyIndex (.col-info b)))) (iterate inc 0))) + (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) + ] (u/init-file output-dir) ;; print column names ;; (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done (.printCol this output-dir) - (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception) + (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception []) ;; todo (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit)))) (defn inner-join @@ -541,7 +574,7 @@ b-file (io/file (:path b))] (if (<= (.length a-file) (.length b-file)) (JoinedDataFrame. a b a-keys b-keys nil nil 1 nil col-prefix) - (JoinedDataFrame. b a b-keys a-keys nil nil 1 nil [(nth col-prefix 1) (nth col-prefix 2)]))) + (JoinedDataFrame. b a b-keys a-keys nil nil 1 nil [(nth col-prefix 1) (nth col-prefix 0)]))) )) @@ -573,7 +606,7 @@ (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) (throw (Clojask_TypeException. "Input includes non-existent column name(s)."))) - (JoinedDataFrame. b a b-keys a-keys nil nil 2 nil [(nth col-prefix 1) (nth col-prefix 2)]))) + (JoinedDataFrame. b a b-keys a-keys nil nil 2 nil [(nth col-prefix 1) (nth col-prefix 0)]))) (defn rolling-join-forward @@ -625,17 +658,16 @@ (u/init-file output-dir) ;; check which type of dataframe this is (if (= (type this) clojask.dataframe.DataFrame) - (if (= (.getAggreFunc (:row-info this)) []) - (let [exclude (if (coll? exclude) exclude [exclude]) - select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (keys (.getKeyIndex (:col-info this))))) nil))] - (.compute this num-worker output-dir exception order select)) - (if (not= (.getGroupbyKeys (:row-info this)) []) - (let [exclude (if (coll? exclude) exclude [exclude]) - select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (keys (.getKeyIndex (:col-info this))))) nil))] - (.computeGroupAggre this num-worker output-dir exception)) - (.computeAggre this num-worker output-dir exception))) + (let [exclude (if (coll? exclude) exclude [exclude]) + select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))] + (assert (not= select []) "must select at least on column") + (if (= (.getAggreFunc (:row-info this)) []) + (.compute this num-worker output-dir exception order select) + (if (not= (.getGroupbyKeys (:row-info this)) []) + (.computeGroupAggre this num-worker output-dir exception select) + (.computeAggre this num-worker output-dir exception select)))) (if (= (type this) clojask.dataframe.JoinedDataFrame) - (.compute this num-worker output-dir exception order select exclude) + (.compute this num-worker output-dir exception order select) (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe")) ))) diff --git a/src/main/clojure/clojask/onyx_comps.clj b/src/main/clojure/clojask/onyx_comps.clj index 50aef35..dd5236c 100644 --- a/src/main/clojure/clojask/onyx_comps.clj +++ b/src/main/clojure/clojask/onyx_comps.clj @@ -66,8 +66,6 @@ (let [operations (.getDesc (:col-info (deref dataframe))) types (.getType (:col-info (deref dataframe))) filters (.getFilters (:row-info df)) - indices-deleted (.getDeletedCol (:col-info (deref dataframe))) - indices-wo-del (vec (take (count operations) (iterate inc 0))) indices index] ;; (println indices) (if exception @@ -504,16 +502,16 @@ (defn start-onyx-aggre-only "start the onyx cluster with the specification inside dataframe" - [num-work batch-size dataframe dist exception] + [num-work batch-size dataframe dist exception aggre-func index select] (try (workflow-gen num-work) (config-env) - (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work + (worker-func-gen dataframe exception index) ;;need some work (catalog-aggre-gen num-work batch-size) (lifecycle-aggre-gen (.path dataframe) dist) (flow-cond-gen num-work) (input/inject-dataframe dataframe) - (aggre/inject-dataframe dataframe) + (aggre/inject-dataframe dataframe aggre-func select) (catch Exception e (throw (Exception. (str "[preparing stage] " (.getMessage e)))))) (try (let [submission (onyx.api/submit-job peer-config @@ -536,11 +534,12 @@ (defn start-onyx-groupby "start the onyx cluster with the specification inside dataframe" - [num-work batch-size dataframe dist groupby-keys exception] + [num-work batch-size dataframe dist groupby-keys groupby-index exception] + ;; (println groupby-index) (try (workflow-gen num-work) (config-env) - (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work + (worker-func-gen dataframe exception groupby-index) ;;need some work (catalog-groupby-gen num-work batch-size) (lifecycle-groupby-gen (.path dataframe) dist groupby-keys (.getKeyIndex (.col-info dataframe))) (flow-cond-gen num-work) diff --git a/src/main/clojure/clojask/utils.clj b/src/main/clojure/clojask/utils.clj index e6d5cb1..cd3852b 100644 --- a/src/main/clojure/clojask/utils.clj +++ b/src/main/clojure/clojask/utils.clj @@ -11,6 +11,12 @@ (java.time.format DateTimeFormatter))) "Utility function used in dataframe" +(defn gets + "unlike core/get, get elements from indices" + [coll indices] + (mapv #(nth coll %) indices) + ) + (defn get-key [row types key-index key] (let [index (get key-index key)] @@ -277,4 +283,4 @@ (if (string? input) [[nil input]] nil)) - (catch Exception e nil))) \ No newline at end of file + (catch Exception e nil))) From 967918b46351ab767b86e54ddb4f6587c849f5f0 Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Fri, 24 Dec 2021 00:25:22 +0800 Subject: [PATCH 14/33] select for aggregate, join --- .../clojure/aggregate/aggre_onyx_comps.clj | 2 +- src/main/clojure/clojask/clojask_groupby.clj | 12 +- src/main/clojure/clojask/clojask_join.clj | 71 ++------- src/main/clojure/clojask/dataframe.clj | 43 +++--- src/main/clojure/clojask/debug.clj | 2 +- src/main/clojure/clojask/groupby.clj | 7 +- src/main/clojure/clojask/join.clj | 137 ++---------------- src/main/clojure/clojask/onyx_comps.clj | 8 +- src/main/clojure/clojask/utils.clj | 129 +++++++++++------ 9 files changed, 154 insertions(+), 257 deletions(-) diff --git a/src/main/clojure/aggregate/aggre_onyx_comps.clj b/src/main/clojure/aggregate/aggre_onyx_comps.clj index fd6c0c1..5dd3521 100644 --- a/src/main/clojure/aggregate/aggre_onyx_comps.clj +++ b/src/main/clojure/aggregate/aggre_onyx_comps.clj @@ -67,7 +67,7 @@ (if (= aggre-funcs []) ;; {:d (vec (concat pre res))} (if (= res []) - {:d (u/gets [pre] index)} + {:d [(u/gets pre index)]} {:d (mapv reorder (repeat pre) (apply map vector res))}) (let [func (first (first aggre-funcs)) index (nth (first aggre-funcs) 1) diff --git a/src/main/clojure/clojask/clojask_groupby.clj b/src/main/clojure/clojask/clojask_groupby.clj index 19739c2..3405036 100644 --- a/src/main/clojure/clojask/clojask_groupby.clj +++ b/src/main/clojure/clojask/clojask_groupby.clj @@ -8,11 +8,13 @@ (def dataframe (atom nil)) (def groupby-keys (atom nil)) +(def write-index (atom nil)) (defn inject-dataframe - [df groupby-key] + [df groupby-key index] (reset! dataframe df) - (reset! groupby-keys groupby-key)) + (reset! groupby-keys groupby-key) + (reset! write-index index)) (defn- inject-into-eventmap [event lifecycle] @@ -35,7 +37,7 @@ (def writer-aggre-calls {:lifecycle/before-task-start inject-into-eventmap}) -(defrecord ClojaskGroupby [] +(defrecord ClojaskGroupby [write-index] p/Plugin (start [this event] ;; Initialize the plugin, generally by assoc'ing any initial state. @@ -90,7 +92,7 @@ ;(.write wtr (str msg "\n")) ;; !! define argument (debug) ;; (def groupby-keys [:Department :EmployeeName]) - (output-groupby dist (:d msg) groupby-keys key-index formatter))) + (output-groupby dist (:d msg) groupby-keys key-index formatter write-index))) (recur (rest batch))))) true)) @@ -101,4 +103,4 @@ ;; from your task-map here, in order to improve the performance of your plugin ;; Extending the function below is likely good for most use cases. (defn groupby [pipeline-data] - (->ClojaskGroupby)) \ No newline at end of file + (->ClojaskGroupby (deref write-index))) \ No newline at end of file diff --git a/src/main/clojure/clojask/clojask_join.clj b/src/main/clojure/clojask/clojask_join.clj index 3b2e093..fa9d9c1 100644 --- a/src/main/clojure/clojask/clojask_join.clj +++ b/src/main/clojure/clojask/clojask_join.clj @@ -11,13 +11,21 @@ (def b (atom nil)) (def a-keys (atom nil)) (def b-keys (atom nil)) +(def a-index (atom nil)) +(def b-index (atom nil)) +(def b-format (atom nil)) +(def join-index (atom nil)) (defn inject-dataframe - [d-a d-b a-key b-key] + [d-a d-b a-key b-key -a-index -b-index -join-index -b-format] (reset! a d-a) (reset! b d-b) (reset! a-keys a-key) - (reset! b-keys b-key)) + (reset! b-keys b-key) + (reset! a-index -a-index) + (reset! b-index -b-index) + (reset! b-format -b-format) + (reset! join-index -join-index)) (defn- inject-into-eventmap [event lifecycle] @@ -38,8 +46,6 @@ :clojask/b-map (:clojask/b-map lifecycle) :clojask/a-format a-format :clojask/b-format b-format - :clojask/a-index (take (count (:clojask/a-map lifecycle)) (iterate inc 0)) - :clojask/b-index (take (count (:clojask/b-map lifecycle)) (iterate inc 0)) :clojask/join-type (:clojask/join-type lifecycle)})) (defn- close-writer [event lifecycle] @@ -52,7 +58,7 @@ {:lifecycle/before-task-start inject-into-eventmap :lifecycle/after-task-stop close-writer}) -(defrecord ClojaskJoin [] +(defrecord ClojaskJoin [a-index b-index join-index] p/Plugin (start [this event] ;; Initialize the plugin, generally by assoc'ing any initial state. @@ -94,7 +100,7 @@ ;; before write-batch is called repeatedly. true) - (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/a-keys clojask/b-keys clojask/a-roll clojask/b-roll clojask/a-map clojask/b-map clojask/a-format clojask/b-format clojask/a-index clojask/b-index clojask/join-type]} replica messenger] + (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/a-keys clojask/b-keys clojask/a-roll clojask/b-roll clojask/a-map clojask/b-map clojask/a-format]} replica messenger] ;; keys [:Departement] ;; Write the batch to your datasink. ;; In this case we are conjoining elements onto a collection. @@ -107,58 +113,9 @@ ;(.write wtr (str msg "\n")) ;; !! define argument (debug) ;; (def groupby-keys [:Department :EmployeeName]) - (join/output-join wtr (:d msg) a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index))) + (join/output-join wtr (:d msg) a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index join-index))) (recur (rest batch))))) - ;; (case join-type - ;; 1 (loop [batch write-batch] - ;; (if-let [msg (first batch)] - ;; (do - ;; ;; (swap! example-datasink conj msg) - ;; (if (not= (:d msg) nil) - ;; (do - ;; ;(.write wtr (str msg "\n")) - ;; ;; !! define argument (debug) - ;; ;; (def groupby-keys [:Department :EmployeeName]) - ;; (join/output-join wtr (:d msg) a-keys a-map b-keys a-format b-format a-index b-index))) - - ;; (recur (rest batch))))) - ;; 2 (loop [batch write-batch] - ;; (if-let [msg (first batch)] - ;; (do - ;; ;; (swap! example-datasink conj msg) - ;; (if (not= (:d msg) nil) - ;; (do - ;; ;(.write wtr (str msg "\n")) - ;; ;; !! define argument (debug) - ;; ;; (def groupby-keys [:Department :EmployeeName]) - ;; (join/output-join-loo wtr (:d msg) a-keys a-map b-keys (count b-map) a-format b-format a-index b-index))) - - ;; (recur (rest batch))))) - ;; 4 (loop [batch write-batch] - ;; (if-let [msg (first batch)] - ;; (do - ;; ;; (swap! example-datasink conj msg) - ;; (if (not= (:d msg) nil) - ;; (do - ;; ;(.write wtr (str msg "\n")) - ;; ;; !! define argument (debug) - ;; ;; (def groupby-keys [:Department :EmployeeName]) - ;; (join/output-join-forward wtr (:d msg) a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index))) - ;; (recur (rest batch))))) - - ;; 5 (loop [batch write-batch] - ;; (if-let [msg (first batch)] - ;; (do - ;; ;; (swap! example-datasink conj msg) - ;; (if (not= (:d msg) nil) - ;; (do - ;; ;(.write wtr (str msg "\n")) - ;; ;; !! define argument (debug) - ;; ;; (def groupby-keys [:Department :EmployeeName]) - ;; (join/output-join-backward wtr (:d msg) a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index))) - ;; (recur (rest batch)))))) - ;; (.close wtr) true)) ;; Builder function for your output plugin. @@ -167,4 +124,4 @@ ;; from your task-map here, in order to improve the performance of your plugin ;; Extending the function below is likely good for most use cases. (defn join [pipeline-data] - (->ClojaskJoin)) \ No newline at end of file + (->ClojaskJoin (deref a-index) (deref b-index) (deref join-index))) \ No newline at end of file diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 88abdd5..1b70b47 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -283,7 +283,6 @@ [this ^int num-worker ^String output-dir ^boolean exception select] (.computeTypeCheck this num-worker output-dir) ;(.printAggreCol this output-dir) ;; print column names to output-dir - (.printCol this output-dir) (let [aggre-keys (.getAggreFunc row-info) select (if (coll? select) select [select]) select (if (= select [nil]) @@ -297,6 +296,7 @@ (.indexOf index num))]) aggre-func (mapv shift-func aggre-func) ;; test (println [select index aggre-func]) + tmp (.printCol this output-dir) ;; todo: based on "select" res (start-onyx-aggre-only num-worker batch-size this output-dir exception aggre-func index select)] (if (= res "success") "success" @@ -319,7 +319,7 @@ ;; test (println [groupby-keys aggre-keys select pre-index data-index]) res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)] ;(.printAggreCol this output-dir) ;; print column names to output-dir - (.printCol this output-dir) + (.printCol this output-dir) ;; todo: based on "select" (if (= res "success") ;; (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception)) (let [shift-func (fn [pair] @@ -513,7 +513,7 @@ (checkOutputPath [output-path] "check if output path is of string type") (getColNames [] "get the names of all the columns") (printCol [output-path] "print column names to output file") - (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude])) + (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select])) (defrecord JoinedDataFrame [^DataFrame a @@ -549,14 +549,24 @@ (let [select (if (coll? select) select [select]) select (if (= select [nil]) (vec (take (+ (count (.getKeyIndex (.col-info a))) (count (.getKeyIndex (.col-info b)))) (iterate inc 0))) - (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) + (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) + a-index (vec (apply sorted-set (remove (fn [num] (>= num (count (.getKeyIndex (.col-info a))))) select))) + ;; a-write + b-index (mapv #(- % (count (.getKeyIndex (.col-info a)))) (apply sorted-set (remove (fn [num] (< num (count (.getKeyIndex (.col-info a))))) select))) + b-index (if b-roll (vec (apply sorted-set (conj b-index b-roll))) b-index) + b-roll (if b-roll (count (remove #(>= % b-roll) b-index)) nil) + ;; b-write + ;; a-format + b-format (set/rename-keys (.getFormatter (.col-info b)) (zipmap b-index (iterate inc 0))) + write-index (mapv (fn [num] (count (remove #(>= % num) (concat a-index (mapv #(+ % (count (.getKeyIndex (.col-info a)))) b-index))))) select) + ;; test (println a-index b-index b-format write-index b-roll) ] (u/init-file output-dir) ;; print column names ;; (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done - (.printCol this output-dir) - (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception []) ;; todo - (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit)))) + (.printCol this output-dir) ;; todo: based on "select" + (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys b-index exception) ;; todo + (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index)))) (defn inner-join [a b a-keys b-keys & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}] @@ -657,20 +667,19 @@ (assert (or (nil? select) (nil? exclude)) "can only specify either of them") (u/init-file output-dir) ;; check which type of dataframe this is - (if (= (type this) clojask.dataframe.DataFrame) - (let [exclude (if (coll? exclude) exclude [exclude]) - select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))] - (assert (not= select []) "must select at least on column") + (let [exclude (if (coll? exclude) exclude [exclude]) + select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))] + (assert (not= select []) "must select at least 1 column") + (if (= (type this) clojask.dataframe.DataFrame) (if (= (.getAggreFunc (:row-info this)) []) (.compute this num-worker output-dir exception order select) (if (not= (.getGroupbyKeys (:row-info this)) []) (.computeGroupAggre this num-worker output-dir exception select) - (.computeAggre this num-worker output-dir exception select)))) - (if (= (type this) clojask.dataframe.JoinedDataFrame) - (.compute this num-worker output-dir exception order select) - (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe")) - ))) - + (.computeAggre this num-worker output-dir exception select))) + (if (= (type this) clojask.dataframe.JoinedDataFrame) + (.compute this num-worker output-dir exception order select) + (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe"))))) +) (defn get-col-names "Get the names for the columns in sequence" [this] diff --git a/src/main/clojure/clojask/debug.clj b/src/main/clojure/clojask/debug.clj index c00a19f..5f54231 100644 --- a/src/main/clojure/clojask/debug.clj +++ b/src/main/clojure/clojask/debug.clj @@ -1,6 +1,6 @@ (ns clojask.debug (:require [clojask.dataframe :refer :all] - ;; [clojask.utils :refer :all] + [clojask.utils :as u] [clojask.groupby :refer :all] [clojask.sort :as sort] [clojask.api.aggregate :as aggre] diff --git a/src/main/clojure/clojask/groupby.clj b/src/main/clojure/clojask/groupby.clj index 4903178..8d4f9ba 100644 --- a/src/main/clojure/clojask/groupby.clj +++ b/src/main/clojure/clojask/groupby.clj @@ -1,7 +1,8 @@ (ns clojask.groupby (:require [clojure.java.io :as io] [clojure-csv.core :as csv] - [clojure.core.async :as async])) + [clojure.core.async :as async] + [clojask.utils :as u])) "contains the utility functions to group by and aggregate" (defn compute-groupby @@ -48,7 +49,7 @@ (defn output-groupby "internal function called by output when aggregation is applied" - [dist msg groupby-keys key-index formatter] + [dist msg groupby-keys key-index formatter write-index] ;; msg this time is a vector ;; key-index contains the one to one correspondence of key value to index value, it is a map @@ -57,7 +58,7 @@ (let [output-filename (gen-groupby-filenames dist msg groupby-keys key-index formatter) ;; generate output filename groupby-wrtr (io/writer output-filename :append true)] ;; write as maps e.g. {:name "Tim", :salary 62, :tax 0.1, :bonus 12} - (.write groupby-wrtr (str msg "\n")) + (.write groupby-wrtr (str (u/gets msg write-index) "\n")) ;; write as csv format e.g. Tim,62,0.1,12 ;(.write groupby-wrtr (str (clojure.string/join "," (map msg (keys msg))) "\n")) diff --git a/src/main/clojure/clojask/join.clj b/src/main/clojure/clojask/join.clj index 68b7125..cd25a9c 100644 --- a/src/main/clojure/clojask/join.clj +++ b/src/main/clojure/clojask/join.clj @@ -4,17 +4,10 @@ [clojure.core.async :as async] ;; [clojask.onyx-comps :refer [start-onyx-groupby start-onyx-join]] [clojask.groupby :refer [read-csv-seq gen-groupby-filenames]] - [clojure.string :as str])) + [clojure.string :as str] + [clojask.utils :as u])) -(defn- group-inner-join - [a b c] - ;; a is readers to the file - ;; b is the filename - (with-open [wtr (io/writer c :append true)] - (doseq [a-row (read-csv-seq a)] - (doseq [b-row (read-csv-seq b)] - (.write wtr (str (vec (concat a-row b-row)) "\n")))))) (defn gen-join-filenames [dist a-row a-keys] @@ -26,7 +19,7 @@ (str dist a-val))) (defn output-join-inner - [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index] + [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index] (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)] ;; (println writer) ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true) @@ -46,12 +39,13 @@ (if-let [format (get b-format index)] (format (nth b-row index)) (nth b-row index)))] - (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n")))) + ;; (println [(vec a-row) (vec b-row) a-index b-index join-index]) + (.write writer (str (str/join "," (vec (u/gets (concat a-row b-row) join-index))) "\n")))) (.close filename))))) (defn output-join-loo "used for left join right join or outter join" - [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index] + [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index] (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)] ;; (println writer) ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true) @@ -68,112 +62,13 @@ (if-let [format (get b-format index)] (format (nth b-row index)) (nth b-row index)))] - (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n")))) + (.write writer (str (str/join "," (vec (u/gets (concat a-row b-row) join-index))) "\n")))) (.close filename)) (let [a-row (for [index a-index] (if-let [format (get a-format index)] (format (nth a-row index)) (nth a-row index)))] - (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))))) - -;; (defn roll-join-get-line-forward -;; "get the max of all the smaller" -;; [bench filename index] -;; (def memo (volatile! nil)) -;; (def res (volatile! nil)) -;; (doseq [row (read-csv-seq filename)] -;; (let [val (nth row index)] -;; ;; (println [bench filename index row val]) -;; ;; | does here need to be =? -;; (if (and (<= (compare val bench) 0) (or (= @memo nil) (> (compare val @memo) 0))) -;; (do (vreset! memo val) -;; (vreset! res row))))) -;; @res) - -;; (defn roll-join-get-line-backward -;; "get the min of all the greater" -;; [bench filename index] -;; (def memo (volatile! nil)) -;; (def res (volatile! nil)) -;; (doseq [row (read-csv-seq filename)] -;; (let [val (nth row index)] -;; ;; | does here need to be =? -;; (if (and (>= (compare val bench) 0) (or (= @memo nil) (< (compare val @memo) 0))) -;; (do (vreset! memo val) -;; (vreset! res row))))) -;; @res) - -;; (doseq [file (rest (file-seq (clojure.java.io/file "./_clojask/grouped/")))] -;; (io/delete-file file)) - -;; (defn internal-rolling-join-forward -;; [a b a-dir b-dir a-keys b-keys a-roll b-roll] -;; ;; (let [a-reader (io/reader (:path a))] -;; ;; ()) -;; (s)) - - -;; (defn output-join-forward -;; "[writer a-row a-keys a-map b-keys count a-format b-format a-index b-index] " -;; [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index] -;; (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)] -;; ;; (println writer) -;; ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true) -;; (if (.exists (io/file filename)) -;; ;; (spit "_clojask/join/test.txt" (str (vec (read-csv-seq filename)) "\n") :append true) -;; (let [filename (io/reader filename)] -;; (if-let [b-row (roll-join-get-line-forward (nth a-row a-roll) filename b-roll)] ;; bench is a string -;; (let [a-row (for [index a-index] -;; (if-let [format (get a-format index)] -;; (format (nth a-row index)) -;; (nth a-row index))) -;; b-row (for [index b-index] -;; (if-let [format (get b-format index)] -;; (format (nth b-row index)) -;; (nth b-row index)))] -;; (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n"))) -;; (let [a-row (for [index a-index] -;; (if-let [format (get a-format index)] -;; (format (nth a-row index)) -;; (nth a-row index)))] -;; (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))) -;; (.close filename)) -;; (let [a-row (for [index a-index] -;; (if-let [format (get a-format index)] -;; (format (nth a-row index)) -;; (nth a-row index)))] -;; (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))))) - -;; (defn output-join-backward -;; "" -;; [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index] -;; (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)] -;; ;; (println writer) -;; ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true) -;; (if (.exists (io/file filename)) -;; ;; (spit "_clojask/join/test.txt" (str (vec (read-csv-seq filename)) "\n") :append true) -;; (let [filename (io/reader filename)] -;; (if-let [b-row (roll-join-get-line-backward (nth a-row a-roll) filename b-roll)] ;; bench is a string -;; (let [a-row (for [index a-index] -;; (if-let [format (get a-format index)] -;; (format (nth a-row index)) -;; (nth a-row index))) -;; b-row (for [index b-index] -;; (if-let [format (get b-format index)] -;; (format (nth b-row index)) -;; (nth b-row index)))] -;; (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n"))) -;; (let [a-row (for [index a-index] -;; (if-let [format (get a-format index)] -;; (format (nth a-row index)) -;; (nth a-row index)))] -;; (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))) -;; (.close filename)) -;; (let [a-row (for [index a-index] -;; (if-let [format (get a-format index)] -;; (format (nth a-row index)) -;; (nth a-row index)))] -;; (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))))) + (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n")))))) (defn defn-join [type limit] @@ -191,7 +86,7 @@ (do (vreset! memo val) (vreset! res row))))) @res)] - (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index] + (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index] (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)] (if (.exists (io/file filename)) (let [filename (io/reader filename)] @@ -204,18 +99,18 @@ (if-let [format (get b-format index)] (format (nth b-row index)) (nth b-row index)))] - (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n"))) + (.write writer (str (str/join "," (vec (u/gets (concat a-row b-row) join-index))) "\n"))) (let [a-row (for [index a-index] (if-let [format (get a-format index)] (format (nth a-row index)) (nth a-row index)))] - (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))) + (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n")))) (.close filename)) (let [a-row (for [index a-index] (if-let [format (get a-format index)] (format (nth a-row index)) (nth a-row index)))] - (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))))) + (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n"))))))) ;; 5 output-join-backward 5 (let [roll-join-get-line-backward (fn [bench filename index] (def memo (volatile! nil)) @@ -228,7 +123,7 @@ (vreset! res row))))) @res)] (fn - [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index] + [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index] (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)] ;; (println writer) ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true) @@ -244,16 +139,16 @@ (if-let [format (get b-format index)] (format (nth b-row index)) (nth b-row index)))] - (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n"))) + (.write writer (str (str/join "," (vec (u/gets (concat a-row b-row) join-index))) "\n"))) (let [a-row (for [index a-index] (if-let [format (get a-format index)] (format (nth a-row index)) (nth a-row index)))] - (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))) + (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n")))) (.close filename)) (let [a-row (for [index a-index] (if-let [format (get a-format index)] (format (nth a-row index)) (nth a-row index)))] - (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))))) + (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n"))))))) nil))) \ No newline at end of file diff --git a/src/main/clojure/clojask/onyx_comps.clj b/src/main/clojure/clojask/onyx_comps.clj index dd5236c..48f5790 100644 --- a/src/main/clojure/clojask/onyx_comps.clj +++ b/src/main/clojure/clojask/onyx_comps.clj @@ -539,12 +539,12 @@ (try (workflow-gen num-work) (config-env) - (worker-func-gen dataframe exception groupby-index) ;;need some work + (worker-func-gen dataframe exception (vec (take (count (.getKeyIndex (.col-info dataframe))) (iterate inc 0)))) ;;need some work (catalog-groupby-gen num-work batch-size) (lifecycle-groupby-gen (.path dataframe) dist groupby-keys (.getKeyIndex (.col-info dataframe))) (flow-cond-gen num-work) (input/inject-dataframe dataframe) - (groupby/inject-dataframe dataframe groupby-keys) + (groupby/inject-dataframe dataframe groupby-keys groupby-index) (catch Exception e (throw (Exception. (str "[preparing stage (group by)] " (.getMessage e)))))) (try (let [submission (onyx.api/submit-job peer-config @@ -567,7 +567,7 @@ (defn start-onyx-join "start the onyx cluster with the specification inside dataframe" - [num-work batch-size dataframe b dist exception a-keys b-keys a-roll b-roll join-type & [limit]] + [num-work batch-size dataframe b dist exception a-keys b-keys a-roll b-roll join-type limit a-index b-index b-format write-index] ;; dataframe means a (try (workflow-gen num-work) @@ -577,7 +577,7 @@ (lifecycle-join-gen (.path dataframe) dist dataframe b a-keys b-keys a-roll b-roll join-type) (flow-cond-gen num-work) (input/inject-dataframe dataframe) - (join/inject-dataframe dataframe b a-keys b-keys) + (join/inject-dataframe dataframe b a-keys b-keys a-index b-index write-index b-format) (let [limit (or limit (fn [a b] true))] (defn-join join-type limit)) (catch Exception e (throw (Exception. (str "[preparing stage (join)] " (.getMessage e)))))) diff --git a/src/main/clojure/clojask/utils.clj b/src/main/clojure/clojask/utils.clj index cd3852b..29fa6d0 100644 --- a/src/main/clojure/clojask/utils.clj +++ b/src/main/clojure/clojask/utils.clj @@ -86,8 +86,6 @@ (if (= com nil) true (do - ;; (println row) - ;; (println (nth com 1)) (if (apply (first com) (get-val row types (nth com 1))) (recur rem) false))))))) @@ -111,28 +109,82 @@ (def fromString (atom (fn [_] (str _)))) -(def toDate - (atom (fn [string] - (try - (LocalDate/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd")) - (catch Exception e (throw e)))))) +;; (def toDate +;; (atom (fn [string] +;; (try +;; (LocalDate/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd")) +;; (catch Exception e (throw e)))))) -(def fromDate - (atom (fn [date] - (if (= (type date) java.time.LocalDate) - (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd")) - date)))) +;; (def fromDate +;; (atom (fn [date] +;; (if (= (type date) java.time.LocalDate) +;; (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd")) +;; date)))) + +;; (def toDateTime +;; (atom (fn [string] +;; (try +;; (LocalDateTime/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss")) +;; (catch Exception e (throw e)))))) + +;; (def fromDateTime +;; (atom (fn [date] +;; (if (= (type date) java.time.LocalDateTime) +;; (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss")) +;; date)))) + +;; (defn set-format-string +;; [string] +;; (if (or (str/starts-with? string "date:") (str/starts-with? string "datetime:")) +;; (let [format-string (subs string (inc (str/index-of string ":")))] +;; (reset! toDate +;; (fn [string] +;; (try +;; (LocalDate/parse string (DateTimeFormatter/ofPattern format-string)) +;; (catch Exception e (throw e))))) + +;; (reset! fromDate +;; (fn [date] +;; (if (= (type date) java.time.LocalDate) +;; (.format date (DateTimeFormatter/ofPattern format-string)) +;; date))) + +;; (reset! toDateTime +;; (fn [string] +;; (try +;; (LocalDateTime/parse string (DateTimeFormatter/ofPattern format-string)) +;; (catch Exception e (throw e))))) -(def toDateTime +;; (reset! fromDateTime +;; (fn [date] +;; (if (= (type date) java.time.LocalDateTime) +;; (.format date (DateTimeFormatter/ofPattern format-string)) +;; date)))) +;; )) + +;; ;; (def operation-type-map +;; ;; {toInt "int" +;; ;; toDouble "double" +;; ;; toString "string" +;; ;; toDate "date"}) + +;; (def type-operation-map +;; {"int" [toInt fromString] +;; "double" [toDouble fromString] +;; "string" [toString fromString] +;; "date" [toDate fromDate] +;; "datetime" [toDateTime fromDateTime]}) + +(def toDate (atom (fn [string] (try - (LocalDateTime/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss")) + (.parse (java.text.SimpleDateFormat. "yyyy-MM-dd") string) (catch Exception e (throw e)))))) -(def fromDateTime +(def fromDate (atom (fn [date] - (if (= (type date) java.time.LocalDateTime) - (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss")) + (if (= (type date) java.util.Date) + (.format (java.text.SimpleDateFormat. "yyyy-MM-dd") date) date)))) (defn set-format-string @@ -142,52 +194,33 @@ (reset! toDate (fn [string] (try - (LocalDate/parse string (DateTimeFormatter/ofPattern format-string)) + (.parse (java.text.SimpleDateFormat. format-string) string) (catch Exception e (throw e))))) (reset! fromDate (fn [date] - (if (= (type date) java.time.LocalDate) - (.format date (DateTimeFormatter/ofPattern format-string)) - date))) - - (reset! toDateTime + (if (= (type date) java.util.Date) + (.format (java.text.SimpleDateFormat. format-string) date) + date)))) + (do + (reset! toDate (fn [string] (try - (LocalDateTime/parse string (DateTimeFormatter/ofPattern format-string)) + (.parse (java.text.SimpleDateFormat. "yyyy-MM-dd") string) (catch Exception e (throw e))))) - (reset! fromDateTime + (reset! fromDate (fn [date] - (if (= (type date) java.time.LocalDateTime) - (.format date (DateTimeFormatter/ofPattern format-string)) - date)))) - ;; (do - ;; (reset! toDate - ;; (fn [string] - ;; (try - ;; (.parse (java.text.SimpleDateFormat. "yyyy-MM-dd") string) - ;; (catch Exception e (throw e))))) - - ;; (reset! fromDate - ;; (fn [date] - ;; (if (= (type date) java.util.Date) - ;; (.format (java.text.SimpleDateFormat. "yyyy-MM-dd") date) - ;; date)))) - )) - -;; (def operation-type-map -;; {toInt "int" -;; toDouble "double" -;; toString "string" -;; toDate "date"}) + (if (= (type date) java.util.Date) + (.format (java.text.SimpleDateFormat. "yyyy-MM-dd") date) + date)))))) (def type-operation-map {"int" [toInt fromString] "double" [toDouble fromString] "string" [toString fromString] "date" [toDate fromDate] - "datetime" [toDateTime fromDateTime]}) + "datetime" [toDate fromDate]}) (defn type-detection [file] From 108b8427c8f783799777d5b327e3ab57e518af60 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Fri, 24 Dec 2021 14:44:52 +0800 Subject: [PATCH 15/33] Amend test file for Join APIs --- test/clojask/core_test.clj | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj index c4b1cef..2c3e5cb 100644 --- a/test/clojask/core_test.clj +++ b/test/clojask/core_test.clj @@ -68,8 +68,8 @@ (def y (dataframe "test/clojask/Employees-example.csv")) (is (= "success" (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) ;; (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) - ;; (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) - ;; (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"]) "Salary" "Salary" 8 "resources/test.csv" :exception false))) + (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) + (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false))) )) (deftest join-api-output-test @@ -82,14 +82,14 @@ ;; (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false) ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")] ;; (is (= "" (:out result)))) - ;; (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false) - ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")] - ;; (is (= "" (:out result)))) - ;; (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false) - ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")] - ;; (is (= "" (:out result)))) - ;; (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false) - ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")] - ;; (is (= "" (:out result)))) + (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false) + (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")] + (is (= "" (:out result)))) + (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false) + (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")] + (is (= "" (:out result)))) + (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false) + (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")] + (is (= "" (:out result)))) )) From 1db584a7d972fd4c9985108b03a5d300a61ba92d Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Fri, 24 Dec 2021 15:13:07 +0800 Subject: [PATCH 16/33] printCol to take selected-col as argument --- src/main/clojure/clojask/dataframe.clj | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 1b70b47..5c16774 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -30,7 +30,7 @@ (colTypes [] "get column type in ColInfo") (getColIndex [] "get column indices, excluding deleted columns") (getColNames [] "get column names") - (printCol [output-path] "print column names to output file") + (printCol [output-path selected-col] "print column names to output file") (printAggreCol [output-path] "print column names to output file for aggregate") (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join") (delCol [col-to-del] "delete one or more columns in the dataframe") @@ -148,10 +148,10 @@ (concat groupby-keys aggre-new-keys)))) (printCol - ;; print column names, called by compute - [this output-path] + ;; print column names, called by compute and computeAggre + [this output-path selected-col] (.checkOutputPath this output-path) - (let [col-set (.getColNames this)] + (let [col-set (if (= selected-col [nil]) (.getColNames this) selected-col)] (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," col-set) "\n"))))) @@ -271,7 +271,7 @@ (if (<= num-worker 8) (try (.final this) - (.printCol this output-dir) ;; to-do: based on the index => Done + (.printCol this output-dir select) ;; to-do: based on the index => Done (let [res (start-onyx num-worker batch-size this output-dir exception order index)] (if (= res "success") "success" @@ -296,7 +296,7 @@ (.indexOf index num))]) aggre-func (mapv shift-func aggre-func) ;; test (println [select index aggre-func]) - tmp (.printCol this output-dir) ;; todo: based on "select" + tmp (.printCol this output-dir select) ;; todo: based on "select" res (start-onyx-aggre-only num-worker batch-size this output-dir exception aggre-func index select)] (if (= res "success") "success" @@ -319,7 +319,7 @@ ;; test (println [groupby-keys aggre-keys select pre-index data-index]) res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)] ;(.printAggreCol this output-dir) ;; print column names to output-dir - (.printCol this output-dir) ;; todo: based on "select" + (.printCol this output-dir select) ;; todo: based on "select" (if (= res "success") ;; (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception)) (let [shift-func (fn [pair] @@ -512,7 +512,7 @@ (definterface JDFIntf (checkOutputPath [output-path] "check if output path is of string type") (getColNames [] "get the names of all the columns") - (printCol [output-path] "print column names to output file") + (printCol [output-path selected-col] "print column names to output file") (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select])) (defrecord JoinedDataFrame @@ -539,8 +539,8 @@ (printCol ;; print column names, called by compute - [this output-path] - (let [col-set (.getColNames this)] + [this output-path selected-col] + (let [col-set (if (= selected-col [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-col))] (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," col-set) "\n"))))) @@ -564,7 +564,7 @@ (u/init-file output-dir) ;; print column names ;; (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done - (.printCol this output-dir) ;; todo: based on "select" + (.printCol this output-dir select) ;; todo: based on "select" (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys b-index exception) ;; todo (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index)))) From 6f04e1dce2485576b29c797235eae8e75e29acf6 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Sun, 26 Dec 2021 15:14:10 +0800 Subject: [PATCH 17/33] Update test file --- src/main/clojure/clojask/dataframe.clj | 7 +++---- test/clojask/core_test.clj | 16 ++++++++++++---- test/clojask/correct_outputs/1-9.csv | 8 ++++++++ 3 files changed, 23 insertions(+), 8 deletions(-) create mode 100644 test/clojask/correct_outputs/1-9.csv diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 5c16774..c82879d 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -539,8 +539,8 @@ (printCol ;; print column names, called by compute - [this output-path selected-col] - (let [col-set (if (= selected-col [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-col))] + [this output-path selected-index] + (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))] (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," col-set) "\n"))))) @@ -563,8 +563,7 @@ ] (u/init-file output-dir) ;; print column names - ;; (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done - (.printCol this output-dir select) ;; todo: based on "select" + (.printCol this output-dir select) ;; todo: based on "select" => Done (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys b-index exception) ;; todo (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index)))) diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj index 2c3e5cb..c93d012 100644 --- a/test/clojask/core_test.clj +++ b/test/clojask/core_test.clj @@ -56,10 +56,18 @@ (is (= (col-names y) ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"])) (rename-col y ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"]) (is (= (col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"])) - (select-col y ["Employee" "new-Department" "EmployeeName"]) - (is (= (col-names y) ["Employee" "new-Department" "EmployeeName"])) - (delete-col y ["new-Department"]) - (is (= (col-names y) ["Employee" "EmployeeName"])) + ;; (select-col y ["Employee" "new-Department" "EmployeeName"]) + ;; (is (= (col-names y) ["Employee" "new-Department" "EmployeeName"])) + ;; (delete-col y ["new-Department"]) + ;; (is (= (col-names y) ["Employee" "EmployeeName"])) + )) + +(deftest col-select-output-test + (testing "Select column(s) argument" + (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) + (compute y 8 "test/clojask/test_outputs/1-9.csv" :select ["Employee", "EmployeeName"] :exception false) + (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-9.csv)" "<(sort test/clojask/correct_outputs/1-9.csv)")] + (is (= "" (:out result)))) )) (deftest join-api-test diff --git a/test/clojask/correct_outputs/1-9.csv b/test/clojask/correct_outputs/1-9.csv new file mode 100644 index 0000000..e3ba23c --- /dev/null +++ b/test/clojask/correct_outputs/1-9.csv @@ -0,0 +1,8 @@ +Employee,EmployeeName +1,Alice +2,Bob +3,Carla +4,Daniel +5,Evelyn +6,Ferdinand +7,Amy From a43b59b08c91690983738eb7d9211773db4bdd41 Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Tue, 28 Dec 2021 15:23:44 +0800 Subject: [PATCH 18/33] preview for joineddataframe --- src/main/clojure/clojask/dataframe.clj | 38 ++++++++++++++++---------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index c82879d..0962eee 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -392,14 +392,6 @@ [dataframe sample-size return-size & {:keys [format] :or {format false}}] (.preview dataframe sample-size return-size format)) -(defn print-df - [dataframe & [sample-size return-size]] - (let [data (.preview dataframe (or sample-size 1000) (or return-size 10) false) - tmp (first data) - types (zipmap (keys tmp) (map u/get-type-string (vals tmp))) - data (conj (apply list data) types)] - (pprint/print-table data))) - (defn generate-col "Generate column names if there are none" [col-count] @@ -513,6 +505,7 @@ (checkOutputPath [output-path] "check if output path is of string type") (getColNames [] "get the names of all the columns") (printCol [output-path selected-col] "print column names to output file") + (preview [] "preview the column names") (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select])) (defrecord JoinedDataFrame @@ -539,17 +532,22 @@ (printCol ;; print column names, called by compute - [this output-path selected-index] - (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))] - (with-open [wrtr (io/writer output-path)] - (.write wrtr (str (str/join "," col-set) "\n"))))) - + [this output-path selected-index] + (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))] + (with-open [wrtr (io/writer output-path)] + (.write wrtr (str (str/join "," col-set) "\n"))))) + + (preview + [this] + (.getColNames this) + ) + (compute [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select] (let [select (if (coll? select) select [select]) select (if (= select [nil]) (vec (take (+ (count (.getKeyIndex (.col-info a))) (count (.getKeyIndex (.col-info b)))) (iterate inc 0))) - (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) + (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) a-index (vec (apply sorted-set (remove (fn [num] (>= num (count (.getKeyIndex (.col-info a))))) select))) ;; a-write b-index (mapv #(- % (count (.getKeyIndex (.col-info a)))) (apply sorted-set (remove (fn [num] (< num (count (.getKeyIndex (.col-info a))))) select))) @@ -685,3 +683,15 @@ ;; to-do: should implement both for the DataFrame and JoinedDataFrame => Done (.getColNames this) ) + +(defn print-df + [dataframe & [sample-size return-size]] + (if (= (type dataframe) DataFrame) + (let [data (.preview dataframe (or sample-size 1000) (or return-size 10) false) + tmp (first data) + types (zipmap (keys tmp) (map u/get-type-string (vals tmp))) + data (conj (apply list data) types)] + (pprint/print-table data)) + (do + (println (str (str/join "," (.preview dataframe)))) + (println "The content of joined dataframe is not available.")))) From 6ad9907e8902e865e29b9877c8663cb987d517b2 Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Tue, 28 Dec 2021 18:00:04 +0800 Subject: [PATCH 19/33] doc select and join change --- doc/documentation.md | 135 +++++++++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 55 deletions(-) diff --git a/doc/documentation.md b/doc/documentation.md index 1484ff6..1aa4fe6 100644 --- a/doc/documentation.md +++ b/doc/documentation.md @@ -207,46 +207,8 @@ You can also group by the combination of keys. (Use the above two rules together ;; get the min of the two columns grouped by ... ``` - - -- sort - - **Immediately** sort the dataframe - - | Argument | Type | Function | Remarks | - | ------------------ | ----------------------- | ------------------------ | ------------------------------------------------------------ | - | `dataframe` | Clojask.DataFrame | The operated object | | - | `trending list` | Collection (seq vector) | Indicates the sort order | Example: ["Salary" "+" "Employee" "-"] means that sort the Salary in ascending order, if equal sort the Employee in descending order | - | `output-directory` | String | The output path | | - - **Example** - - ```clojure - (sort y ["+" "Salary"] "resources/sort.csv") - ;; sort by Salary ascendingly - ``` - - -- compute - Compute the result. The pre-defined lazy operations will be executed in pipeline, ie the result of the previous operation becomes the argument of the next operation. - - | Argument | Type | Function | Remarks | - | ---------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | - | `dataframe` | Clojask.DataFrame | The operated object | | - | `num of workers` | int (max 8) | The number of worker instances (except the input and output nodes) | If this argument >= 2, will use [onyx](http://www.onyxplatform.org/) as the distributed platform | - | `output path` | String | The path of the output csv file | Could exist or not. | - | [`exception`] | boolean | Whether an exception during calculation will cause termination | Is useful for debugging or detecting empty fields | - - **Example** - - ```clojure - (compute x 8 "../resources/test.csv" :exception true) - ;; computes all the pre-registered operations - ``` - - - inner-join / left-join / right-join @@ -258,40 +220,52 @@ You can also group by the combination of keys. (Use the above two rules together *Will automatically pipeline the registered operations and filters like `compute`. You could think of join as first compute the two dataframes then join.* - | Argument | Type | Function | Remarks | - | ------------------- | ------------------- | ------------------------------------------------------------ | ------------------------------------------------- | - | `dataframe a` | Clojask.DataFrame | The operated object | | - | `dataframe b` | Clojask.DataFrame | The operated object | | - | `a join keys` | String / Collection | The keys of a to be aligned | Find the specification [here](#groupby-keys) | - | `b join keys` | String / Collection | The keys of b to be aligned | Find the specification [here](#groupby-keys) | - | `number of workers` | int (max 8) | Number of worker nodes doing the joining | | - | `distination file` | string | The file path to the distination | Will be emptied first | - | [`exception`] | boolean | Whether an exception during calculation will cause termination | Is useful for debugging or detecting empty fields | + | Argument | Type | Function | Remarks | + | ------------- | ------------------- | --------------------------- | -------------------------------------------- | + | `dataframe a` | Clojask.DataFrame | The operated object | | + | `dataframe b` | Clojask.DataFrame | The operated object | | + | `a join keys` | String / Collection | The keys of a to be aligned | Find the specification [here](#groupby-keys) | + | `b join keys` | String / Collection | The keys of b to be aligned | Find the specification [here](#groupby-keys) | - **Example** +**Return** + +A Clojask.JoinedDataFrame + +- Unlike Clojask.DataFrame, it only supports three operations: + - `print-df` + - `get-col-names` + - `compute` +- This means you cannot further apply complicated operations to a joined dataframe. An alternative is to first compute the result, then read it in as a new dataframe. + +**Example** ```clojure (def x (dataframe "path/to/a")) (def y (dataframe "path/to/b")) - (inner-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"] 8 "path/to/distination" :exception true) + (def z (inner-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"])) + (compute z 8 "path/to/output") ;; inner join x and y - (left-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"] 8 "path/to/distination" :exception true) + (def z (left-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"])) + (compute z 8 "path/to/output") ;; left join x and y - (right-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"] 8 "path/to/distination" :exception true) + (def z (right-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"])) + (compute z 8 "path/to/output") ;; right join x and y ``` + + - reorderCol / renameCol Reorder the columns / rename the column names in the dataframe - | Argument | Type | Function | Remarks | - | ------------------- | ------------------ | ------------------------------------------------------------ | ------------------------------------------------- | - | `dataframe a` | Clojask.DataFrame | The operated object | | - | `a columns` | Clojure.collection | The new set of column names | Should be existing headers in dataframe a if it is `reorderCol` | + | Argument | Type | Function | Remarks | + | ------------- | ------------------ | --------------------------- | ------------------------------------------------------------ | + | `dataframe a` | Clojask.DataFrame | The operated object | | + | `a columns` | Clojure.collection | The new set of column names | Should be existing headers in dataframe a if it is `reorderCol` | **Example** @@ -301,3 +275,54 @@ You can also group by the combination of keys. (Use the above two rules together (.renameCol y ["Employee" "new-Department" "EmployeeName" "Salary"]) ``` + + + +- sort + + **Immediately** sort the dataframe + + | Argument | Type | Function | Remarks | + | ------------------ | ----------------------- | ------------------------ | ------------------------------------------------------------ | + | `dataframe` | Clojask.DataFrame | The operated object | | + | `trending list` | Collection (seq vector) | Indicates the sort order | Example: ["Salary" "+" "Employee" "-"] means that sort the Salary in ascending order, if equal sort the Employee in descending order | + | `output-directory` | String | The output path | | + + **Example** + + ```clojure + (sort y ["+" "Salary"] "resources/sort.csv") + ;; sort by Salary ascendingly + ``` + + + +- compute + + Compute the result. The pre-defined lazy operations will be executed in pipeline, ie the result of the previous operation becomes the argument of the next operation. + + | Argument | Type | Function | Remarks | + | ---------------- | ------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | + | `dataframe` | Clojask.DataFrame | The operated object | | + | `num of workers` | int (max 8) | The number of worker instances (except the input and output nodes) | Use [onyx](http://www.onyxplatform.org/) as the distributed platform | + | `output path` | String | The path of the output csv file | Could exist or not. | + | [`exception`] | boolean | Whether an exception during calculation will cause termination | Is useful for debugging or detecting empty fields | + | [`select`] | String / Collection of strings | The name of the columns to select. Better to first refer to function `get-col-names` about all the names. (Similar to `SELECT` in sql ) | Can only specify either of select and exclude | + | [`exclude`] | String / Collection of strings | The name of the columns to exclude | Can only specify either of select and exclude | + + **Example** + + ```clojure + (compute x 8 "../resources/test.csv" :exception true) + ;; computes all the pre-registered operations + + (compute x 8 "../resources/test.csv" :select "col a") + ;; only select column a + + (compute x 8 "../resources/test.csv" :select ["col b" "col a"]) + ;; select two columns, column b and column a in order + + (compute x 8 "../resources/test.csv" :exclude ["col b" "col a"]) + ;; select all columns except column b and column a, other columns are in order + ``` + From 078b89b490d68a61fde22ed04c01fdcbffe687d5 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Thu, 30 Dec 2021 14:57:52 +0800 Subject: [PATCH 20/33] Amend right-join API and test file --- src/main/clojure/clojask/dataframe.clj | 3 ++- test/clojask/core_test.clj | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 0962eee..6a41052 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -602,7 +602,8 @@ (JoinedDataFrame. a b a-keys b-keys nil nil 2 nil col-prefix))) (defn right-join - [a b a-keys b-keys num-worker dist & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}] + [a b a-keys b-keys & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}] + ;[a b a-keys b-keys num-worker dist & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}] (let [a-keys (u/proc-groupby-key a-keys) b-keys (u/proc-groupby-key b-keys) a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys) diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj index c93d012..7bfcbb0 100644 --- a/test/clojask/core_test.clj +++ b/test/clojask/core_test.clj @@ -75,7 +75,7 @@ (def x (dataframe "test/clojask/Employees-example.csv")) (def y (dataframe "test/clojask/Employees-example.csv")) (is (= "success" (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) - ;; (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) + (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false))) )) @@ -87,9 +87,9 @@ (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false) (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-4.csv)" "<(sort test/clojask/correct_outputs/1-4.csv)")] (is (= "" (:out result)))) - ;; (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false) - ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")] - ;; (is (= "" (:out result)))) + (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false) + (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")] + (is (= "" (:out result)))) (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false) (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")] (is (= "" (:out result)))) From 916667c40c61cf44429c3f3c643a0ccef19b453e Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Thu, 30 Dec 2021 21:43:33 +0800 Subject: [PATCH 21/33] Amend printCol bug in groupby-aggre --- src/main/clojure/clojask/dataframe.clj | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 6a41052..15f2db8 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -31,6 +31,7 @@ (getColIndex [] "get column indices, excluding deleted columns") (getColNames [] "get column names") (printCol [output-path selected-col] "print column names to output file") + (printColByIndex [output-path selected-index] "print column names to output file") (printAggreCol [output-path] "print column names to output file for aggregate") (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join") (delCol [col-to-del] "delete one or more columns in the dataframe") @@ -155,6 +156,13 @@ (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," col-set) "\n"))))) + (printColByIndex + ;; print column names, called by compute + [this output-path selected-index] + (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))] + (with-open [wrtr (io/writer output-path)] + (.write wrtr (str (str/join "," col-set) "\n"))))) + ;; !! deprecated (printAggreCol ;; print column names, called by computeAggre @@ -319,7 +327,7 @@ ;; test (println [groupby-keys aggre-keys select pre-index data-index]) res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)] ;(.printAggreCol this output-dir) ;; print column names to output-dir - (.printCol this output-dir select) ;; todo: based on "select" + (.printColByIndex this output-dir select) ;; todo: based on "select" (if (= res "success") ;; (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception)) (let [shift-func (fn [pair] From 79019c8743b53487f2740caeb3100d830db40352 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Thu, 30 Dec 2021 23:05:44 +0800 Subject: [PATCH 22/33] Amend test file to run zsh shell --- test/clojask/core_test.clj | 49 ++++++++++++++++------------ test/clojask/correct_outputs/1-4.csv | 6 ++-- test/clojask/correct_outputs/1-5.csv | 6 ++-- test/clojask/correct_outputs/1-6.csv | 10 +++--- test/clojask/correct_outputs/1-7.csv | 8 ++--- test/clojask/correct_outputs/1-8.csv | 8 ++--- 6 files changed, 48 insertions(+), 39 deletions(-) diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj index 7bfcbb0..609d8ca 100644 --- a/test/clojask/core_test.clj +++ b/test/clojask/core_test.clj @@ -29,24 +29,27 @@ (set-type y "Salary" "double") (operate y - "Salary") (compute y 8 "test/clojask/test_outputs/1-1.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-1.csv)" "<(sort test/clojask/correct_outputs/1-1.csv)")] - (is (= "" (:out result)))) + (let [result (sh "zsh" "-c" "diff <(sort ./test/clojask/test_outputs/1-1.csv) <(sort ./test/clojask/correct_outputs/1-1.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) ;; filter and row-operation (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) (set-type y "Salary" "double") (filter y "Salary" (fn [salary] (<= salary 800))) (operate y str ["Employee" "Salary"] "new-col") (compute y 8 "test/clojask/test_outputs/1-2.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-2.csv)" "<(sort test/clojask/correct_outputs/1-2.csv)")] - (is (= "" (:out result)))) + (let [result (sh "zsh" "-c" "diff <(sort ./test/clojask/test_outputs/1-2.csv) <(sort ./test/clojask/correct_outputs/1-2.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) ;; groupby and aggregate (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) (set-type y "Salary" "double") (group-by y ["Department"]) - (aggregate y max ["Salary"] ["new-Salary"]) + (aggregate y max ["Salary"] ["new-salary"]) (compute y 8 "test/clojask/test_outputs/1-3.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-3.csv)" "<(sort test/clojask/correct_outputs/1-3.csv)")] - (is (= "" (:out result)))) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-3.csv) <(sort test/clojask/correct_outputs/1-3.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) )) (deftest col-api-test @@ -66,8 +69,9 @@ (testing "Select column(s) argument" (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) (compute y 8 "test/clojask/test_outputs/1-9.csv" :select ["Employee", "EmployeeName"] :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-9.csv)" "<(sort test/clojask/correct_outputs/1-9.csv)")] - (is (= "" (:out result)))) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-9.csv) <(sort test/clojask/correct_outputs/1-9.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) )) (deftest join-api-test @@ -85,19 +89,24 @@ (def x (dataframe "test/clojask/Employees-example.csv")) (def y (dataframe "test/clojask/Employees-info-example.csv")) (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-4.csv)" "<(sort test/clojask/correct_outputs/1-4.csv)")] - (is (= "" (:out result)))) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-4.csv) <(sort test/clojask/correct_outputs/1-4.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")] - (is (= "" (:out result)))) - (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")] - (is (= "" (:out result)))) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-5.csv) <(sort test/clojask/correct_outputs/1-5.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) + (compute (inner-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-6.csv" :exception false) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-6.csv) <(sort test/clojask/correct_outputs/1-6.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")] - (is (= "" (:out result)))) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-7.csv) <(sort test/clojask/correct_outputs/1-7.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false) - (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")] - (is (= "" (:out result)))) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-8.csv) <(sort test/clojask/correct_outputs/1-8.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) )) diff --git a/test/clojask/correct_outputs/1-4.csv b/test/clojask/correct_outputs/1-4.csv index 8320b59..46ae2ba 100644 --- a/test/clojask/correct_outputs/1-4.csv +++ b/test/clojask/correct_outputs/1-4.csv @@ -1,8 +1,8 @@ 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate -1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10 5,Evelyn,13,800,2020/12/03,,,, -3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03 -7,Amy,11,50000,2020/11/26,7,Angel,30,2020/12/11 4,Daniel,12,1000,2020/12/05,,,, +1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10 +3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03 6,Ferdinand,21,700,2020/12/05,,,, 2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05 +7,Amy,11,50000,2020/11/26,7,Angel,30,2020/12/11 diff --git a/test/clojask/correct_outputs/1-5.csv b/test/clojask/correct_outputs/1-5.csv index e2473e3..f39148a 100644 --- a/test/clojask/correct_outputs/1-5.csv +++ b/test/clojask/correct_outputs/1-5.csv @@ -1,5 +1,5 @@ -1_Employee,1_EmployeeName,1_DayOff,1_UpdateDate,2_Employee,2_EmployeeName,2_Department,2_Salary,2_UpdateDate -2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01 +2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate,1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate 1,Alice,20,2020/12/10,1,Alice,11,300,2020/12/12 -3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03 7,Angel,30,2020/12/11,7,Amy,11,50000,2020/11/26 +2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01 +3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03 diff --git a/test/clojask/correct_outputs/1-6.csv b/test/clojask/correct_outputs/1-6.csv index 613eb2e..1177529 100644 --- a/test/clojask/correct_outputs/1-6.csv +++ b/test/clojask/correct_outputs/1-6.csv @@ -1,5 +1,5 @@ -1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate -2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05 -3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03 -1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10 -7,Amy,11,50000,2020/11/26,7,Angel,30,2020/12/11 +2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate,1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate +2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01 +3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03 +1,Alice,20,2020/12/10,1,Alice,11,300,2020/12/12 +7,Angel,30,2020/12/11,7,Amy,11,50000,2020/11/26 diff --git a/test/clojask/correct_outputs/1-7.csv b/test/clojask/correct_outputs/1-7.csv index fc7f3da..ef84168 100644 --- a/test/clojask/correct_outputs/1-7.csv +++ b/test/clojask/correct_outputs/1-7.csv @@ -1,8 +1,8 @@ 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate -3,Carla,12,900,2020/12/03,,,, -6,Ferdinand,21,700,2020/12/05,,,, +7,Amy,11,50000,2020/11/26,,,, +3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03 +4,Daniel,12,1000,2020/12/05,,,, 5,Evelyn,13,800,2020/12/03,,,, 1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10 -4,Daniel,12,1000,2020/12/05,,,, -7,Amy,11,50000,2020/11/26,,,, 2,Bob,11,600,2020/12/01,,,, +6,Ferdinand,21,700,2020/12/05,,,, diff --git a/test/clojask/correct_outputs/1-8.csv b/test/clojask/correct_outputs/1-8.csv index 4b51863..974f391 100644 --- a/test/clojask/correct_outputs/1-8.csv +++ b/test/clojask/correct_outputs/1-8.csv @@ -1,8 +1,8 @@ 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate -3,Carla,12,900,2020/12/03,,,, -7,Amy,11,50000,2020/11/26,,,, +5,Evelyn,13,800,2020/12/03,,,, 1,Alice,11,300,2020/12/12,,,, 2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05 -5,Evelyn,13,800,2020/12/03,,,, -4,Daniel,12,1000,2020/12/05,,,, 6,Ferdinand,21,700,2020/12/05,,,, +7,Amy,11,50000,2020/11/26,,,, +3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03 +4,Daniel,12,1000,2020/12/05,,,, From 22a141909dd5def90af9ca818e83080b56f378bf Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Sun, 2 Jan 2022 18:06:04 +0800 Subject: [PATCH 23/33] Change compute to return output dataframe --- src/main/clojure/clojask/dataframe.clj | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 15f2db8..320d920 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -678,14 +678,22 @@ (assert (not= select []) "must select at least 1 column") (if (= (type this) clojask.dataframe.DataFrame) (if (= (.getAggreFunc (:row-info this)) []) - (.compute this num-worker output-dir exception order select) + (do ;; simple compute + (.compute this num-worker output-dir exception order select) + (dataframe output-dir :have-col true)) ;; return output dataframe (if (not= (.getGroupbyKeys (:row-info this)) []) - (.computeGroupAggre this num-worker output-dir exception select) - (.computeAggre this num-worker output-dir exception select))) + (do ;; groupby-aggre + (.computeGroupAggre this num-worker output-dir exception select) + (dataframe output-dir :have-col true)) + (do ;; aggre + (.computeAggre this num-worker output-dir exception select) + (dataframe output-dir :have-col true)))) (if (= (type this) clojask.dataframe.JoinedDataFrame) - (.compute this num-worker output-dir exception order select) - (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe"))))) -) + (do ;; join + (.compute this num-worker output-dir exception order select) + (dataframe output-dir :have-col true)) + (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe")))))) + (defn get-col-names "Get the names for the columns in sequence" [this] From e315e11443a072bea0f5196cc89935b643ba4252 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Sun, 2 Jan 2022 18:06:25 +0800 Subject: [PATCH 24/33] Amend test file --- test/clojask/core_test.clj | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj index 609d8ca..1e9cbec 100644 --- a/test/clojask/core_test.clj +++ b/test/clojask/core_test.clj @@ -3,6 +3,7 @@ [clojask.dataframe :refer :all] [clojask.utils :refer :all] [clojask.groupby :refer :all] + [clojask.api.gb-aggregate :as gb-aggre] [clojask.api.aggregate :as aggre] [clojask.sort :refer :all])) @@ -45,7 +46,7 @@ (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) (set-type y "Salary" "double") (group-by y ["Department"]) - (aggregate y max ["Salary"] ["new-salary"]) + (aggregate y gb-aggre/max ["Salary"] ["new-Salary"]) (compute y 8 "test/clojask/test_outputs/1-3.csv" :exception false) (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-3.csv) <(sort test/clojask/correct_outputs/1-3.csv)")] (is (= "" (:out result))) @@ -78,10 +79,10 @@ (testing "Join dataframes APIs" (def x (dataframe "test/clojask/Employees-example.csv")) (def y (dataframe "test/clojask/Employees-example.csv")) - (is (= "success" (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) - (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) - (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))) - (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false))) + (is (= clojask.dataframe.DataFrame (type (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))) + (is (= clojask.dataframe.DataFrame (type (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))) + (is (= clojask.dataframe.DataFrame (type (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))) + (is (= clojask.dataframe.DataFrame (type (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false)))) )) (deftest join-api-output-test From b8e5f989c7125aaf4714326a1cf7a208e67dc2cd Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Wed, 5 Jan 2022 18:08:35 +0800 Subject: [PATCH 25/33] allow only group by --- src/main/clojure/clojask/dataframe.clj | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 0962eee..987734f 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -319,6 +319,8 @@ ;; test (println [groupby-keys aggre-keys select pre-index data-index]) res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)] ;(.printAggreCol this output-dir) ;; print column names to output-dir + (println (str "Since the dataframe is only grouped by but not aggregated, the result will be the same as to choose the distinct values of " + "the groupby keys.")) (.printCol this output-dir select) ;; todo: based on "select" (if (= res "success") ;; (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception)) @@ -573,6 +575,8 @@ b-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info b)) (nth _ 1))]) b-keys)] (cond (not (and (= (type a) clojask.dataframe.DataFrame) (= (type b) clojask.dataframe.DataFrame))) (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes."))) + (cond (or (not= (.getAggreFunc (:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) [])) + (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join."))) (cond (not (= (count a-keys) (count b-keys))) (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) @@ -593,6 +597,8 @@ b-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info b)) (nth _ 1))]) b-keys)] (cond (not (and (= (type a) clojask.dataframe.DataFrame) (= (type b) clojask.dataframe.DataFrame))) (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes."))) + (cond (or (not= (.getAggreFunc (:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) [])) + (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join."))) (cond (not (= (count a-keys) (count b-keys))) (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (= (count col-prefix) 2)) @@ -609,6 +615,8 @@ b-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info b)) (nth _ 1))]) b-keys)] (cond (not (and (= (type a) clojask.dataframe.DataFrame) (= (type b) clojask.dataframe.DataFrame))) (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes."))) + (cond (or (not= (.getAggreFunc (:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) [])) + (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join."))) (cond (not (= (count a-keys) (count b-keys))) (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) @@ -630,6 +638,8 @@ (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) (throw (Clojask_TypeException. "Input includes non-existent column name(s)."))) + (cond (or (not= (.getAggreFunc(:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) [])) + (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join."))) (let [[a-roll b-roll] [(get (.getKeyIndex (:col-info a)) a-roll) (get (.getKeyIndex (:col-info b)) b-roll)]] (do (cond (not (and (not= a-roll nil) (not= b-roll nil))) @@ -648,6 +658,8 @@ (throw (Clojask_TypeException. "Rolling keys should be strings"))) (cond (not (and (= (type a) clojask.dataframe.DataFrame) (= (type b) clojask.dataframe.DataFrame))) (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes."))) + (cond (or (not= (.getAggreFunc (:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) [])) + (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join."))) (cond (not (= (count a-keys) (count b-keys))) (throw (Clojask_TypeException. "The length of left keys and right keys should be equal."))) (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) @@ -668,7 +680,7 @@ select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))] (assert (not= select []) "must select at least 1 column") (if (= (type this) clojask.dataframe.DataFrame) - (if (= (.getAggreFunc (:row-info this)) []) + (if (and (= (.getGroupbyKeys (:row-info this)) []) (= (.getAggreFunc (:row-info this)) [])) (.compute this num-worker output-dir exception order select) (if (not= (.getGroupbyKeys (:row-info this)) []) (.computeGroupAggre this num-worker output-dir exception select) From 60c2905fcc93819bc0830c3b9ab0a861f44b1048 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Wed, 5 Jan 2022 22:12:22 +0800 Subject: [PATCH 26/33] Delete deprecated printCol functions --- src/main/clojure/clojask/dataframe.clj | 28 +------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 320d920..06b8611 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -140,7 +140,6 @@ ;; not aggregate (let [index-key (.getIndexKey (:col-info this)) index (.getColIndex this)] - ;(mapv (fn [i] (get {0 "Employee", 1 "EmployeeName", 2 "Department", 3 "Salary"} i)) [0 2 2 2]) (mapv (fn [i] (get index-key i)) index)) ;; if aggregate (let [groupby-key-index (.getGroupbyKeys (:row-info this)) @@ -157,36 +156,11 @@ (.write wrtr (str (str/join "," col-set) "\n"))))) (printColByIndex - ;; print column names, called by compute + ;; print column names, called by computeGroupByAggre [this output-path selected-index] (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))] (with-open [wrtr (io/writer output-path)] (.write wrtr (str (str/join "," col-set) "\n"))))) - - ;; !! deprecated - (printAggreCol - ;; print column names, called by computeAggre - [this output-path] - (.checkOutputPath this output-path) - (let [groupby-key-index (.getGroupbyKeys (:row-info this)) - groupby-keys (vec (map (.getIndexKey (.col-info this)) (vec (map #(last %) groupby-key-index)))) - aggre-new-keys (.getAggreNewKeys (:row-info this))] - (with-open [wrtr (io/writer output-path)] - (.write wrtr (str (str/join "," (concat groupby-keys aggre-new-keys)) "\n"))))) - - ;; !! deprecated - (printJoinCol - ;; print column names, called by join APIs - [this b-df this-keys b-keys output-path col-prefix] - (.checkOutputPath this output-path) - (let [a-col-prefix (first col-prefix) - b-col-prefix (last col-prefix) - a-col-set (.getColNames this) - b-col-set (.getColNames b-df) - a-col-header (map #(str a-col-prefix "_" %) a-col-set) - b-col-header (map #(str b-col-prefix "_" %) b-col-set)] - (with-open [wrtr (io/writer output-path)] - (.write wrtr (str (str/join "," (concat a-col-header b-col-header)) "\n"))))) (delCol [this col-to-del] From e91560ebc7f321b0a6f4994bab9e9baa3a96c14e Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Wed, 5 Jan 2022 22:13:15 +0800 Subject: [PATCH 27/33] Delete deprecated function declarations --- src/main/clojure/clojask/dataframe.clj | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 5878788..20a4e54 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -32,8 +32,6 @@ (getColNames [] "get column names") (printCol [output-path selected-col] "print column names to output file") (printColByIndex [output-path selected-index] "print column names to output file") - (printAggreCol [output-path] "print column names to output file for aggregate") - (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join") (delCol [col-to-del] "delete one or more columns in the dataframe") (reorderCol [new-col-order] "reorder columns in the dataframe") (renameCol [new-col-names] "rename columns in the dataframe") From 50816100afe726f10c4243e0a497cc25daecba08 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Wed, 5 Jan 2022 22:28:27 +0800 Subject: [PATCH 28/33] Unify printCol functions as one --- src/main/clojure/clojask/dataframe.clj | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 20a4e54..6208d17 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -30,8 +30,7 @@ (colTypes [] "get column type in ColInfo") (getColIndex [] "get column indices, excluding deleted columns") (getColNames [] "get column names") - (printCol [output-path selected-col] "print column names to output file") - (printColByIndex [output-path selected-index] "print column names to output file") + (printCol [output-path selected-index] "print column names to output file") (delCol [col-to-del] "delete one or more columns in the dataframe") (reorderCol [new-col-order] "reorder columns in the dataframe") (renameCol [new-col-names] "rename columns in the dataframe") @@ -145,16 +144,17 @@ aggre-new-keys (.getAggreNewKeys (:row-info this))] (concat groupby-keys aggre-new-keys)))) - (printCol - ;; print column names, called by compute and computeAggre - [this output-path selected-col] - (.checkOutputPath this output-path) - (let [col-set (if (= selected-col [nil]) (.getColNames this) selected-col)] - (with-open [wrtr (io/writer output-path)] - (.write wrtr (str (str/join "," col-set) "\n"))))) + ;; !! deprecated + ;; (printCol + ;; ;; print column names, called by compute and computeAggre + ;; [this output-path selected-col] + ;; (.checkOutputPath this output-path) + ;; (let [col-set (if (= selected-col [nil]) (.getColNames this) selected-col)] + ;; (with-open [wrtr (io/writer output-path)] + ;; (.write wrtr (str (str/join "," col-set) "\n"))))) - (printColByIndex - ;; print column names, called by computeGroupByAggre + (printCol + ;; print column names, called by compute, computeAggre and computeGroupByAggre [this output-path selected-index] (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))] (with-open [wrtr (io/writer output-path)] @@ -251,7 +251,7 @@ (if (<= num-worker 8) (try (.final this) - (.printCol this output-dir select) ;; to-do: based on the index => Done + (.printCol this output-dir index) ;; to-do: based on the index => Done (let [res (start-onyx num-worker batch-size this output-dir exception order index)] (if (= res "success") "success" @@ -302,7 +302,7 @@ (println (str "Since the dataframe is only grouped by but not aggregated, the result will be the same as to choose the distinct values of " "the groupby keys.")) ;; (.printCol this output-dir select) ;; todo: based on "select" - (.printColByIndex this output-dir select) ;; todo: based on "select" + (.printCol this output-dir select) ;; todo: based on "select" (if (= res "success") ;; (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception)) (let [shift-func (fn [pair] From 6978df5ed74fb2ece62140ae4d629670b680dad4 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Wed, 5 Jan 2022 22:29:08 +0800 Subject: [PATCH 29/33] Update test file to include only calling group-by and only calling aggregate --- test/clojask/core_test.clj | 15 +++++++++++++++ test/clojask/correct_outputs/1-10.csv | 2 ++ test/clojask/correct_outputs/1-11.csv | 5 +++++ 3 files changed, 22 insertions(+) create mode 100644 test/clojask/correct_outputs/1-10.csv create mode 100644 test/clojask/correct_outputs/1-11.csv diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj index 1e9cbec..e60a92e 100644 --- a/test/clojask/core_test.clj +++ b/test/clojask/core_test.clj @@ -51,6 +51,21 @@ (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-3.csv) <(sort test/clojask/correct_outputs/1-3.csv)")] (is (= "" (:out result))) (is (= "" (:err result)))) + ;; aggregate only + (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) + (set-type y "Salary" "double") + (aggregate y aggre/max ["Salary"] ["new-Salary"]) + (compute y 8 "test/clojask/test_outputs/1-10.csv" :exception false) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-10.csv) <(sort test/clojask/correct_outputs/1-10.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) + ;; groupby only + (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) + (group-by y ["Department"]) + (compute y 8 "test/clojask/test_outputs/1-11.csv" :exception false) + (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-11.csv) <(sort test/clojask/correct_outputs/1-11.csv)")] + (is (= "" (:out result))) + (is (= "" (:err result)))) )) (deftest col-api-test diff --git a/test/clojask/correct_outputs/1-10.csv b/test/clojask/correct_outputs/1-10.csv new file mode 100644 index 0000000..7bdcd99 --- /dev/null +++ b/test/clojask/correct_outputs/1-10.csv @@ -0,0 +1,2 @@ +new-Salary +50000.0 diff --git a/test/clojask/correct_outputs/1-11.csv b/test/clojask/correct_outputs/1-11.csv new file mode 100644 index 0000000..f1d7b1d --- /dev/null +++ b/test/clojask/correct_outputs/1-11.csv @@ -0,0 +1,5 @@ +Department +12 +21 +13 +11 From ad9f5aee2168b52b24d9b68e576bb32312a5eaaa Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Sat, 8 Jan 2022 11:45:14 +0800 Subject: [PATCH 30/33] Added checkInputPathClash for basic compute function --- src/main/clojure/clojask/dataframe.clj | 27 +++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 6208d17..5ce89d1 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -21,7 +21,9 @@ (definterface DFIntf (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select] "final evaluatation") + (getPath [] "get input path of dataframe") (checkOutputPath [output-path] "check if output path is of string type") + (checkInputPathClash [path] "check if path clashs with dataframe input path") (operate [operation colName] "operate an operation to column and replace in place") (operate [operation colName newCol] "operate an operation to column and add the result as new column") (setType [type colName] "types supported: int double string date") @@ -58,11 +60,33 @@ ^Boolean have-col] DFIntf + (getPath + [this] + path) + (checkOutputPath [this output-path] (cond (not (= java.lang.String (type output-path))) (throw (Clojask_TypeException. "Output path should be a string.")))) + (checkInputPathClash + [this path] + (defn get-path-str + [path] + (if (str/starts-with? path "./") + (str "file:///" (str/replace-first path "./" "")) + (if (str/starts-with? path "/") + (str "file:///" (str/replace-first path "./" "")) + (str "file:///" path)))) + (let [path-str (get-path-str path) + input-path-str (get-path-str (.getPath this)) + path-obj (java.nio.file.Paths/get (new java.net.URI path-str)) + input-path-obj (java.nio.file.Paths/get (new java.net.URI input-path-str)) + paths-equal (java.nio.file.Paths/.equals path-obj input-path-obj)] + (cond paths-equal + (throw (Clojask_OperationException. "Output path should be different from input path of dataframe argument."))) + )) + (operate ;; has assert [this operation colName] (if (nil? (.operate col-info operation colName)) @@ -656,7 +680,8 @@ (defn compute [this num-worker output-dir & {:keys [exception order select exclude] :or {exception false order true select nil exclude nil}}] (assert (or (nil? select) (nil? exclude)) "can only specify either of them") - (u/init-file output-dir) + ;; check if output-dir clashes with input file path + (.checkInputPathClash this output-dir) ;; check which type of dataframe this is (let [exclude (if (coll? exclude) exclude [exclude]) select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))] From d949f9f45b342bc189a043a84f277c1e38de3714 Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Sat, 8 Jan 2022 13:14:43 +0800 Subject: [PATCH 31/33] Added checkInputPathClash for JoinDF --- src/main/clojure/clojask/dataframe.clj | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 5ce89d1..7734db8 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -509,7 +509,7 @@ ;; ============= Below is the definition for the joineddataframe ================ (definterface JDFIntf - (checkOutputPath [output-path] "check if output path is of string type") + (checkInputPathClash [path] "check if paths clashes with dataframes a/b input path") (getColNames [] "get the names of all the columns") (printCol [output-path selected-col] "print column names to output file") (preview [] "preview the column names") @@ -526,6 +526,11 @@ limit prefix] JDFIntf + + (checkInputPathClash + [this path] + (.checkInputPathClash a path) + (.checkInputPathClash b path)) (getColNames [this] From 8e946c698cb4585fdaa3a219d0c3efe44fde097b Mon Sep 17 00:00:00 2001 From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com> Date: Sat, 8 Jan 2022 17:41:49 +0800 Subject: [PATCH 32/33] minor --- src/main/clojure/clojask/dataframe.clj | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index 6208d17..c564686 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -299,8 +299,9 @@ ;; test (println [groupby-keys aggre-keys select pre-index data-index]) res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)] ;(.printAggreCol this output-dir) ;; print column names to output-dir - (println (str "Since the dataframe is only grouped by but not aggregated, the result will be the same as to choose the distinct values of " - "the groupby keys.")) + (if (= aggre-keys []) + (println (str "Since the dataframe is only grouped by but not aggregated, the result will be the same as to choose the distinct values of " + "the groupby keys."))) ;; (.printCol this output-dir select) ;; todo: based on "select" (.printCol this output-dir select) ;; todo: based on "select" (if (= res "success") From 2b5a725f958212f8e7db9a2e56031c6000e53d6d Mon Sep 17 00:00:00 2001 From: Angel Woo Date: Sat, 8 Jan 2022 22:11:35 +0800 Subject: [PATCH 33/33] Solve bug of mis-deleting init-file line in compute --- src/main/clojure/clojask/dataframe.clj | 2 ++ src/main/clojure/clojask/groupby.clj | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj index d0c88c9..48bd325 100644 --- a/src/main/clojure/clojask/dataframe.clj +++ b/src/main/clojure/clojask/dataframe.clj @@ -688,6 +688,8 @@ (assert (or (nil? select) (nil? exclude)) "can only specify either of them") ;; check if output-dir clashes with input file path (.checkInputPathClash this output-dir) + ;; initialise file + (u/init-file output-dir) ;; check which type of dataframe this is (let [exclude (if (coll? exclude) exclude [exclude]) select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))] diff --git a/src/main/clojure/clojask/groupby.clj b/src/main/clojure/clojask/groupby.clj index 8d2c7b2..8805af2 100644 --- a/src/main/clojure/clojask/groupby.clj +++ b/src/main/clojure/clojask/groupby.clj @@ -1,6 +1,6 @@ (ns clojask.groupby (:require [clojure.java.io :as io] - [clojure-csv.core :as csv] + ;[clojure-csv.core :as csv] [clojask.utils :as u] [clojure.core.async :as async])) "contains the utility functions to group by and aggregate"