From fd541e542c608d266c1ed8cbed3470c6b342dec3 Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Sun, 21 Nov 2021 15:26:30 +0800
Subject: [PATCH 01/33] change the location of log file

---
 src/main/clojure/aggregate/aggre_onyx_comps.clj | 6 ++++--
 src/main/clojure/clojask/onyx_comps.clj         | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/main/clojure/aggregate/aggre_onyx_comps.clj b/src/main/clojure/aggregate/aggre_onyx_comps.clj
index 5179f06..4cbfb48 100644
--- a/src/main/clojure/aggregate/aggre_onyx_comps.clj
+++ b/src/main/clojure/aggregate/aggre_onyx_comps.clj
@@ -223,7 +223,8 @@
     {:zookeeper/address "127.0.0.1:2188"
      :zookeeper/server? true
      :zookeeper.server/port 2188
-     :onyx/tenancy-id id})
+     :onyx/tenancy-id id
+     :onyx.log/file "_clojask/clojask.log"})
 
   (def peer-config
     {:zookeeper/address "127.0.0.1:2188"
@@ -231,7 +232,8 @@
      :onyx.peer/job-scheduler :onyx.job-scheduler/balanced
      :onyx.messaging/impl :aeron
      :onyx.messaging/peer-port 40200
-     :onyx.messaging/bind-addr "localhost"})
+     :onyx.messaging/bind-addr "localhost"
+     :onyx.log/file "_clojask/clojask.log"})
 
   (def env (onyx.api/start-env env-config))
 
diff --git a/src/main/clojure/clojask/onyx_comps.clj b/src/main/clojure/clojask/onyx_comps.clj
index 47898b4..964ae2a 100644
--- a/src/main/clojure/clojask/onyx_comps.clj
+++ b/src/main/clojure/clojask/onyx_comps.clj
@@ -446,7 +446,8 @@
     {:zookeeper/address "127.0.0.1:2188"
      :zookeeper/server? true
      :zookeeper.server/port 2188
-     :onyx/tenancy-id id})
+     :onyx/tenancy-id id
+     :onyx.log/file "_clojask/clojask.log"})
 
   (def peer-config
     {:zookeeper/address "127.0.0.1:2188"
@@ -454,7 +455,8 @@
      :onyx.peer/job-scheduler :onyx.job-scheduler/balanced
      :onyx.messaging/impl :aeron
      :onyx.messaging/peer-port 40200
-     :onyx.messaging/bind-addr "localhost"})
+     :onyx.messaging/bind-addr "localhost"
+     :onyx.log/file "_clojask/clojask.log"})
 
   (def env (onyx.api/start-env env-config))
 

From a767454b6b8da6aa45767e764739dfda69d8d735 Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Sun, 21 Nov 2021 22:00:46 +0800
Subject: [PATCH 02/33] Update README.md

---
 README.md | 55 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 0de624c..297c3d7 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,56 @@
-# clojask
+# Clojask
 Clojure data frame with parallel computing on larger-than-memory datasets
 
-#### Run the main function in `core`:
+### Features
 
-```
-lein run
-```
+- **Unlimited size**
+
+  In theory, it supports datasets larger than memory, with no upper bound on size!
+
+- **Fast**
+
+  Faster than Dask in most operations; the larger the dataframe, the bigger the advantage.
+
+- **All native types**
+
+  All the datatypes used to store data are native Clojure (or Java) types!
+
+- **From file to file**
+
+  IO is integrated into the dataframe. No need to write your own read-in and output functions!
+
+- **Parallel**
 
-#### Run the tests in `test`:
+  Most operations can be executed across multiple threads or even machines. See [Onyx](http://www.onyxplatform.org/) for the underlying principle.
+
+- **Lazy operations**
+
+  Most operations are not executed immediately. The dataframe intelligently pipelines the operations together at computation time.
+
+### Installation
+
+Available on [Clojars](https://clojars.org/com.github.clojure-finance/clojask). 
+
+Insert this line into your `project.clj` if using Leiningen.
 
 ```
-lein test
+[com.github.clojure-finance/clojask "1.0.0"]
 ```
 
+Insert this line into your `deps.edn` if using the Clojure CLI.
 
-To run a particular test defined in the namespace:
 ```
-lein test :only core-test/df-api-test 
+com.github.clojure-finance/clojask {:mvn/version "1.0.0"}
 ```
 
-#### Requirements for the input file:
-- the first row should contain the column names
+### Documentation
+
+Detailed documentation for every API can be found [here](https://clojure-finance.github.io/clojask-website/posts-output/API/).
+
+### Examples
+
+A separate repository with typical usage examples of Clojask can be found [here](https://github.com/clojure-finance/clojask-examples).
+
+### Problem Feedback
+
+If your question is not answered in the existing [issues](https://github.com/clojure-finance/clojask/issues), feel free to create a new one.

From e4b98fccbe90c3a021b9650ae23e51b8780f4d5f Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Mon, 22 Nov 2021 21:50:30 +0800
Subject: [PATCH 03/33] Remove old example file

---
 examples/multi-threading.clj | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 examples/multi-threading.clj

diff --git a/examples/multi-threading.clj b/examples/multi-threading.clj
deleted file mode 100644
index baff528..0000000
--- a/examples/multi-threading.clj
+++ /dev/null
@@ -1,15 +0,0 @@
-(ns examples.timezone
-    (:require [clojask.dataframe :refer :all]
-              [clojure.core.async :as async]))
-  
-  (defn main
-    []
-    (def x (dataframe "resources/Employees-large.csv"))
-    (def y (dataframe "resources/Employees.csv"))
-
-    ;; create a thread for each operation
-    (async/thread (set-type x "double" "Department"))
-    (async/thread (set-type y "double" "Department"))
-
-    (time (left-join x y ["Employee"] ["Employee"] 4 "resources/test.csv" :exception false))
-    )
\ No newline at end of file

From e887f381fdf4ebbc8c987ab396e2abc3c15b9f78 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Fri, 10 Dec 2021 18:14:41 +0800
Subject: [PATCH 04/33] Amend printJoinCol to allow input for col-prefix as
 optional arg

---
 src/main/clojure/clojask/dataframe.clj | 34 ++++++++++++++------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 8f93903..4d91891 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -31,7 +31,7 @@
   (getColNames [])
   (printCol [output-path] "print column names to output file")
   (printAggreCol [output-path] "print column names to output file for aggregate")
-  (printJoinCol [b-df a-keys b-keys output-path] "print column names to output file for join")
+  (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join")
   (delCol [col-to-del] "delete column(s) in the dataframe")
   (reorderCol [new-col-order] "reorder columns in the dataframe")
   (renameCol [new-col-names] "reorder columns in the dataframe")
@@ -156,13 +156,15 @@
 
   (printJoinCol
   ;; print column names, called by join APIs
-    [this b-df this-keys b-keys output-path]
+    [this b-df this-keys b-keys output-path col-prefix]
     (cond (not (= java.lang.String (type output-path)))
           (throw (Clojask_TypeException. "Output path should be a string.")))
-    (let [a-col-set (.getColNames this)
+    (let [a-col-prefix (first col-prefix)
+          b-col-prefix (last col-prefix)
+          a-col-set (.getColNames this)
           b-col-set (.getColNames b-df)
-          a-col-header (map #(str "1_" %) a-col-set)
-          b-col-header (map #(str "2_" %) b-col-set)]
+          a-col-header (map #(str a-col-prefix "_" %) a-col-set)
+          b-col-header (map #(str b-col-prefix "_" %) b-col-set)]
         (with-open [wrtr (io/writer output-path)]
           (.write wrtr (str (str/join "," (concat a-col-header b-col-header)) "\n")))))
   
@@ -467,7 +469,7 @@
   result))
 
 (defn inner-join
-  [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}]
+  [a b a-keys b-keys num-worker dist & {:keys [col-prefix exception] :or {col-prefix ["1" "2"] exception false}}]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -480,13 +482,13 @@
       (throw (Clojask_TypeException. "Input includes non-existent column name(s).")))
     (u/init-file dist)
     ;; print column names
-    (.printJoinCol a b a-keys b-keys dist)
+    (.printJoinCol a b a-keys b-keys dist col-prefix)
     ;; first group b by keys
     (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
     (start-onyx-join num-worker 10 a b dist exception a-keys b-keys nil nil 1)))
 
 (defn left-join
-  [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}]
+  [a b a-keys b-keys num-worker dist & {:keys [col-prefix exception] :or {col-prefix ["1" "2"] exception false}}]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -495,17 +497,19 @@
       (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes.")))
     (cond (not (= (count a-keys) (count b-keys))) 
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
+    (cond (not (= (count col-prefix) 2)) 
+      (throw (Clojask_TypeException. "The length of col-prefix should be equal to 2.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) 
       (throw (Clojask_TypeException. "Input includes non-existent column name(s).")))
     (u/init-file dist)
     ;; print column names
-    (.printJoinCol a b a-keys b-keys dist)
+    (.printJoinCol a b a-keys b-keys dist col-prefix)
     ;; first group b by keys
     (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
     (start-onyx-join num-worker 10 a b dist exception a-keys b-keys nil nil 2)))
 
 (defn right-join
-  [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}]
+  [a b a-keys b-keys num-worker dist & {:keys [col-prefix exception] :or {col-prefix ["1" "2"] exception false}}]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -518,13 +522,13 @@
       (throw (Clojask_TypeException. "Input includes non-existent column name(s).")))
     (u/init-file dist)
     ;; print column names
-    (.printJoinCol b a a-keys b-keys dist)
+    (.printJoinCol b a a-keys b-keys dist col-prefix)
     ;; first group b by keys
     (start-onyx-groupby num-worker 10 a "./_clojask/join/b/" a-keys exception)
     (start-onyx-join num-worker 10 b a dist exception b-keys a-keys nil nil 2)))
 
 (defn rolling-join-forward
-  [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [exception limit] :or {exception false limit nil}}]
+  [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [col-prefix exception limit] :or {col-prefix ["1" "2"] exception false limit nil}}]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -543,13 +547,13 @@
           (throw (Clojask_TypeException. "Rolling keys include non-existent column name(s).")))
         (u/init-file dist)
         ;; print column names
-        (.printJoinCol a b a-keys b-keys dist)
+        (.printJoinCol a b a-keys b-keys dist col-prefix)
         (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
         (start-onyx-join num-worker 10 a b dist exception a-keys b-keys a-roll b-roll 4 limit)))))
 
 ;; all of the code is the same as above except for the last line
 (defn rolling-join-backward
-  [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [exception limit] :or {exception false limit nil}}]
+  [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [col-prefix exception limit] :or {col-prefix ["1" "2"] exception false limit nil}}]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -568,6 +572,6 @@
               (throw (Clojask_TypeException. "Rolling keys include non-existent column name(s).")))
         (u/init-file dist)
         ;; print column names
-        (.printJoinCol a b a-keys b-keys dist)
+        (.printJoinCol a b a-keys b-keys dist col-prefix)
         (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
         (start-onyx-join num-worker 10 a b dist exception a-keys b-keys a-roll b-roll 5 nil)))))

From e5e9ff9c04da868bf0b590d8ea34b02f6608e0cc Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Sat, 18 Dec 2021 14:03:33 +0800
Subject: [PATCH 05/33] Refactor code in dataframe.clj

---
 src/main/clojure/clojask/dataframe.clj | 55 ++++++++++++++------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 4d91891..a571093 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -20,25 +20,27 @@
         '[com.clojask.exception Clojask_OperationException])
 
 (definterface DFIntf
-  (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order])
+  (checkOutputPath [output-path] "check if output path is of string type")
   (operate [operation colName] "operate an operation to column and replace in place")
   (operate [operation colName newCol] "operate an operation to column and add the result as new column")
   (setType [type colName] "types supported: int double string date")
   (setParser [parser col] "add the parser for a col which acts like setType")
-  (colDesc [])
-  (colTypes [])
-  (getColIndex [])
-  (getColNames [])
+  (colDesc [] "get column description in ColInfo")
+  (colTypes [] "get column type in ColInfo")
+  (getColIndex [] "get column indices, excluding deleted columns")
+  (getColNames [] "get column names")
   (printCol [output-path] "print column names to output file")
   (printAggreCol [output-path] "print column names to output file for aggregate")
   (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join")
-  (delCol [col-to-del] "delete column(s) in the dataframe")
+  (delCol [col-to-del] "delete one or more columns in the dataframe")
   (reorderCol [new-col-order] "reorder columns in the dataframe")
-  (renameCol [new-col-names] "reorder columns in the dataframe")
+  (renameCol [new-col-names] "rename columns in the dataframe")
   (groupby [a] "group the dataframe by the key(s)")
   (aggregate [a c b] "aggregate the group-by result by the function")
-  (head [n])
+  (head [n] "return first n lines in dataframe")
   (filter [cols predicate])
+  (computeTypeCheck [num-worker output-dir])
+  (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order])
   (computeGroupAggre [^int num-worker ^String output-dir ^boolean exception])
   (computeAggre [^int num-worker ^String output-dir ^boolean exception])
   (sort [a b] "sort the dataframe based on columns")
@@ -58,6 +60,11 @@
             ^Boolean have-col]
   DFIntf
 
+  (checkOutputPath
+    [this output-path]
+    (cond (not (= java.lang.String (type output-path)))
+          (throw (Clojask_TypeException. "Output path should be a string."))))
+
   (operate ;; has assert
     [this operation colName]
     (if (nil? (.operate col-info operation colName))
@@ -136,8 +143,7 @@
   (printCol
   ;; print column names, called by compute
     [this output-path]
-    (cond (not (= java.lang.String (type output-path)))
-          (throw (Clojask_TypeException. "Output path should be a string.")))
+    (.checkOutputPath this output-path)
     (let [col-set (.getColNames this)]
       (with-open [wrtr (io/writer output-path)]
         (.write wrtr (str (str/join "," col-set) "\n")))))
@@ -145,20 +151,17 @@
   (printAggreCol
   ;; print column names, called by computeAggre
     [this output-path]
-    (cond (not (= java.lang.String (type output-path)))
-          (throw (Clojask_TypeException. "Output path should be a string.")))
+    (.checkOutputPath this output-path)
     (let [groupby-key-index (.getGroupbyKeys (:row-info this))
           groupby-keys (vec (map (.getIndexKey (.col-info this)) (vec (map #(last %) groupby-key-index))))
           aggre-new-keys (.getAggreNewKeys (:row-info this))]
-        ;(println (vec (map #(last %) groupby-key-index)))
       (with-open [wrtr (io/writer output-path)]
         (.write wrtr (str (str/join "," (concat groupby-keys aggre-new-keys)) "\n")))))
 
   (printJoinCol
   ;; print column names, called by join APIs
     [this b-df this-keys b-keys output-path col-prefix]
-    (cond (not (= java.lang.String (type output-path)))
-          (throw (Clojask_TypeException. "Output path should be a string.")))
+    (.checkOutputPath this output-path)
     (let [a-col-prefix (first col-prefix)
           b-col-prefix (last col-prefix)
           a-col-set (.getColNames this)
@@ -233,7 +236,6 @@
     [this]
     (doseq [tmp (.getFormatter (:col-info this))]
       (.operate this (nth tmp 1) (get (.getIndexKey col-info) (nth tmp 0)))))
-    ;; currently put read file here
 
   (preview
     [this sample-size return-size format]
@@ -241,9 +243,18 @@
           (throw (Clojask_TypeException. "Arguments passed to preview must be integers.")))
     (preview/preview this sample-size return-size format))
 
+  (computeTypeCheck
+    [this num-worker output-dir]
+    (cond (not (= java.lang.String (type output-dir)))
+      (throw (Clojask_TypeException. "Output directory should be a string.")))
+    (cond (not (integer? num-worker))
+      (throw (Clojask_TypeException. "Number of workers should be an integer.")))
+    (if (> num-worker 8)
+      (throw (Clojask_OperationException. "Max number of worker nodes is 8."))))
+
   (compute
     [this ^int num-worker ^String output-dir ^boolean exception ^boolean order]
-    ;(assert (= java.lang.String (type output-dir)) "output path should be a string")
+    (.computeTypeCheck this num-worker output-dir)
     (if (<= num-worker 8)
       (try
         (.final this)
@@ -257,10 +268,7 @@
   
   (computeAggre
    [this ^int num-worker ^String output-dir ^boolean exception]
-   (cond (not (= java.lang.String (type output-dir)))
-         (throw (Clojask_TypeException. "Output-dir should be a string.")))
-   (if (> num-worker 8)
-     (throw (Clojask_OperationException. "Max number of worker nodes is 8.")))
+   (.computeTypeCheck this num-worker output-dir)
    (.printAggreCol this output-dir) ;; print column names to output-dir
    (let [res (start-onyx-aggre-only num-worker batch-size this output-dir exception)]
      (if (= res "success")
@@ -269,8 +277,7 @@
   
   (computeGroupAggre
     [this ^int num-worker ^String output-dir ^boolean exception]
-    (cond (not (= java.lang.String (type output-dir)))
-          (throw (Clojask_TypeException. "Output-dir should be a string.")))
+    (.computeTypeCheck this num-worker output-dir)
     (if (<= num-worker 8)
       (try
         (let [res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" (.getGroupbyKeys (:row-info this)) exception)]
@@ -371,8 +378,6 @@
       (DataFrame. path 300 col-info row-info have-col))
     (catch Exception e
       (do
-        ;; (println "No such file or directory")
-        ;; (throw e)
         (throw (Clojask_OperationException. "no such file or directory"))
         nil))))
 

From cf47c0857132ce39b2083bf98eca528acb0cca58 Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Sat, 18 Dec 2021 21:05:55 +0800
Subject: [PATCH 06/33] add select to compute and make join lazy

---
 src/main/clojure/clojask/dataframe.clj  | 145 ++++++++++++++----------
 src/main/clojure/clojask/onyx_comps.clj |  18 ++-
 2 files changed, 96 insertions(+), 67 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 8f93903..7486fb1 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -12,15 +12,15 @@
             [clojask.preview :as preview]
             [clojure.pprint :as pprint])
   (:import [clojask.ColInfo ColInfo]
-           [clojask.RowInfo RowInfo])
+           [clojask.RowInfo RowInfo]
+           [com.clojask.exception Clojask_TypeException]
+           [com.clojask.exception Clojask_OperationException])
   (:refer-clojure :exclude [filter group-by sort]))
 "The clojask lazy dataframe"
 
-(import '[com.clojask.exception Clojask_TypeException]
-        '[com.clojask.exception Clojask_OperationException])
 
 (definterface DFIntf
-  (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order])
+  (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select] "final evaluatation")
   (operate [operation colName] "operate an operation to column and replace in place")
   (operate [operation colName newCol] "operate an operation to column and add the result as new column")
   (setType [type colName] "types supported: int double string date")
@@ -240,18 +240,22 @@
     (preview/preview this sample-size return-size format))
 
   (compute
-    [this ^int num-worker ^String output-dir ^boolean exception ^boolean order]
+    [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select]
     ;(assert (= java.lang.String (type output-dir)) "output path should be a string")
-    (if (<= num-worker 8)
-      (try
-        (.final this)
-        (.printCol this output-dir) ;; print column names to output-dir
-        (let [res (start-onyx num-worker batch-size this output-dir exception order)]
-          (if (= res "success")
-            "success"
-            "failed"))
-        (catch Exception e e))
-      (throw (Clojask_OperationException. "Max number of worker nodes is 8."))))
+    (let [key-index (.getKeyIndex (:col-info this))
+          select (if (coll? select) select [select])
+          index (if (= select [nil]) (take (count key-index) (iterate inc 0)) (vals (select-keys key-index select)))]
+      (assert (= (count select) (count index)) (Clojask_OperationException. "Must select existing columns. You may check it using"))
+      (if (<= num-worker 8)
+        (try
+          (.final this)
+          (.printCol this output-dir) ;; to-do: based on the index
+          (let [res (start-onyx num-worker batch-size this output-dir exception order index)]
+            (if (= res "success")
+              "success"
+              "failed"))
+          (catch Exception e e))
+        (throw (Clojask_OperationException. "Max number of worker nodes is 8.")))))
   
   (computeAggre
    [this ^int num-worker ^String output-dir ^boolean exception]
@@ -390,15 +394,6 @@
     (.errorPredetect this "this function cannot be appended into the current pipeline")
     result)))
 
-(defn compute
-  [this num-worker output-dir & {:keys [exception order] :or {exception false order true}}]
-  (u/init-file output-dir)
-  (if (= (.getAggreFunc (:row-info this)) [])
-    (.compute this num-worker output-dir exception order)
-    (if (not= (.getGroupbyKeys (:row-info this)) [])
-      (.computeGroupAggre this num-worker output-dir exception)
-      (.computeAggre this num-worker output-dir exception))))
-
 (defn group-by
   [this key]
   (let [result (.groupby this key)]
@@ -466,8 +461,35 @@
     (.errorPredetect this "invalid arguments passed to rename-col function")
   result))
 
+;; ============= Below is the definition for the joineddataframe ================
+(definterface JDFIntf
+  (getColNames [] "get the names of all the columns")
+  (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude]))
+
+(defrecord JoinedDataFrame
+           [^DataFrame a
+            ^DataFrame b
+            a-keys
+            b-keys
+            a-roll
+            b-roll
+            type
+            limit]
+  JDFIntf
+  (getColNames
+    [this])
+
+  (compute
+   [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude]
+   (let []
+     (u/init-file output-dir)
+        ;; print column names
+    ;;  (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames
+     (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
+     (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit))))
+
 (defn inner-join
-  [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}]
+  [a b a-keys b-keys]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -478,15 +500,15 @@
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) 
       (throw (Clojask_TypeException. "Input includes non-existent column name(s).")))
-    (u/init-file dist)
-    ;; print column names
-    (.printJoinCol a b a-keys b-keys dist)
-    ;; first group b by keys
-    (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
-    (start-onyx-join num-worker 10 a b dist exception a-keys b-keys nil nil 1)))
+    (let [a-file (io/file (:path a))
+          b-file (io/file (:path b))]
+      (if (<= (.length a-file) (.length b-file))
+        (JoinedDataFrame. a b a-keys b-keys nil nil 1 nil)
+        (JoinedDataFrame. b a b-keys a-keys nil nil 1 nil)))
+    ))
 
 (defn left-join
-  [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}]
+  [a b a-keys b-keys]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -497,15 +519,10 @@
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) 
       (throw (Clojask_TypeException. "Input includes non-existent column name(s).")))
-    (u/init-file dist)
-    ;; print column names
-    (.printJoinCol a b a-keys b-keys dist)
-    ;; first group b by keys
-    (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
-    (start-onyx-join num-worker 10 a b dist exception a-keys b-keys nil nil 2)))
+    (JoinedDataFrame. a b a-keys b-keys nil nil 2 nil)))
 
 (defn right-join
-  [a b a-keys b-keys num-worker dist & {:keys [exception] :or {exception false}}]
+  [a b a-keys b-keys]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -516,15 +533,10 @@
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) 
       (throw (Clojask_TypeException. "Input includes non-existent column name(s).")))
-    (u/init-file dist)
-    ;; print column names
-    (.printJoinCol b a a-keys b-keys dist)
-    ;; first group b by keys
-    (start-onyx-groupby num-worker 10 a "./_clojask/join/b/" a-keys exception)
-    (start-onyx-join num-worker 10 b a dist exception b-keys a-keys nil nil 2)))
+    (JoinedDataFrame. b a b-keys a-keys nil nil 2 nil)))
 
 (defn rolling-join-forward
-  [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [exception limit] :or {exception false limit nil}}]
+  [a b a-keys b-keys a-roll b-roll & {:keys [limit] :or {limit nil}}]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -541,15 +553,11 @@
       (do
         (cond (not (and (not= a-roll nil) (not= b-roll nil)))
           (throw (Clojask_TypeException. "Rolling keys include non-existent column name(s).")))
-        (u/init-file dist)
-        ;; print column names
-        (.printJoinCol a b a-keys b-keys dist)
-        (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
-        (start-onyx-join num-worker 10 a b dist exception a-keys b-keys a-roll b-roll 4 limit)))))
+        (JoinedDataFrame. a b a-keys b-keys a-roll b-roll 4 limit)))))
 
 ;; all of the code is the same as above except for the last line
 (defn rolling-join-backward
-  [a b a-keys b-keys a-roll b-roll num-worker dist & {:keys [exception limit] :or {exception false limit nil}}]
+  [a b a-keys b-keys a-roll b-roll & {:keys [limit] :or {limit nil}}]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
@@ -566,8 +574,31 @@
       (do
         (cond (not (and (not= a-roll nil) (not= b-roll nil)))
               (throw (Clojask_TypeException. "Rolling keys include non-existent column name(s).")))
-        (u/init-file dist)
-        ;; print column names
-        (.printJoinCol a b a-keys b-keys dist)
-        (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
-        (start-onyx-join num-worker 10 a b dist exception a-keys b-keys a-roll b-roll 5 nil)))))
+        (JoinedDataFrame. a b a-keys b-keys a-roll b-roll 5 limit)))))
+
+(defn compute
+  [this num-worker output-dir & {:keys [exception order select exclude] :or {exception false order true select nil exclude nil}}]
+  (assert (or (nil? select) (nil? exclude)) "can only specify either of them")
+  (u/init-file output-dir)
+  ;; check which type of dataframe this is
+  (if (= (type this) clojask.dataframe.DataFrame)
+    (if (= (.getAggreFunc (:row-info this)) [])
+      (let [exclude (if (coll? exclude) exclude [exclude])
+            select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (keys (.getKeyIndex (:col-info this))))) nil))]
+        (.compute this num-worker output-dir exception order select))
+      (if (not= (.getGroupbyKeys (:row-info this)) [])
+        (let [exclude (if (coll? exclude) exclude [exclude])
+              select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (keys (.getKeyIndex (:col-info this))))) nil))]
+          (.computeGroupAggre this num-worker output-dir exception))
+        (.computeAggre this num-worker output-dir exception)))
+    (if (= (type this) clojask.dataframe.JoinedDataFrame)
+      (.compute this num-worker output-dir exception order select exclude)
+      (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe"))
+      )))
+
+(defn get-col-names
+  "Get the names for the columns in sequence"
+  [this]
+  ;; to-do: should implement both for the DataFrame and JoinedDataFrame
+  (.getColNames this)
+  )
diff --git a/src/main/clojure/clojask/onyx_comps.clj b/src/main/clojure/clojask/onyx_comps.clj
index 964ae2a..50aef35 100644
--- a/src/main/clojure/clojask/onyx_comps.clj
+++ b/src/main/clojure/clojask/onyx_comps.clj
@@ -61,17 +61,15 @@
 ;;     (zipmap keys )))
 
 (defn worker-func-gen
-  [df exception]
+  [df exception index]
   (reset! dataframe df)
   (let [operations (.getDesc (:col-info (deref dataframe)))
         types (.getType (:col-info (deref dataframe)))
         filters (.getFilters (:row-info df))
         indices-deleted (.getDeletedCol (:col-info (deref dataframe)))
         indices-wo-del (vec (take (count operations) (iterate inc 0)))
-        indices (if (empty? indices-deleted) 
-                    indices-wo-del ; no columns deleted
-                    (vec (set/difference (set indices-wo-del) (set indices-deleted))) ; minus column indices deleted
-                    )]
+        indices index]
+    ;; (println indices)
     (if exception
       (defn worker-func
         [seg]
@@ -475,11 +473,11 @@
 
 (defn start-onyx
   "start the onyx cluster with the specification inside dataframe"
-  [num-work batch-size dataframe dist exception order]
+  [num-work batch-size dataframe dist exception order index]
   (try
     (workflow-gen num-work)
     (config-env)
-    (worker-func-gen dataframe exception) ;;need some work
+    (worker-func-gen dataframe exception index) ;;need some work
     (catalog-gen num-work batch-size)
     (lifecycle-gen (.path dataframe) dist order)
     (flow-cond-gen num-work)
@@ -510,7 +508,7 @@
   (try
     (workflow-gen num-work)
     (config-env)
-    (worker-func-gen dataframe exception) ;;need some work
+    (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work
     (catalog-aggre-gen num-work batch-size)
     (lifecycle-aggre-gen (.path dataframe) dist)
     (flow-cond-gen num-work)
@@ -542,7 +540,7 @@
   (try
     (workflow-gen num-work)
     (config-env)
-    (worker-func-gen dataframe exception) ;;need some work
+    (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work
     (catalog-groupby-gen num-work batch-size)
     (lifecycle-groupby-gen (.path dataframe) dist groupby-keys (.getKeyIndex (.col-info dataframe)))
     (flow-cond-gen num-work)
@@ -575,7 +573,7 @@
   (try
     (workflow-gen num-work)
     (config-env)
-    (worker-func-gen dataframe exception) ;;need some work
+    (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work
     (catalog-join-gen num-work batch-size)
     (lifecycle-join-gen (.path dataframe) dist dataframe b a-keys b-keys a-roll b-roll join-type)
     (flow-cond-gen num-work)

From 8f57b0e5a05f4eef4cd8025508c7a6bd32a713f3 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Mon, 20 Dec 2021 14:41:16 +0800
Subject: [PATCH 07/33] Implement getColNames for Join DF

---
 src/main/clojure/clojask/dataframe.clj | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 27f6d73..b7b15c1 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -485,7 +485,14 @@
             prefix]
   JDFIntf
   (getColNames
-    [this])
+    [this]
+    (let [a-col-prefix (first prefix)
+          b-col-prefix (last prefix)
+          a-col-set (.getColNames a)
+          b-col-set (.getColNames b)
+          a-col-header (mapv #(str a-col-prefix "_" %) a-col-set)
+          b-col-header (mapv #(str b-col-prefix "_" %) b-col-set)]
+      (conj a-col-header b-col-header)))
 
   (compute
    [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude]

From d18a8ba5cd05d9a0707b9078ec6f9f10157f3d18 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Mon, 20 Dec 2021 20:20:05 +0800
Subject: [PATCH 08/33] Amend test file for new Join API syntax

---
 test/clojask/core_test.clj | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj
index 158146f..c4b1cef 100644
--- a/test/clojask/core_test.clj
+++ b/test/clojask/core_test.clj
@@ -66,30 +66,30 @@
     (testing "Join dataframes APIs"
     (def x (dataframe "test/clojask/Employees-example.csv"))
     (def y (dataframe "test/clojask/Employees-example.csv"))
-    (is (= "success" (left-join x y ["Employee"] ["Employee"] 8 "resources/test.csv" :exception false)))
-    (is (= "success" (right-join x y ["Employee"] ["Employee"] 8 "resources/test.csv" :exception false)))
-    (is (= "success" (inner-join x y ["Employee"] ["Employee"] 8 "resources/test.csv" :exception false)))
-    (is (= "success" (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary" 8 "resources/test.csv" :exception false)))
+    (is (= "success" (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
+    ;; (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
+    ;; (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
+    ;; (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"]) "Salary" "Salary" 8 "resources/test.csv" :exception false)))
     ))
 
 (deftest join-api-output-test
     (testing "Join dataframes APIs"
     (def x (dataframe "test/clojask/Employees-example.csv"))
     (def y (dataframe "test/clojask/Employees-info-example.csv"))
-    (left-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-4.csv" :exception false)
+    (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false)
     (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-4.csv)" "<(sort test/clojask/correct_outputs/1-4.csv)")]
         (is (= "" (:out result))))
-    (right-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-5.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")]
-        (is (= "" (:out result))))
-    (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")]
-        (is (= "" (:out result))))
-    (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate" 8 "test/clojask/test_outputs/1-7.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")]
-        (is (= "" (:out result))))
-    (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate" 8 "test/clojask/test_outputs/1-8.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")]
-        (is (= "" (:out result))))
+    ;; (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false)
+    ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")]
+    ;;     (is (= "" (:out result))))
+    ;; (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false)
+    ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")]
+    ;;     (is (= "" (:out result))))
+    ;; (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false)
+    ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")]
+    ;;     (is (= "" (:out result))))
+    ;; (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false)
+    ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")]
+    ;;     (is (= "" (:out result))))
     ))
 

From d36dc7fd92dd0673a5654a8c765c44711fbbb902 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Mon, 20 Dec 2021 22:14:40 +0800
Subject: [PATCH 09/33] Implement Join DF printCol, fix bug in getColNames

---
 src/main/clojure/clojask/dataframe.clj | 30 ++++++++++++++++++--------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index b7b15c1..ec6f542 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -158,6 +158,7 @@
       (with-open [wrtr (io/writer output-path)]
         (.write wrtr (str (str/join "," (concat groupby-keys aggre-new-keys)) "\n")))))
 
+  ;; !! deprecated
   (printJoinCol
   ;; print column names, called by join APIs
     [this b-df this-keys b-keys output-path col-prefix]
@@ -470,7 +471,9 @@
 
 ;; ============= Below is the definition for the joineddataframe ================
 (definterface JDFIntf
+  (checkOutputPath [output-path] "check if output path is of string type")
   (getColNames [] "get the names of all the columns")
+  (printCol [output-path] "print column names to output file")
   (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude]))
 
 (defrecord JoinedDataFrame
@@ -484,6 +487,7 @@
             limit
             prefix]
   JDFIntf
+
   (getColNames
     [this]
     (let [a-col-prefix (first prefix)
@@ -492,16 +496,24 @@
           b-col-set (.getColNames b)
           a-col-header (mapv #(str a-col-prefix "_" %) a-col-set)
           b-col-header (mapv #(str b-col-prefix "_" %) b-col-set)]
-      (conj a-col-header b-col-header)))
+      (concat a-col-header b-col-header)))
 
+  (printCol
+    ;; print column names, called by compute
+      [this output-path]
+      (let [col-set (.getColNames this)]
+        (with-open [wrtr (io/writer output-path)]
+          (.write wrtr (str (str/join "," col-set) "\n")))))
+        
   (compute
-   [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude]
-   (let []
-     (u/init-file output-dir)
-        ;; print column names
-    ;;  (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames
-     (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
-     (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit))))
+    [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude]
+    (let []
+      (u/init-file output-dir)
+      ;; print column names
+      ;;  (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done
+      (.printCol this output-dir)
+      (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
+      (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit))))
 
 (defn inner-join
   [a b a-keys b-keys & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}]
@@ -620,6 +632,6 @@
 (defn get-col-names
   "Get the names for the columns in sequence"
   [this]
-  ;; to-do: should implement both for the DataFrame and JoinedDataFrame
+  ;; to-do: should implement both for the DataFrame and JoinedDataFrame => Done
   (.getColNames this)
   )

From 69faacb152d6f35aae4e20a11391d3de4c6e54b1 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Mon, 20 Dec 2021 22:50:50 +0800
Subject: [PATCH 10/33] Amend getColNames logic to retrieve columns following
 the order of indices

---
 src/main/clojure/clojask/dataframe.clj | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index ec6f542..39eeca2 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -136,9 +136,11 @@
   (getColNames
     [this]
     (let [index-key (.getIndexKey (:col-info this))
-          index (.getColIndex this)
-          header (mapv index-key index)]
-      header))
+          index (.getColIndex this)]
+          ;header (mapv index-key index)]
+      ;(mapv (fn [i] (get {0 "Employee", 1 "EmployeeName", 2 "Department", 3 "Salary"} i)) [0 2 2 2])
+      (mapv (fn [i] (get index-key i)) index)
+      ))
 
   (printCol
   ;; print column names, called by compute
@@ -263,7 +265,7 @@
       (if (<= num-worker 8)
         (try
           (.final this)
-          (.printCol this output-dir) ;; to-do: based on the index
+          (.printCol this output-dir) ;; to-do: based on the index => Done
           (let [res (start-onyx num-worker batch-size this output-dir exception order index)]
             (if (= res "success")
               "success"

From 88b549bcd042d1aac4e13cfeed3b2d356e6176a2 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Wed, 22 Dec 2021 14:13:36 +0800
Subject: [PATCH 11/33] Update debug code

---
 src/main/clojure/clojask/debug.clj | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/main/clojure/clojask/debug.clj b/src/main/clojure/clojask/debug.clj
index 331d428..c00a19f 100644
--- a/src/main/clojure/clojask/debug.clj
+++ b/src/main/clojure/clojask/debug.clj
@@ -13,15 +13,16 @@
   ;(def x "Hello world")
   ;(-> (clojure.core/format "Expression '%s' not defined." x)(MyOwnException.)(throw))
 
-  (def y (dataframe "resources/Employees.csv" :have-col true))
-  ;(def y (dataframe "resources/Employees-info.csv" :have-col true))
-  ;(time (left-join x y ["Employee"] ["Employee"] 8 "resources/test.csv" :exception false))
+  (def x (dataframe "resources/Employees.csv" :have-col true))
+  (def y (dataframe "resources/Employees-info.csv" :have-col true))
+  (def z (left-join x y ["Employee"] ["Employee"]))
+  (time (compute x 8 "resources/test.csv" :exception true))
+
   ;(time (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "Employee" "Employee" 8 "resources/test.csv" :exception false))
 
-  (select-col y ["Salary" "EmployeeName"])
+  ;(select-col y ["Salary" "EmployeeName"])
   ;(delete-col y ["Salary" "EmployeeName"])
-  (print-df y)
-  (time (compute y 8 "resources/test.csv" :exception true))
+  ;(print-df y)
 
   ;(println (.getKeys (.col-info y)))
   ;(set-type y "Salary" "double")

From 7276719030e04ebd66323dbbcc2d054b0c5acda5 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Wed, 22 Dec 2021 14:51:23 +0800
Subject: [PATCH 12/33] Change getColNames to incorporate aggregated/grouped-by
 dataframes

---
 src/main/clojure/clojask/dataframe.clj | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 39eeca2..ffa8eeb 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -135,12 +135,17 @@
 
   (getColNames
     [this]
-    (let [index-key (.getIndexKey (:col-info this))
-          index (.getColIndex this)]
-          ;header (mapv index-key index)]
-      ;(mapv (fn [i] (get {0 "Employee", 1 "EmployeeName", 2 "Department", 3 "Salary"} i)) [0 2 2 2])
-      (mapv (fn [i] (get index-key i)) index)
-      ))
+    (if (and (= 0 (count (.getGroupbyKeys (:row-info this)))) (= 0 (count (.getAggreNewKeys (:row-info this)))))
+        ;; not aggregate
+        (let [index-key (.getIndexKey (:col-info this))
+              index (.getColIndex this)]
+              ;(mapv (fn [i] (get {0 "Employee", 1 "EmployeeName", 2 "Department", 3 "Salary"} i)) [0 2 2 2])
+              (mapv (fn [i] (get index-key i)) index))
+        ;; if aggregate
+        (let [groupby-key-index (.getGroupbyKeys (:row-info this))
+              groupby-keys (vec (map (.getIndexKey (.col-info this)) (vec (map #(last %) groupby-key-index))))
+              aggre-new-keys (.getAggreNewKeys (:row-info this))]
+              (concat groupby-keys aggre-new-keys))))
 
   (printCol
   ;; print column names, called by compute
@@ -150,6 +155,7 @@
       (with-open [wrtr (io/writer output-path)]
         (.write wrtr (str (str/join "," col-set) "\n")))))
 
+  ;; !! deprecated
   (printAggreCol
   ;; print column names, called by computeAggre
     [this output-path]
@@ -276,7 +282,8 @@
   (computeAggre
    [this ^int num-worker ^String output-dir ^boolean exception]
    (.computeTypeCheck this num-worker output-dir)
-   (.printAggreCol this output-dir) ;; print column names to output-dir
+   ;(.printAggreCol this output-dir) ;; print column names to output-dir
+   (.printCol this output-dir)
    (let [res (start-onyx-aggre-only num-worker batch-size this output-dir exception)]
      (if (= res "success")
        "success"
@@ -288,7 +295,8 @@
     (if (<= num-worker 8)
       (try
         (let [res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" (.getGroupbyKeys (:row-info this)) exception)]
-          (.printAggreCol this output-dir) ;; print column names to output-dir
+          ;(.printAggreCol this output-dir) ;; print column names to output-dir
+          (.printCol this output-dir)
           (if (= res "success")
           ;;  (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception))
             (if

From 46f91d4b445aa5b992be20a59b1a2011c8da42e6 Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Thu, 23 Dec 2021 18:27:57 +0800
Subject: [PATCH 13/33] Support column select on DataFrame

---
 .../clojure/aggregate/aggre_onyx_comps.clj    | 22 +++--
 src/main/clojure/clojask/ColInfo.clj          |  2 +-
 src/main/clojure/clojask/clojask_aggre.clj    | 25 ++++--
 src/main/clojure/clojask/dataframe.clj        | 82 +++++++++++++------
 src/main/clojure/clojask/onyx_comps.clj       | 13 ++-
 src/main/clojure/clojask/utils.clj            |  8 +-
 6 files changed, 102 insertions(+), 50 deletions(-)

diff --git a/src/main/clojure/aggregate/aggre_onyx_comps.clj b/src/main/clojure/aggregate/aggre_onyx_comps.clj
index 4cbfb48..fd6c0c1 100644
--- a/src/main/clojure/aggregate/aggre_onyx_comps.clj
+++ b/src/main/clojure/aggregate/aggre_onyx_comps.clj
@@ -8,7 +8,7 @@
             [onyx.test-helper :refer [with-test-env feedback-exception!]]
             [tech.v3.dataset :as ds]
             [clojure.data.csv :as csv]
-            [clojask.utils :refer [eval-res eval-res-ne filter-check]]
+            [clojask.utils :as u]
             [clojure.set :as set]
             [clojask.groupby :refer [read-csv-seq]])
   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)))
@@ -38,10 +38,11 @@
 
 
 (defn worker-func-gen
-  [df exception]
+  [df exception aggre-funcs index formatter]
   (reset! dataframe df)
-  (let [aggre-funcs (.getAggreFunc (.row-info (deref dataframe)))
-        formatters (.getFormatter (.col-info (deref dataframe)))
+  (let [
+        ;; aggre-funcs (.getAggreFunc (.row-info (deref dataframe)))
+        formatters formatter
         ;; key-index (.getKeyIndex (.col-info (deref dataframe)))
         ;; formatters (set/rename-keys formatters key-index)
         ]
@@ -52,7 +53,10 @@
       (let [data (read-csv-seq (:file seq))
             pre (:d seq)
             data-map (-> (iterate inc 0)
-                         (zipmap (apply map vector data)))]
+                         (zipmap (apply map vector data)))
+            reorder (fn [a b]
+                      ;; (println [a b])
+                      (u/gets (concat a b) index))]
         ;; (mapv (fn [_]
         ;;        (let [func (first _)
         ;;              index (nth _ 1)]
@@ -62,7 +66,9 @@
                res []]
           (if (= aggre-funcs [])
             ;; {:d (vec (concat pre res))}
-            {:d (mapv concat (repeat pre) (apply map vector res))}
+            (if (= res [])
+              {:d (u/gets [pre] index)}
+              {:d (mapv reorder (repeat pre) (apply map vector res))})
             (let [func (first (first aggre-funcs))
                   index (nth (first aggre-funcs) 1)
                   res-funcs (rest aggre-funcs)
@@ -252,11 +258,11 @@
 
 (defn start-onyx-aggre
   "start the onyx cluster with the specification inside dataframe"
-  [num-work batch-size dataframe dist exception]
+  [num-work batch-size dataframe dist exception aggre-func index formatter]
   (try
     (workflow-gen num-work)
     (config-env)
-    (worker-func-gen dataframe exception) ;;need some work
+    (worker-func-gen dataframe exception aggre-func index formatter) ;;need some work
     (catalog-gen num-work batch-size)
     (lifecycle-gen "./_clojask/grouped" dist)
     (flow-cond-gen num-work)
diff --git a/src/main/clojure/clojask/ColInfo.clj b/src/main/clojure/clojask/ColInfo.clj
index 8015ad0..0be2d10 100644
--- a/src/main/clojure/clojask/ColInfo.clj
+++ b/src/main/clojure/clojask/ColInfo.clj
@@ -107,7 +107,7 @@
 
   (getKeys
     [this]
-    col-keys)
+    (mapv (fn [index] (get index-key index)) (take (count index-key) (iterate inc 0))))
 
   (getKeyIndex
    [this]
diff --git a/src/main/clojure/clojask/clojask_aggre.clj b/src/main/clojure/clojask/clojask_aggre.clj
index d4f28ed..456cc58 100644
--- a/src/main/clojure/clojask/clojask_aggre.clj
+++ b/src/main/clojure/clojask/clojask_aggre.clj
@@ -4,14 +4,19 @@
             [clojure.java.io :as io]
             [taoensso.timbre :refer [debug info] :as timbre]
             [clojure.string :as string]
-            [clojask.api.aggregate :refer [start]])
+            [clojask.api.aggregate :refer [start]]
+            [clojask.utils :as u])
   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)))
 
 (def df (atom nil))
+(def aggre-func (atom nil))
+(def select (atom nil))
 
 (defn inject-dataframe
-  [dataframe]
+  [dataframe a b]
   (reset! df dataframe)
+  (reset! aggre-func a)
+  (reset! select b)
   )
 
 (defn c-count
@@ -39,7 +44,9 @@
    :lifecycle/after-task-stop close-writer})
 
 (defrecord ClojaskOutput
-           [memo]
+           [memo
+            aggre-func
+            select]
   p/Plugin
   (start [this event]
     ;; Initialize the plugin, generally by assoc'ing any initial state.
@@ -52,7 +59,7 @@
         (let [data (mapv (fn [_] (if (coll? _) _ [_])) (deref memo))]
           ;; (.write (:clojask/wtr event) (str data "\n"))
           (if (apply = (map count data))
-            (mapv #(.write (:clojask/wtr event) (str (string/join "," %) "\n")) (apply map vector data))
+            (mapv #(.write (:clojask/wtr event) (str (string/join "," (u/gets % select)) "\n")) (apply map vector data))
             (throw (Exception. "aggregation result is not of the same length"))))
         this)
 
@@ -86,7 +93,7 @@
     ;; before write-batch is called repeatedly.
     true)
 
-  (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/aggre-func]} replica messenger]
+  (write-batch [this {:keys [onyx.core/write-batch clojask/wtr]} replica messenger]
               ;;  keys [:Departement]
     ;; Write the batch to your datasink.
     ;; In this case we are conjoining elements onto a collection.
@@ -111,6 +118,8 @@
 ;; from your task-map here, in order to improve the performance of your plugin
 ;; Extending the function below is likely good for most use cases.
 (defn output [pipeline-data]
-  (let [aggre-func (.getAggreFunc (:row-info (deref df)))]
-   (->ClojaskOutput (volatile! (doall (take (count aggre-func)
-                                     (repeat start)))))))
\ No newline at end of file
+  (let []
+   (->ClojaskOutput (volatile! (doall (take (count (deref aggre-func))
+                                     (repeat start))))
+                    (deref aggre-func)
+                    (deref select))))
\ No newline at end of file
diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index ffa8eeb..88abdd5 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -41,8 +41,8 @@
   (head [n] "return first n lines in dataframe")
   (filter [cols predicate])
   (computeTypeCheck [num-worker output-dir])
-  (computeGroupAggre [^int num-worker ^String output-dir ^boolean exception])
-  (computeAggre [^int num-worker ^String output-dir ^boolean exception])
+  (computeGroupAggre [^int num-worker ^String output-dir ^boolean exception select])
+  (computeAggre [^int num-worker ^String output-dir ^boolean exception select])
   (sort [a b] "sort the dataframe based on columns")
   (addFormatter [a b] "format the column as the last step of the computation")
   (preview [sample-size output-size format] "quickly return a vector of maps about the resultant dataframe")
@@ -280,31 +280,60 @@
         (throw (Clojask_OperationException. "Max number of worker nodes is 8.")))))
   
   (computeAggre
-   [this ^int num-worker ^String output-dir ^boolean exception]
+   [this ^int num-worker ^String output-dir ^boolean exception select]
    (.computeTypeCheck this num-worker output-dir)
    ;(.printAggreCol this output-dir) ;; print column names to output-dir
    (.printCol this output-dir)
-   (let [res (start-onyx-aggre-only num-worker batch-size this output-dir exception)]
+   (let [aggre-keys (.getAggreFunc row-info)
+         select (if (coll? select) select [select])
+         select (if (= select [nil])
+                  (vec (take (count aggre-keys) (iterate inc 0)))
+                  (mapv (fn [key] (.indexOf (.getColNames this) key)) select))
+         aggre-func (u/gets aggre-keys (vec (apply sorted-set select)))
+         select (mapv (fn [num] (count (remove #(>= % num) select))) select)
+         index (vec (apply sorted-set (mapv #(nth % 1) aggre-func)))
+         shift-func (fn [pair]
+                      [(first pair) (let [num (nth pair 1)]
+                                      (.indexOf index num))])
+         aggre-func (mapv shift-func aggre-func)
+        ;;  test (println [select index aggre-func])
+         res (start-onyx-aggre-only num-worker batch-size this output-dir exception aggre-func index select)]
      (if (= res "success")
        "success"
        "failed")))
   
   (computeGroupAggre
-    [this ^int num-worker ^String output-dir ^boolean exception]
+    [this ^int num-worker ^String output-dir ^boolean exception select]
     (.computeTypeCheck this num-worker output-dir)
     (if (<= num-worker 8)
       (try
-        (let [res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" (.getGroupbyKeys (:row-info this)) exception)]
+        (let [groupby-keys (.getGroupbyKeys row-info)
+              aggre-keys (.getAggreFunc row-info)
+              select (if (coll? select) select [select])
+              select (if (= select [nil])
+                       (vec (take (+ (count groupby-keys) (count aggre-keys)) (iterate inc 0)))
+                       (mapv (fn [key] (.indexOf (.getColNames this) key)) select))
+              ;; pre-index (remove #(>= % (count groupby-keys)) select)
+              data-index (mapv #(- % (count groupby-keys)) (remove #(< % (count groupby-keys)) select))
+              groupby-index (vec (apply sorted-set (mapv #(nth % 1) (concat groupby-keys (u/gets aggre-keys data-index)))))
+              ;; test (println [groupby-keys aggre-keys select pre-index data-index])
+              res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)]
           ;(.printAggreCol this output-dir) ;; print column names to output-dir
           (.printCol this output-dir)
           (if (= res "success")
           ;;  (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception))
-            (if
+            (let [shift-func (fn [pair]
+                               [(first pair) (let [index (nth pair 1)]
+                                               (.indexOf groupby-index index))])
+                  aggre-func (mapv shift-func (u/gets aggre-keys data-index))
+                  formatter (.getFormatter (.col-info this))
+                  formatter (set/rename-keys formatter (zipmap groupby-index (iterate inc 0)))]
+             (if
             ;;  (internal-aggregate (.getAggreFunc (:row-info this)) output-dir (.getKeyIndex col-info) (.getGroupbyKeys (:row-info this)) (.getAggreOldKeys (:row-info this)) (.getAggreNewKeys (:row-info this)))
-             (start-onyx-aggre num-worker batch-size this output-dir exception)
+             (start-onyx-aggre num-worker batch-size this output-dir exception aggre-func select formatter)
               "success"
-              (throw (Clojask_OperationException. "Error in running start-onyx-aggre.")))
-            (throw (Clojask_OperationException. "Error in running start-onyx-groupby."))))
+              (throw (Clojask_OperationException. "Error when aggregating."))))
+            (throw (Clojask_OperationException. "Error when grouping by."))))
         (catch Exception e e))
       (throw (Clojask_OperationException. "Max number of worker nodes is 8."))))
 
@@ -516,13 +545,17 @@
           (.write wrtr (str (str/join "," col-set) "\n")))))
         
   (compute
-    [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude]
-    (let []
+    [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select]
+    (let [select (if (coll? select) select [select])
+          select (if (= select [nil])
+                   (vec (take (+ (count (.getKeyIndex (.col-info a))) (count (.getKeyIndex (.col-info b)))) (iterate inc 0)))
+                   (mapv (fn [key] (.indexOf (.getColNames this) key)) select))
+          ]
       (u/init-file output-dir)
       ;; print column names
       ;;  (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done
       (.printCol this output-dir)
-      (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception)
+      (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception []) ;; todo
       (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit))))
 
 (defn inner-join
@@ -541,7 +574,7 @@
           b-file (io/file (:path b))]
       (if (<= (.length a-file) (.length b-file))
         (JoinedDataFrame. a b a-keys b-keys nil nil 1 nil col-prefix)
-        (JoinedDataFrame. b a b-keys a-keys nil nil 1 nil [(nth col-prefix 1) (nth col-prefix 2)])))
+        (JoinedDataFrame. b a b-keys a-keys nil nil 1 nil [(nth col-prefix 1) (nth col-prefix 0)])))
     ))
 
 
@@ -573,7 +606,7 @@
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) 
       (throw (Clojask_TypeException. "Input includes non-existent column name(s).")))
-    (JoinedDataFrame. b a b-keys a-keys nil nil 2 nil [(nth col-prefix 1) (nth col-prefix 2)])))
+    (JoinedDataFrame. b a b-keys a-keys nil nil 2 nil [(nth col-prefix 1) (nth col-prefix 0)])))
 
 
 (defn rolling-join-forward
@@ -625,17 +658,16 @@
   (u/init-file output-dir)
   ;; check which type of dataframe this is
   (if (= (type this) clojask.dataframe.DataFrame)
-    (if (= (.getAggreFunc (:row-info this)) [])
-      (let [exclude (if (coll? exclude) exclude [exclude])
-            select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (keys (.getKeyIndex (:col-info this))))) nil))]
-        (.compute this num-worker output-dir exception order select))
-      (if (not= (.getGroupbyKeys (:row-info this)) [])
-        (let [exclude (if (coll? exclude) exclude [exclude])
-              select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (keys (.getKeyIndex (:col-info this))))) nil))]
-          (.computeGroupAggre this num-worker output-dir exception))
-        (.computeAggre this num-worker output-dir exception)))
+    (let [exclude (if (coll? exclude) exclude [exclude])
+          select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))]
+      (assert (not= select []) "must select at least on column")
+      (if (= (.getAggreFunc (:row-info this)) [])
+        (.compute this num-worker output-dir exception order select)
+        (if (not= (.getGroupbyKeys (:row-info this)) [])
+          (.computeGroupAggre this num-worker output-dir exception select)
+          (.computeAggre this num-worker output-dir exception select))))
     (if (= (type this) clojask.dataframe.JoinedDataFrame)
-      (.compute this num-worker output-dir exception order select exclude)
+      (.compute this num-worker output-dir exception order select)
       (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe"))
       )))
 
diff --git a/src/main/clojure/clojask/onyx_comps.clj b/src/main/clojure/clojask/onyx_comps.clj
index 50aef35..dd5236c 100644
--- a/src/main/clojure/clojask/onyx_comps.clj
+++ b/src/main/clojure/clojask/onyx_comps.clj
@@ -66,8 +66,6 @@
   (let [operations (.getDesc (:col-info (deref dataframe)))
         types (.getType (:col-info (deref dataframe)))
         filters (.getFilters (:row-info df))
-        indices-deleted (.getDeletedCol (:col-info (deref dataframe)))
-        indices-wo-del (vec (take (count operations) (iterate inc 0)))
         indices index]
     ;; (println indices)
     (if exception
@@ -504,16 +502,16 @@
 
 (defn start-onyx-aggre-only
   "start the onyx cluster with the specification inside dataframe"
-  [num-work batch-size dataframe dist exception]
+  [num-work batch-size dataframe dist exception aggre-func index select]
   (try
     (workflow-gen num-work)
     (config-env)
-    (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work
+    (worker-func-gen dataframe exception index) ;;need some work
     (catalog-aggre-gen num-work batch-size)
     (lifecycle-aggre-gen (.path dataframe) dist)
     (flow-cond-gen num-work)
     (input/inject-dataframe dataframe)
-    (aggre/inject-dataframe dataframe)
+    (aggre/inject-dataframe dataframe aggre-func select)
     (catch Exception e (throw (Exception. (str "[preparing stage] " (.getMessage e))))))
   (try
     (let [submission (onyx.api/submit-job peer-config
@@ -536,11 +534,12 @@
 
 (defn start-onyx-groupby
   "start the onyx cluster with the specification inside dataframe"
-  [num-work batch-size dataframe dist groupby-keys exception]
+  [num-work batch-size dataframe dist groupby-keys groupby-index exception]
+  ;; (println groupby-index)
   (try
     (workflow-gen num-work)
     (config-env)
-    (worker-func-gen dataframe exception (take (count (.getKeyIndex (:col-info dataframe))) (iterate inc 0))) ;;need some work
+    (worker-func-gen dataframe exception groupby-index) ;;need some work
     (catalog-groupby-gen num-work batch-size)
     (lifecycle-groupby-gen (.path dataframe) dist groupby-keys (.getKeyIndex (.col-info dataframe)))
     (flow-cond-gen num-work)
diff --git a/src/main/clojure/clojask/utils.clj b/src/main/clojure/clojask/utils.clj
index e6d5cb1..cd3852b 100644
--- a/src/main/clojure/clojask/utils.clj
+++ b/src/main/clojure/clojask/utils.clj
@@ -11,6 +11,12 @@
            (java.time.format DateTimeFormatter)))
 "Utility function used in dataframe"
 
+(defn gets
+  "unlike core/get, get elements from indices"
+  [coll indices]
+  (mapv #(nth coll %) indices)
+  )
+
 (defn get-key
   [row types key-index key]
   (let [index (get key-index key)]
@@ -277,4 +283,4 @@
       (if (string? input)
         [[nil input]]
         nil))
-    (catch Exception e nil)))
\ No newline at end of file
+    (catch Exception e nil)))

From 967918b46351ab767b86e54ddb4f6587c849f5f0 Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Fri, 24 Dec 2021 00:25:22 +0800
Subject: [PATCH 14/33] select for aggregate, join

---
 .../clojure/aggregate/aggre_onyx_comps.clj    |   2 +-
 src/main/clojure/clojask/clojask_groupby.clj  |  12 +-
 src/main/clojure/clojask/clojask_join.clj     |  71 ++-------
 src/main/clojure/clojask/dataframe.clj        |  43 +++---
 src/main/clojure/clojask/debug.clj            |   2 +-
 src/main/clojure/clojask/groupby.clj          |   7 +-
 src/main/clojure/clojask/join.clj             | 137 ++----------------
 src/main/clojure/clojask/onyx_comps.clj       |   8 +-
 src/main/clojure/clojask/utils.clj            | 129 +++++++++++------
 9 files changed, 154 insertions(+), 257 deletions(-)

diff --git a/src/main/clojure/aggregate/aggre_onyx_comps.clj b/src/main/clojure/aggregate/aggre_onyx_comps.clj
index fd6c0c1..5dd3521 100644
--- a/src/main/clojure/aggregate/aggre_onyx_comps.clj
+++ b/src/main/clojure/aggregate/aggre_onyx_comps.clj
@@ -67,7 +67,7 @@
           (if (= aggre-funcs [])
             ;; {:d (vec (concat pre res))}
             (if (= res [])
-              {:d (u/gets [pre] index)}
+              {:d [(u/gets pre index)]}
               {:d (mapv reorder (repeat pre) (apply map vector res))})
             (let [func (first (first aggre-funcs))
                   index (nth (first aggre-funcs) 1)
diff --git a/src/main/clojure/clojask/clojask_groupby.clj b/src/main/clojure/clojask/clojask_groupby.clj
index 19739c2..3405036 100644
--- a/src/main/clojure/clojask/clojask_groupby.clj
+++ b/src/main/clojure/clojask/clojask_groupby.clj
@@ -8,11 +8,13 @@
 
 (def dataframe (atom nil))
 (def groupby-keys (atom nil))
+(def write-index (atom nil))
 
 (defn inject-dataframe
-  [df groupby-key]
+  [df groupby-key index]
   (reset! dataframe df)
-  (reset! groupby-keys groupby-key))
+  (reset! groupby-keys groupby-key)
+  (reset! write-index index))
 
 (defn- inject-into-eventmap
   [event lifecycle]
@@ -35,7 +37,7 @@
 (def writer-aggre-calls
   {:lifecycle/before-task-start inject-into-eventmap})
 
-(defrecord ClojaskGroupby []
+(defrecord ClojaskGroupby [write-index]
   p/Plugin
   (start [this event]
     ;; Initialize the plugin, generally by assoc'ing any initial state.
@@ -90,7 +92,7 @@
                 ;(.write wtr (str msg "\n"))
                 ;; !! define argument (debug)
             ;;   (def groupby-keys [:Department :EmployeeName])
-              (output-groupby dist (:d msg) groupby-keys key-index formatter)))
+              (output-groupby dist (:d msg) groupby-keys key-index formatter write-index)))
 
           (recur (rest batch)))))
     true))
@@ -101,4 +103,4 @@
 ;; from your task-map here, in order to improve the performance of your plugin
 ;; Extending the function below is likely good for most use cases.
 (defn groupby [pipeline-data]
-  (->ClojaskGroupby))
\ No newline at end of file
+  (->ClojaskGroupby (deref write-index)))
\ No newline at end of file
diff --git a/src/main/clojure/clojask/clojask_join.clj b/src/main/clojure/clojask/clojask_join.clj
index 3b2e093..fa9d9c1 100644
--- a/src/main/clojure/clojask/clojask_join.clj
+++ b/src/main/clojure/clojask/clojask_join.clj
@@ -11,13 +11,21 @@
 (def b (atom nil))
 (def a-keys (atom nil))
 (def b-keys (atom nil))
+(def a-index (atom nil))
+(def b-index (atom nil))
+(def b-format (atom nil))
+(def join-index (atom nil))
 
 (defn inject-dataframe
-  [d-a d-b a-key b-key]
+  [d-a d-b a-key b-key -a-index -b-index -join-index -b-format]
   (reset! a d-a)
   (reset! b d-b)
   (reset! a-keys a-key)
-  (reset! b-keys b-key))
+  (reset! b-keys b-key)
+  (reset! a-index -a-index)
+  (reset! b-index -b-index)
+  (reset! b-format -b-format)
+  (reset! join-index -join-index))
 
 (defn- inject-into-eventmap
   [event lifecycle]
@@ -38,8 +46,6 @@
      :clojask/b-map (:clojask/b-map lifecycle)
      :clojask/a-format a-format
      :clojask/b-format b-format
-     :clojask/a-index (take (count (:clojask/a-map lifecycle)) (iterate inc 0))
-     :clojask/b-index (take (count (:clojask/b-map lifecycle)) (iterate inc 0))
      :clojask/join-type (:clojask/join-type lifecycle)}))
 
 (defn- close-writer [event lifecycle]
@@ -52,7 +58,7 @@
   {:lifecycle/before-task-start inject-into-eventmap
   :lifecycle/after-task-stop close-writer})
 
-(defrecord ClojaskJoin []
+(defrecord ClojaskJoin [a-index b-index join-index]
   p/Plugin
   (start [this event]
     ;; Initialize the plugin, generally by assoc'ing any initial state.
@@ -94,7 +100,7 @@
     ;; before write-batch is called repeatedly.
     true)
 
-  (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/a-keys clojask/b-keys clojask/a-roll clojask/b-roll  clojask/a-map clojask/b-map clojask/a-format clojask/b-format clojask/a-index clojask/b-index clojask/join-type]} replica messenger]
+  (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/a-keys clojask/b-keys clojask/a-roll clojask/b-roll  clojask/a-map clojask/b-map clojask/a-format]} replica messenger]
               ;;  keys [:Departement]
     ;; Write the batch to your datasink.
     ;; In this case we are conjoining elements onto a collection.
@@ -107,58 +113,9 @@
                 ;(.write wtr (str msg "\n"))
                 ;; !! define argument (debug)
             ;;   (def groupby-keys [:Department :EmployeeName])
-              (join/output-join wtr (:d msg) a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index)))
+              (join/output-join wtr (:d msg) a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index join-index)))
 
           (recur (rest batch)))))
-    ;; (case join-type
-    ;;   1 (loop [batch write-batch]
-    ;;             (if-let [msg (first batch)]
-    ;;               (do
-    ;;       ;; (swap! example-datasink conj msg)
-    ;;                 (if (not= (:d msg) nil)
-    ;;                   (do
-    ;;             ;(.write wtr (str msg "\n"))
-    ;;             ;; !! define argument (debug)
-    ;;         ;;   (def groupby-keys [:Department :EmployeeName])
-    ;;                     (join/output-join wtr (:d msg) a-keys a-map b-keys a-format b-format a-index b-index)))
-
-    ;;                 (recur (rest batch)))))
-    ;;   2 (loop [batch write-batch]
-    ;;            (if-let [msg (first batch)]
-    ;;              (do
-    ;;       ;; (swap! example-datasink conj msg)
-    ;;                (if (not= (:d msg) nil)
-    ;;                  (do
-    ;;             ;(.write wtr (str msg "\n"))
-    ;;             ;; !! define argument (debug)
-    ;;         ;;   (def groupby-keys [:Department :EmployeeName])
-    ;;                    (join/output-join-loo wtr (:d msg) a-keys a-map b-keys (count b-map) a-format b-format a-index b-index)))
-
-    ;;                (recur (rest batch)))))
-    ;;   4 (loop [batch write-batch]
-    ;;               (if-let [msg (first batch)]
-    ;;                 (do
-    ;;       ;; (swap! example-datasink conj msg)
-    ;;                   (if (not= (:d msg) nil)
-    ;;                     (do
-    ;;             ;(.write wtr (str msg "\n"))
-    ;;             ;; !! define argument (debug)
-    ;;         ;;   (def groupby-keys [:Department :EmployeeName])
-    ;;                       (join/output-join-forward wtr (:d msg) a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index)))
-    ;;                   (recur (rest batch)))))
-      
-    ;;   5 (loop [batch write-batch]
-    ;;       (if-let [msg (first batch)]
-    ;;         (do
-    ;;       ;; (swap! example-datasink conj msg)
-    ;;           (if (not= (:d msg) nil)
-    ;;             (do
-    ;;             ;(.write wtr (str msg "\n"))
-    ;;             ;; !! define argument (debug)
-    ;;         ;;   (def groupby-keys [:Department :EmployeeName])
-    ;;               (join/output-join-backward wtr (:d msg) a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index)))
-    ;;           (recur (rest batch))))))
-    ;; (.close wtr)
     true))
 
 ;; Builder function for your output plugin.
@@ -167,4 +124,4 @@
 ;; from your task-map here, in order to improve the performance of your plugin
 ;; Extending the function below is likely good for most use cases.
 (defn join [pipeline-data]
-  (->ClojaskJoin))
\ No newline at end of file
+  (->ClojaskJoin (deref a-index) (deref b-index) (deref join-index)))
\ No newline at end of file
diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 88abdd5..1b70b47 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -283,7 +283,6 @@
    [this ^int num-worker ^String output-dir ^boolean exception select]
    (.computeTypeCheck this num-worker output-dir)
    ;(.printAggreCol this output-dir) ;; print column names to output-dir
-   (.printCol this output-dir)
    (let [aggre-keys (.getAggreFunc row-info)
          select (if (coll? select) select [select])
          select (if (= select [nil])
@@ -297,6 +296,7 @@
                                       (.indexOf index num))])
          aggre-func (mapv shift-func aggre-func)
         ;;  test (println [select index aggre-func])
+         tmp (.printCol this output-dir) ;; todo: based on "select"
          res (start-onyx-aggre-only num-worker batch-size this output-dir exception aggre-func index select)]
      (if (= res "success")
        "success"
@@ -319,7 +319,7 @@
               ;; test (println [groupby-keys aggre-keys select pre-index data-index])
               res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)]
           ;(.printAggreCol this output-dir) ;; print column names to output-dir
-          (.printCol this output-dir)
+          (.printCol this output-dir) ;; todo: based on "select"
           (if (= res "success")
           ;;  (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception))
             (let [shift-func (fn [pair]
@@ -513,7 +513,7 @@
   (checkOutputPath [output-path] "check if output path is of string type")
   (getColNames [] "get the names of all the columns")
   (printCol [output-path] "print column names to output file")
-  (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select exclude]))
+  (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select]))
 
 (defrecord JoinedDataFrame
            [^DataFrame a
@@ -549,14 +549,24 @@
     (let [select (if (coll? select) select [select])
           select (if (= select [nil])
                    (vec (take (+ (count (.getKeyIndex (.col-info a))) (count (.getKeyIndex (.col-info b)))) (iterate inc 0)))
-                   (mapv (fn [key] (.indexOf (.getColNames this) key)) select))
+                   (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) 
+          a-index (vec (apply sorted-set (remove (fn [num] (>= num (count (.getKeyIndex (.col-info a))))) select)))
+          ;; a-write 
+          b-index (mapv #(- % (count (.getKeyIndex (.col-info a)))) (apply sorted-set (remove (fn [num] (< num (count (.getKeyIndex (.col-info a))))) select)))
+          b-index (if b-roll (vec (apply sorted-set (conj b-index b-roll))) b-index)
+          b-roll (if b-roll (count (remove #(>= % b-roll) b-index)) nil)
+          ;; b-write
+          ;; a-format
+          b-format (set/rename-keys (.getFormatter (.col-info b)) (zipmap b-index (iterate inc 0)))
+          write-index (mapv (fn [num] (count (remove #(>= % num) (concat a-index (mapv #(+ % (count (.getKeyIndex (.col-info a)))) b-index))))) select)
+          ;; test (println a-index b-index b-format write-index b-roll)
           ]
       (u/init-file output-dir)
       ;; print column names
       ;;  (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done
-      (.printCol this output-dir)
-      (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys exception []) ;; todo
-      (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit))))
+      (.printCol this output-dir) ;; todo: based on "select"
+      (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys b-index exception) ;; todo
+      (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index))))
 
 (defn inner-join
   [a b a-keys b-keys & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}]
@@ -657,20 +667,19 @@
   (assert (or (nil? select) (nil? exclude)) "can only specify either of them")
   (u/init-file output-dir)
   ;; check which type of dataframe this is
-  (if (= (type this) clojask.dataframe.DataFrame)
-    (let [exclude (if (coll? exclude) exclude [exclude])
-          select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))]
-      (assert (not= select []) "must select at least on column")
+  (let [exclude (if (coll? exclude) exclude [exclude])
+        select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))]
+    (assert (not= select []) "must select at least 1 column")
+    (if (= (type this) clojask.dataframe.DataFrame)
       (if (= (.getAggreFunc (:row-info this)) [])
         (.compute this num-worker output-dir exception order select)
         (if (not= (.getGroupbyKeys (:row-info this)) [])
           (.computeGroupAggre this num-worker output-dir exception select)
-          (.computeAggre this num-worker output-dir exception select))))
-    (if (= (type this) clojask.dataframe.JoinedDataFrame)
-      (.compute this num-worker output-dir exception order select)
-      (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe"))
-      )))
-
+          (.computeAggre this num-worker output-dir exception select)))
+      (if (= (type this) clojask.dataframe.JoinedDataFrame)
+        (.compute this num-worker output-dir exception order select)
+        (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe")))))
+)
 (defn get-col-names
   "Get the names for the columns in sequence"
   [this]
diff --git a/src/main/clojure/clojask/debug.clj b/src/main/clojure/clojask/debug.clj
index c00a19f..5f54231 100644
--- a/src/main/clojure/clojask/debug.clj
+++ b/src/main/clojure/clojask/debug.clj
@@ -1,6 +1,6 @@
 (ns clojask.debug
   (:require [clojask.dataframe :refer :all]
-            ;; [clojask.utils :refer :all]
+            [clojask.utils :as u]
             [clojask.groupby :refer :all]
             [clojask.sort :as sort]
             [clojask.api.aggregate :as aggre]
diff --git a/src/main/clojure/clojask/groupby.clj b/src/main/clojure/clojask/groupby.clj
index 4903178..8d4f9ba 100644
--- a/src/main/clojure/clojask/groupby.clj
+++ b/src/main/clojure/clojask/groupby.clj
@@ -1,7 +1,8 @@
 (ns clojask.groupby
   (:require [clojure.java.io :as io]
             [clojure-csv.core :as csv]
-            [clojure.core.async :as async]))
+            [clojure.core.async :as async]
+            [clojask.utils :as u]))
 "contains the utility functions to group by and aggregate"
 
 (defn compute-groupby
@@ -48,7 +49,7 @@
 
 (defn output-groupby
   "internal function called by output when aggregation is applied"
-  [dist msg groupby-keys key-index formatter]
+  [dist msg groupby-keys key-index formatter write-index]
   ;; msg this time is a vector
 
   ;; key-index contains the one to one correspondence of key value to index value, it is a map
@@ -57,7 +58,7 @@
   (let [output-filename (gen-groupby-filenames dist msg groupby-keys key-index formatter) ;; generate output filename
         groupby-wrtr (io/writer output-filename :append true)]
     ;; write as maps e.g. {:name "Tim", :salary 62, :tax 0.1, :bonus 12}
-    (.write groupby-wrtr (str msg "\n"))
+    (.write groupby-wrtr (str (u/gets msg write-index) "\n"))
 
     ;; write as csv format e.g. Tim,62,0.1,12
     ;(.write groupby-wrtr (str (clojure.string/join "," (map msg (keys msg))) "\n"))
diff --git a/src/main/clojure/clojask/join.clj b/src/main/clojure/clojask/join.clj
index 68b7125..cd25a9c 100644
--- a/src/main/clojure/clojask/join.clj
+++ b/src/main/clojure/clojask/join.clj
@@ -4,17 +4,10 @@
             [clojure.core.async :as async]
             ;; [clojask.onyx-comps :refer [start-onyx-groupby start-onyx-join]]
             [clojask.groupby :refer [read-csv-seq gen-groupby-filenames]]
-            [clojure.string :as str]))
+            [clojure.string :as str]
+            [clojask.utils :as u]))
 
 
-(defn- group-inner-join
-  [a b c]
-  ;; a is readers to the file
-  ;; b is the filename
-  (with-open [wtr (io/writer c :append true)]
-    (doseq [a-row (read-csv-seq a)]
-      (doseq [b-row (read-csv-seq b)]
-        (.write wtr (str (vec (concat a-row b-row)) "\n"))))))
 
 (defn gen-join-filenames
   [dist a-row a-keys]
@@ -26,7 +19,7 @@
     (str dist a-val)))
 
 (defn output-join-inner
-  [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index]
+  [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index]
   (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)]
     ;; (println writer)
     ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true)
@@ -46,12 +39,13 @@
                         (if-let [format (get b-format index)]
                           (format (nth b-row index))
                           (nth b-row index)))]
-            (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n"))))
+            ;; (println [(vec a-row) (vec b-row) a-index b-index join-index])
+            (.write writer (str (str/join "," (vec (u/gets (concat a-row b-row) join-index))) "\n"))))
         (.close filename)))))
 
 (defn output-join-loo
   "used for left join right join or outter join"
-  [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index]
+  [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index]
   (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)]
     ;; (println writer)
     ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true)
@@ -68,112 +62,13 @@
                         (if-let [format (get b-format index)]
                           (format (nth b-row index))
                           (nth b-row index)))]
-            (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n"))))
+            (.write writer (str (str/join "," (vec (u/gets (concat a-row b-row) join-index))) "\n"))))
         (.close filename))
       (let [a-row (for [index a-index]
                     (if-let [format (get a-format index)]
                       (format (nth a-row index))
                       (nth a-row index)))]
-        (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))))
-
-;; (defn roll-join-get-line-forward
-;;   "get the max of all the smaller"
-;;   [bench filename index]
-;;   (def memo (volatile! nil))
-;;   (def res (volatile! nil))
-;;   (doseq [row (read-csv-seq filename)]
-;;     (let [val (nth row index)]
-;;       ;; (println [bench filename index row val])
-;;       ;;        | does here need to be =?
-;;       (if (and (<= (compare val bench) 0) (or (= @memo nil) (> (compare val @memo) 0)))
-;;         (do (vreset! memo val)
-;;             (vreset! res row)))))
-;;   @res)
-
-;; (defn roll-join-get-line-backward
-;;   "get the min of all the greater"
-;;   [bench filename index]
-;;   (def memo (volatile! nil))
-;;   (def res (volatile! nil))
-;;   (doseq [row (read-csv-seq filename)]
-;;     (let [val (nth row index)]
-;;       ;;        | does here need to be =?
-;;       (if (and (>= (compare val bench) 0) (or (= @memo nil) (< (compare val @memo) 0)))
-;;         (do (vreset! memo val)
-;;             (vreset! res row)))))
-;;   @res)
-
-;; (doseq [file (rest (file-seq (clojure.java.io/file "./_clojask/grouped/")))]
-;;   (io/delete-file file))
-
-;; (defn internal-rolling-join-forward
-;;   [a b a-dir b-dir a-keys b-keys a-roll b-roll]
-;;   ;; (let [a-reader (io/reader (:path a))]
-;;   ;;   ())
-;;   (s))
-
-
-;; (defn output-join-forward
-;;   "[writer a-row a-keys a-map b-keys count a-format b-format a-index b-index] "
-;;   [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index]
-;;   (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)]
-;;     ;; (println writer)
-;;     ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true)
-;;     (if (.exists (io/file filename))
-;;       ;; (spit "_clojask/join/test.txt" (str (vec (read-csv-seq filename)) "\n") :append true)
-;;       (let [filename (io/reader filename)]
-;;         (if-let [b-row (roll-join-get-line-forward (nth a-row a-roll) filename b-roll)] ;; bench is a string
-;;           (let [a-row (for [index a-index]
-;;                         (if-let [format (get a-format index)]
-;;                           (format (nth a-row index))
-;;                           (nth a-row index)))
-;;                 b-row (for [index b-index]
-;;                         (if-let [format (get b-format index)]
-;;                           (format (nth b-row index))
-;;                           (nth b-row index)))]
-;;             (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n")))
-;;           (let [a-row (for [index a-index]
-;;                         (if-let [format (get a-format index)]
-;;                           (format (nth a-row index))
-;;                           (nth a-row index)))]
-;;             (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))
-;;         (.close filename))
-;;       (let [a-row (for [index a-index]
-;;                     (if-let [format (get a-format index)]
-;;                       (format (nth a-row index))
-;;                       (nth a-row index)))]
-;;         (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))))
-
-;; (defn output-join-backward
-;;   ""
-;;   [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index]
-;;   (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)]
-;;     ;; (println writer)
-;;     ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true)
-;;     (if (.exists (io/file filename))
-;;       ;; (spit "_clojask/join/test.txt" (str (vec (read-csv-seq filename)) "\n") :append true)
-;;       (let [filename (io/reader filename)]
-;;         (if-let [b-row (roll-join-get-line-backward (nth a-row a-roll) filename b-roll)] ;; bench is a string
-;;           (let [a-row (for [index a-index]
-;;                         (if-let [format (get a-format index)]
-;;                           (format (nth a-row index))
-;;                           (nth a-row index)))
-;;                 b-row (for [index b-index]
-;;                         (if-let [format (get b-format index)]
-;;                           (format (nth b-row index))
-;;                           (nth b-row index)))]
-;;             (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n")))
-;;           (let [a-row (for [index a-index]
-;;                         (if-let [format (get a-format index)]
-;;                           (format (nth a-row index))
-;;                           (nth a-row index)))]
-;;             (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))
-;;         (.close filename))
-;;       (let [a-row (for [index a-index]
-;;                     (if-let [format (get a-format index)]
-;;                       (format (nth a-row index))
-;;                       (nth a-row index)))]
-;;         (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))))
+        (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n"))))))
 
 (defn defn-join
   [type limit]
@@ -191,7 +86,7 @@
                                                  (do (vreset! memo val)
                                                      (vreset! res row)))))
                                            @res)]
-          (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index]
+          (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index]
             (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)]
               (if (.exists (io/file filename))
                 (let [filename (io/reader filename)]
@@ -204,18 +99,18 @@
                                   (if-let [format (get b-format index)]
                                     (format (nth b-row index))
                                     (nth b-row index)))]
-                      (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n")))
+                      (.write writer (str (str/join "," (vec (u/gets (concat a-row b-row) join-index))) "\n")))
                     (let [a-row (for [index a-index]
                                   (if-let [format (get a-format index)]
                                     (format (nth a-row index))
                                     (nth a-row index)))]
-                      (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))
+                      (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n"))))
                   (.close filename))
                 (let [a-row (for [index a-index]
                               (if-let [format (get a-format index)]
                                 (format (nth a-row index))
                                 (nth a-row index)))]
-                  (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))))))
+                  (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n")))))))
       ;; 5 output-join-backward
       5 (let [roll-join-get-line-backward (fn [bench filename index]
                                             (def memo (volatile! nil))
@@ -228,7 +123,7 @@
                                                       (vreset! res row)))))
                                             @res)]
           (fn
-            [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index]
+            [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index]
             (let [filename (gen-join-filenames "_clojask/join/b/" a-row a-keys)]
     ;; (println writer)
     ;; (spit "_clojask/join/test.txt" (str writer "\n") :append true)
@@ -244,16 +139,16 @@
                                   (if-let [format (get b-format index)]
                                     (format (nth b-row index))
                                     (nth b-row index)))]
-                      (.write writer (str (str/join "," (vec (concat a-row b-row))) "\n")))
+                      (.write writer (str (str/join "," (vec (u/gets (concat a-row b-row) join-index))) "\n")))
                     (let [a-row (for [index a-index]
                                   (if-let [format (get a-format index)]
                                     (format (nth a-row index))
                                     (nth a-row index)))]
-                      (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n"))))
+                      (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n"))))
                   (.close filename))
                 (let [a-row (for [index a-index]
                               (if-let [format (get a-format index)]
                                 (format (nth a-row index))
                                 (nth a-row index)))]
-                  (.write writer (str (str/join "," (vec (concat a-row (repeat count "")))) "\n")))))))
+                  (.write writer (str (str/join "," (vec (u/gets (concat a-row (repeat count "")) join-index))) "\n")))))))
       nil)))
\ No newline at end of file
diff --git a/src/main/clojure/clojask/onyx_comps.clj b/src/main/clojure/clojask/onyx_comps.clj
index dd5236c..48f5790 100644
--- a/src/main/clojure/clojask/onyx_comps.clj
+++ b/src/main/clojure/clojask/onyx_comps.clj
@@ -539,12 +539,12 @@
   (try
     (workflow-gen num-work)
     (config-env)
-    (worker-func-gen dataframe exception groupby-index) ;;need some work
+    (worker-func-gen dataframe exception (vec (take (count (.getKeyIndex (.col-info dataframe))) (iterate inc 0)))) ;;need some work
     (catalog-groupby-gen num-work batch-size)
     (lifecycle-groupby-gen (.path dataframe) dist groupby-keys (.getKeyIndex (.col-info dataframe)))
     (flow-cond-gen num-work)
     (input/inject-dataframe dataframe)
-    (groupby/inject-dataframe dataframe groupby-keys)
+    (groupby/inject-dataframe dataframe groupby-keys groupby-index)
     (catch Exception e (throw (Exception. (str "[preparing stage (group by)] " (.getMessage e))))))
   (try
     (let [submission (onyx.api/submit-job peer-config
@@ -567,7 +567,7 @@
 
 (defn start-onyx-join
   "start the onyx cluster with the specification inside dataframe"
-  [num-work batch-size dataframe b dist exception a-keys b-keys a-roll b-roll join-type & [limit]]
+  [num-work batch-size dataframe b dist exception a-keys b-keys a-roll b-roll join-type limit a-index b-index b-format write-index]
   ;; dataframe means a
   (try
     (workflow-gen num-work)
@@ -577,7 +577,7 @@
     (lifecycle-join-gen (.path dataframe) dist dataframe b a-keys b-keys a-roll b-roll join-type)
     (flow-cond-gen num-work)
     (input/inject-dataframe dataframe)
-    (join/inject-dataframe dataframe b a-keys b-keys)
+    (join/inject-dataframe dataframe b a-keys b-keys a-index b-index write-index b-format)
     (let [limit (or limit (fn [a b] true))]
      (defn-join join-type limit))
     (catch Exception e (throw (Exception. (str "[preparing stage (join)] " (.getMessage e))))))
diff --git a/src/main/clojure/clojask/utils.clj b/src/main/clojure/clojask/utils.clj
index cd3852b..29fa6d0 100644
--- a/src/main/clojure/clojask/utils.clj
+++ b/src/main/clojure/clojask/utils.clj
@@ -86,8 +86,6 @@
         (if (= com nil)
           true
           (do
-            ;; (println row)
-            ;; (println (nth com 1))
             (if (apply (first com) (get-val row types (nth com 1)))
               (recur rem)
               false)))))))
@@ -111,28 +109,82 @@
 (def fromString
   (atom (fn [_] (str _))))
 
-(def toDate
-  (atom (fn [string]
-          (try
-            (LocalDate/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd"))
-            (catch Exception e (throw e))))))
+;; (def toDate
+;;   (atom (fn [string]
+;;           (try
+;;             (LocalDate/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd"))
+;;             (catch Exception e (throw e))))))
 
-(def fromDate
-  (atom (fn [date]
-          (if (= (type date) java.time.LocalDate)
-            (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd"))
-            date))))
+;; (def fromDate
+;;   (atom (fn [date]
+;;           (if (= (type date) java.time.LocalDate)
+;;             (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd"))
+;;             date))))
+
+;; (def toDateTime
+;;   (atom (fn [string]
+;;           (try
+;;             (LocalDateTime/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss"))
+;;             (catch Exception e (throw e))))))
+
+;; (def fromDateTime
+;;   (atom (fn [date]
+;;           (if (= (type date) java.time.LocalDateTime)
+;;             (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss"))
+;;             date))))
+
+;; (defn set-format-string
+;;   [string]
+;;   (if (or (str/starts-with? string "date:") (str/starts-with? string "datetime:"))
+;;     (let [format-string (subs string (inc (str/index-of string ":")))]
+;;       (reset! toDate
+;;               (fn [string]
+;;                 (try
+;;                   (LocalDate/parse string (DateTimeFormatter/ofPattern format-string))
+;;                   (catch Exception e (throw e)))))
+
+;;       (reset! fromDate
+;;               (fn [date]
+;;                 (if (= (type date) java.time.LocalDate)
+;;                   (.format date (DateTimeFormatter/ofPattern format-string))
+;;                   date)))
+
+;;       (reset! toDateTime
+;;               (fn [string]
+;;                 (try
+;;                   (LocalDateTime/parse string (DateTimeFormatter/ofPattern format-string))
+;;                   (catch Exception e (throw e)))))
 
-(def toDateTime
+;;       (reset! fromDateTime
+;;               (fn [date]
+;;                 (if (= (type date) java.time.LocalDateTime)
+;;                   (.format date (DateTimeFormatter/ofPattern format-string))
+;;                   date))))
+;;     ))
+
+;; ;; (def operation-type-map
+;; ;;   {toInt "int"
+;; ;;    toDouble "double"
+;; ;;    toString "string"
+;; ;;    toDate "date"})
+
+;; (def type-operation-map
+;;   {"int" [toInt fromString]
+;;    "double" [toDouble fromString]
+;;    "string" [toString fromString]
+;;    "date" [toDate fromDate]
+;;    "datetime" [toDateTime fromDateTime]})
+
+(def toDate
   (atom (fn [string]
           (try
-            (LocalDateTime/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss"))
+            (.parse (java.text.SimpleDateFormat. "yyyy-MM-dd") string)
             (catch Exception e (throw e))))))
 
-(def fromDateTime
+(def fromDate
   (atom (fn [date]
-          (if (= (type date) java.time.LocalDateTime)
-            (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss"))
+          (if (= (type date) java.util.Date)
+            (.format (java.text.SimpleDateFormat. "yyyy-MM-dd") date)
             date))))
 
 (defn set-format-string
@@ -142,52 +194,33 @@
       (reset! toDate
               (fn [string]
                 (try
-                  (LocalDate/parse string (DateTimeFormatter/ofPattern format-string))
+                  (.parse (java.text.SimpleDateFormat. format-string) string)
                   (catch Exception e (throw e)))))
 
       (reset! fromDate
               (fn [date]
-                (if (= (type date) java.time.LocalDate)
-                  (.format date (DateTimeFormatter/ofPattern format-string))
-                  date)))
-
-      (reset! toDateTime
+                (if (= (type date) java.util.Date)
+                  (.format (java.text.SimpleDateFormat. format-string) date)
+                  date))))
+    (do
+      (reset! toDate
               (fn [string]
                 (try
-                  (LocalDateTime/parse string (DateTimeFormatter/ofPattern format-string))
+                  (.parse (java.text.SimpleDateFormat. "yyyy-MM-dd") string)
                   (catch Exception e (throw e)))))
 
-      (reset! fromDateTime
+      (reset! fromDate
               (fn [date]
-                (if (= (type date) java.time.LocalDateTime)
-                  (.format date (DateTimeFormatter/ofPattern format-string))
-                  date))))
-    ;; (do
-    ;;   (reset! toDate
-    ;;           (fn [string]
-    ;;             (try
-    ;;               (.parse (java.text.SimpleDateFormat. "yyyy-MM-dd") string)
-    ;;               (catch Exception e (throw e)))))
-
-    ;;   (reset! fromDate
-    ;;           (fn [date]
-    ;;             (if (= (type date) java.util.Date)
-    ;;               (.format (java.text.SimpleDateFormat. "yyyy-MM-dd") date)
-    ;;               date))))
-    ))
-
-;; (def operation-type-map
-;;   {toInt "int"
-;;    toDouble "double"
-;;    toString "string"
-;;    toDate "date"})
+                (if (= (type date) java.util.Date)
+                  (.format (java.text.SimpleDateFormat. "yyyy-MM-dd") date)
+                  date))))))
 
 (def type-operation-map
   {"int" [toInt fromString]
    "double" [toDouble fromString]
    "string" [toString fromString]
    "date" [toDate fromDate]
-   "datetime" [toDateTime fromDateTime]})
+   "datetime" [toDate fromDate]})
 
 (defn type-detection
   [file]

From 108b8427c8f783799777d5b327e3ab57e518af60 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Fri, 24 Dec 2021 14:44:52 +0800
Subject: [PATCH 15/33] Amend test file for Join APIs

---
 test/clojask/core_test.clj | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj
index c4b1cef..2c3e5cb 100644
--- a/test/clojask/core_test.clj
+++ b/test/clojask/core_test.clj
@@ -68,8 +68,8 @@
     (def y (dataframe "test/clojask/Employees-example.csv"))
     (is (= "success" (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
     ;; (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
-    ;; (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
-    ;; (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"]) "Salary" "Salary" 8 "resources/test.csv" :exception false)))
+    (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
+    (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false)))
     ))
 
 (deftest join-api-output-test
@@ -82,14 +82,14 @@
     ;; (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false)
     ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")]
     ;;     (is (= "" (:out result))))
-    ;; (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false)
-    ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")]
-    ;;     (is (= "" (:out result))))
-    ;; (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false)
-    ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")]
-    ;;     (is (= "" (:out result))))
-    ;; (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false)
-    ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")]
-    ;;     (is (= "" (:out result))))
+    (compute (inner-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-6.csv" :exception false) ;; inner-join only builds the joined dataframe; compute writes the file
+    (let [result (sh "bash" "-c" "diff <(sort test/clojask/test_outputs/1-6.csv) <(sort test/clojask/correct_outputs/1-6.csv)")] ;; <(...) is shell syntax, so run it through bash
+        (is (= "" (:out result))))
+    (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false)
+    (let [result (sh "bash" "-c" "diff <(sort test/clojask/test_outputs/1-7.csv) <(sort test/clojask/correct_outputs/1-7.csv)")] ;; empty stdout from diff means the sorted files match
+        (is (= "" (:out result))))
+    (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false)
+    (let [result (sh "bash" "-c" "diff <(sort test/clojask/test_outputs/1-8.csv) <(sort test/clojask/correct_outputs/1-8.csv)")]
+        (is (= "" (:out result))))
     ))
 

From 1db584a7d972fd4c9985108b03a5d300a61ba92d Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Fri, 24 Dec 2021 15:13:07 +0800
Subject: [PATCH 16/33] printCol to take selected-col as argument

---
 src/main/clojure/clojask/dataframe.clj | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 1b70b47..5c16774 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -30,7 +30,7 @@
   (colTypes [] "get column type in ColInfo")
   (getColIndex [] "get column indices, excluding deleted columns")
   (getColNames [] "get column names")
-  (printCol [output-path] "print column names to output file")
+  (printCol [output-path selected-col] "print column names to output file")
   (printAggreCol [output-path] "print column names to output file for aggregate")
   (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join")
   (delCol [col-to-del] "delete one or more columns in the dataframe")
@@ -148,10 +148,10 @@
               (concat groupby-keys aggre-new-keys))))
 
   (printCol
-  ;; print column names, called by compute
-    [this output-path]
+  ;; print column names, called by compute and computeAggre
+    [this output-path selected-col]
     (.checkOutputPath this output-path)
-    (let [col-set (.getColNames this)]
+    (let [col-set (if (= selected-col [nil]) (.getColNames this) selected-col)]
       (with-open [wrtr (io/writer output-path)]
         (.write wrtr (str (str/join "," col-set) "\n")))))
 
@@ -271,7 +271,7 @@
       (if (<= num-worker 8)
         (try
           (.final this)
-          (.printCol this output-dir) ;; to-do: based on the index => Done
+          (.printCol this output-dir select) ;; to-do: based on the index => Done
           (let [res (start-onyx num-worker batch-size this output-dir exception order index)]
             (if (= res "success")
               "success"
@@ -296,7 +296,7 @@
                                       (.indexOf index num))])
          aggre-func (mapv shift-func aggre-func)
         ;;  test (println [select index aggre-func])
-         tmp (.printCol this output-dir) ;; todo: based on "select"
+         tmp (.printCol this output-dir select) ;; todo: based on "select"
          res (start-onyx-aggre-only num-worker batch-size this output-dir exception aggre-func index select)]
      (if (= res "success")
        "success"
@@ -319,7 +319,7 @@
               ;; test (println [groupby-keys aggre-keys select pre-index data-index])
               res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)]
           ;(.printAggreCol this output-dir) ;; print column names to output-dir
-          (.printCol this output-dir) ;; todo: based on "select"
+          (.printCol this output-dir select) ;; todo: based on "select"
           (if (= res "success")
           ;;  (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception))
             (let [shift-func (fn [pair]
@@ -512,7 +512,7 @@
 (definterface JDFIntf
   (checkOutputPath [output-path] "check if output path is of string type")
   (getColNames [] "get the names of all the columns")
-  (printCol [output-path] "print column names to output file")
+  (printCol [output-path selected-col] "print column names to output file")
   (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select]))
 
 (defrecord JoinedDataFrame
@@ -539,8 +539,8 @@
 
   (printCol
     ;; print column names, called by compute
-      [this output-path]
-      (let [col-set (.getColNames this)]
+      [this output-path selected-col]
+      (let [col-set (if (= selected-col [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-col))]
         (with-open [wrtr (io/writer output-path)]
           (.write wrtr (str (str/join "," col-set) "\n")))))
         
@@ -564,7 +564,7 @@
       (u/init-file output-dir)
       ;; print column names
       ;;  (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done
-      (.printCol this output-dir) ;; todo: based on "select"
+      (.printCol this output-dir select) ;; todo: based on "select"
       (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys b-index exception) ;; todo
       (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index))))
 

From 6f04e1dce2485576b29c797235eae8e75e29acf6 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Sun, 26 Dec 2021 15:14:10 +0800
Subject: [PATCH 17/33] Update test file

---
 src/main/clojure/clojask/dataframe.clj |  7 +++----
 test/clojask/core_test.clj             | 16 ++++++++++++----
 test/clojask/correct_outputs/1-9.csv   |  8 ++++++++
 3 files changed, 23 insertions(+), 8 deletions(-)
 create mode 100644 test/clojask/correct_outputs/1-9.csv

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 5c16774..c82879d 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -539,8 +539,8 @@
 
   (printCol
     ;; print column names, called by compute
-      [this output-path selected-col]
-      (let [col-set (if (= selected-col [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-col))]
+      [this output-path selected-index]
+      (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))]
         (with-open [wrtr (io/writer output-path)]
           (.write wrtr (str (str/join "," col-set) "\n")))))
         
@@ -563,8 +563,7 @@
           ]
       (u/init-file output-dir)
       ;; print column names
-      ;;  (.printJoinCol a b a-keys b-keys output-dir) to-do: make use of getColNames => Done
-      (.printCol this output-dir select) ;; todo: based on "select"
+      (.printCol this output-dir select) ;; todo: based on "select" => Done
       (start-onyx-groupby num-worker 10 b "./_clojask/join/b/" b-keys b-index exception) ;; todo
       (start-onyx-join num-worker 10 a b output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index))))
 
diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj
index 2c3e5cb..c93d012 100644
--- a/test/clojask/core_test.clj
+++ b/test/clojask/core_test.clj
@@ -56,10 +56,18 @@
     (is (= (col-names y) ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"]))
     (rename-col y ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"])
     (is (= (col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"]))
-    (select-col y ["Employee" "new-Department" "EmployeeName"])
-    (is (= (col-names y) ["Employee" "new-Department" "EmployeeName"]))
-    (delete-col y ["new-Department"])
-    (is (= (col-names y) ["Employee" "EmployeeName"]))
+    ;; (select-col y ["Employee" "new-Department" "EmployeeName"])
+    ;; (is (= (col-names y) ["Employee" "new-Department" "EmployeeName"]))
+    ;; (delete-col y ["new-Department"])
+    ;; (is (= (col-names y) ["Employee" "EmployeeName"]))
+    ))
+
+(deftest col-select-output-test
+    (testing "Select column(s) argument"
+    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
+    (compute y 8 "test/clojask/test_outputs/1-9.csv" :select ["Employee", "EmployeeName"] :exception false) ;; write only the two selected columns
+    (let [result (sh "bash" "-c" "diff <(sort test/clojask/test_outputs/1-9.csv) <(sort test/clojask/correct_outputs/1-9.csv)")] ;; <(...) is shell syntax, so run it through bash
+        (is (= "" (:out result)))) ;; empty stdout from diff means the sorted files match
     ))
 
 (deftest join-api-test
diff --git a/test/clojask/correct_outputs/1-9.csv b/test/clojask/correct_outputs/1-9.csv
new file mode 100644
index 0000000..e3ba23c
--- /dev/null
+++ b/test/clojask/correct_outputs/1-9.csv
@@ -0,0 +1,8 @@
+Employee,EmployeeName
+1,Alice
+2,Bob
+3,Carla
+4,Daniel
+5,Evelyn
+6,Ferdinand
+7,Amy

From a43b59b08c91690983738eb7d9211773db4bdd41 Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Tue, 28 Dec 2021 15:23:44 +0800
Subject: [PATCH 18/33] preview for joineddataframe

---
 src/main/clojure/clojask/dataframe.clj | 38 ++++++++++++++++----------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index c82879d..0962eee 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -392,14 +392,6 @@
   [dataframe sample-size return-size & {:keys [format] :or {format false}}]
   (.preview dataframe sample-size return-size format))
 
-(defn print-df
-  [dataframe & [sample-size return-size]]
-  (let [data (.preview dataframe (or sample-size 1000) (or return-size 10) false)
-        tmp (first data)
-        types (zipmap (keys tmp) (map u/get-type-string (vals tmp)))
-        data (conj (apply list data) types)]
-    (pprint/print-table data)))
-
 (defn generate-col
   "Generate column names if there are none"
   [col-count]
@@ -513,6 +505,7 @@
   (checkOutputPath [output-path] "check if output path is of string type")
   (getColNames [] "get the names of all the columns")
   (printCol [output-path selected-col] "print column names to output file")
+  (preview [] "preview the column names")
   (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select]))
 
 (defrecord JoinedDataFrame
@@ -539,17 +532,22 @@
 
   (printCol
     ;; print column names, called by compute
-      [this output-path selected-index]
-      (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))]
-        (with-open [wrtr (io/writer output-path)]
-          (.write wrtr (str (str/join "," col-set) "\n")))))
-        
+    [this output-path selected-index]
+    (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))]
+      (with-open [wrtr (io/writer output-path)]
+        (.write wrtr (str (str/join "," col-set) "\n")))))
+
+  (preview
+    [this]
+   (.getColNames this)
+   )
+
   (compute
     [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select]
     (let [select (if (coll? select) select [select])
           select (if (= select [nil])
                    (vec (take (+ (count (.getKeyIndex (.col-info a))) (count (.getKeyIndex (.col-info b)))) (iterate inc 0)))
-                   (mapv (fn [key] (.indexOf (.getColNames this) key)) select)) 
+                   (mapv (fn [key] (.indexOf (.getColNames this) key)) select))
           a-index (vec (apply sorted-set (remove (fn [num] (>= num (count (.getKeyIndex (.col-info a))))) select)))
           ;; a-write 
           b-index (mapv #(- % (count (.getKeyIndex (.col-info a)))) (apply sorted-set (remove (fn [num] (< num (count (.getKeyIndex (.col-info a))))) select)))
@@ -685,3 +683,15 @@
   ;; to-do: should implement both for the DataFrame and JoinedDataFrame => Done
   (.getColNames this)
   )
+
+(defn print-df
+  "Print a preview table for a DataFrame; otherwise (e.g. a JoinedDataFrame) print the comma-joined zero-argument preview and a notice."
+  [dataframe & [sample-size return-size]]
+  (if (= (type dataframe) DataFrame)
+    (let [rows (.preview dataframe (or sample-size 1000) (or return-size 10) false)
+          head (first rows)
+          ;; extra leading row: keys of the first row mapped to u/get-type-string of its values
+          type-row (zipmap (keys head) (map u/get-type-string (vals head)))]
+      (pprint/print-table (conj (apply list rows) type-row)))
+    (do (println (str/join "," (.preview dataframe)))
+        (println "The content of joined dataframe is not available."))))

From 6ad9907e8902e865e29b9877c8663cb987d517b2 Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Tue, 28 Dec 2021 18:00:04 +0800
Subject: [PATCH 19/33] doc select and join change

---
 doc/documentation.md | 135 +++++++++++++++++++++++++------------------
 1 file changed, 80 insertions(+), 55 deletions(-)

diff --git a/doc/documentation.md b/doc/documentation.md
index 1484ff6..1aa4fe6 100644
--- a/doc/documentation.md
+++ b/doc/documentation.md
@@ -207,46 +207,8 @@ You can also group by the combination of keys. (Use the above two rules together
   ;; get the min of the two columns grouped by ...
   ```
 
-  
-
-- sort
-
-  **Immediately** sort the dataframe
-
-  | Argument           | Type                    | Function                 | Remarks                                                      |
-  | ------------------ | ----------------------- | ------------------------ | ------------------------------------------------------------ |
-  | `dataframe`        | Clojask.DataFrame       | The operated object      |                                                              |
-  | `trending list`    | Collection (seq vector) | Indicates the sort order | Example: ["Salary" "+" "Employee" "-"] means that sort the Salary in ascending order, if equal sort the Employee in descending order |
-  | `output-directory` | String                  | The output path          |                                                              |
-
-  **Example**
-
-  ```clojure
-  (sort y ["+" "Salary"] "resources/sort.csv")
-  ;; sort by Salary ascendingly
-  ```
-
-  
 
-- compute
 
-  Compute the result. The pre-defined lazy operations will be executed in pipeline, ie the result of the previous operation becomes the argument of the next operation.
-
-  | Argument         | Type              | Function                                                     | Remarks                                                      |
-  | ---------------- | ----------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
-  | `dataframe`      | Clojask.DataFrame | The operated object                                          |                                                              |
-  | `num of workers` | int (max 8)       | The number of worker instances (except the input and output nodes) | If this argument >= 2, will use [onyx](http://www.onyxplatform.org/) as the distributed platform |
-  | `output path`    | String            | The path of the output csv file                              | Could exist or not.                                          |
-  | [`exception`]    | boolean           | Whether an exception during calculation will cause termination | Is useful for debugging or detecting empty fields            |
-  
-  **Example**
-  
-  ```clojure
-  (compute x 8 "../resources/test.csv" :exception true)
-  ;; computes all the pre-registered operations
-  ```
-  
-  
 
 - inner-join / left-join / right-join
 
@@ -258,40 +220,52 @@ You can also group by the combination of keys. (Use the above two rules together
 
   *Will automatically pipeline the registered operations and filters like `compute`. You could think of join as first compute the two dataframes then join.*
 
-  | Argument            | Type                | Function                                                     | Remarks                                           |
-  | ------------------- | ------------------- | ------------------------------------------------------------ | ------------------------------------------------- |
-  | `dataframe a`       | Clojask.DataFrame   | The operated object                                          |                                                   |
-  | `dataframe b`       | Clojask.DataFrame   | The operated object                                          |                                                   |
-  | `a join keys`       | String / Collection | The keys of a to be aligned                                  | Find the specification [here](#groupby-keys)      |
-  | `b join keys`       | String / Collection | The keys of b to be aligned                                  | Find the specification [here](#groupby-keys)      |
-  | `number of workers` | int (max 8)         | Number of worker nodes doing the joining                     |                                                   |
-  | `distination file`  | string              | The file path to the distination                             | Will be emptied first                             |
-  | [`exception`]       | boolean             | Whether an exception during calculation will cause termination | Is useful for debugging or detecting empty fields |
+  | Argument      | Type                | Function                    | Remarks                                      |
+  | ------------- | ------------------- | --------------------------- | -------------------------------------------- |
+  | `dataframe a` | Clojask.DataFrame   | The operated object         |                                              |
+  | `dataframe b` | Clojask.DataFrame   | The operated object         |                                              |
+  | `a join keys` | String / Collection | The keys of a to be aligned | Find the specification [here](#groupby-keys) |
+  | `b join keys` | String / Collection | The keys of b to be aligned | Find the specification [here](#groupby-keys) |
 
-  **Example**
+**Return**
+
+A Clojask.JoinedDataFrame
+
+- Unlike Clojask.DataFrame, it only supports three operations:
+  - `print-df`
+  - `get-col-names`
+  - `compute`
+- This means you cannot further apply complicated operations to a joined dataframe. An alternative is to first compute the result, then read it in as a new dataframe.
+
+**Example**
 
   ```clojure
   (def x (dataframe "path/to/a"))
   (def y (dataframe "path/to/b"))
   
-  (inner-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"] 8 "path/to/distination" :exception true)
+  (def z (inner-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"]))
+  (compute z 8 "path/to/output")
   ;; inner join x and y
   
-  (left-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"] 8 "path/to/distination" :exception true)
+  (def z (left-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"]))
+  (compute z 8 "path/to/output")
   ;; left join x and y
   
-  (right-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"] 8 "path/to/distination" :exception true)
+  (def z (right-join x y ["col a 1" "col a 2"] ["col b 1" "col b 2"]))
+  (compute z 8 "path/to/output")
   ;; right join x and y
   ```
 
+
+
 - reorderCol / renameCol
 
   Reorder the columns / rename the column names in the dataframe
 
-  | Argument            | Type               | Function                                                     | Remarks                                           |
-  | ------------------- | ------------------ | ------------------------------------------------------------ | ------------------------------------------------- |
-  | `dataframe a`       | Clojask.DataFrame  | The operated object                                          |                                                   |
-  | `a columns`         | Clojure.collection | The new set of column names                                  | Should be existing headers in dataframe a if it is `reorderCol`         |
+  | Argument      | Type               | Function                    | Remarks                                                      |
+  | ------------- | ------------------ | --------------------------- | ------------------------------------------------------------ |
+  | `dataframe a` | Clojask.DataFrame  | The operated object         |                                                              |
+  | `a columns`   | Clojure.collection | The new set of column names | Should be existing headers in dataframe a if it is `reorderCol` |
 
 
   **Example**
@@ -301,3 +275,54 @@ You can also group by the combination of keys. (Use the above two rules together
   (.renameCol y ["Employee" "new-Department" "EmployeeName" "Salary"])
   ```
 
+
+
+
+- sort
+
+  **Immediately** sort the dataframe
+
+  | Argument           | Type                    | Function                 | Remarks                                                      |
+  | ------------------ | ----------------------- | ------------------------ | ------------------------------------------------------------ |
+  | `dataframe`        | Clojask.DataFrame       | The operated object      |                                                              |
+  | `trending list`    | Collection (seq vector) | Indicates the sort order | Example: ["Salary" "+" "Employee" "-"] sorts by Salary in ascending order and breaks ties by Employee in descending order |
+  | `output-directory` | String                  | The output path          |                                                              |
+
+  **Example**
+
+  ```clojure
+  (sort y ["Salary" "+"] "resources/sort.csv")
+  ;; sort by Salary in ascending order
+  ```
+
+  
+
+- compute
+
+  Compute the result. The pre-defined lazy operations are executed as a pipeline, i.e. the result of each operation becomes the argument of the next operation.
+
+  | Argument         | Type                           | Function                                                     | Remarks                                                      |
+  | ---------------- | ------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+  | `dataframe`      | Clojask.DataFrame              | The operated object                                          |                                                              |
+  | `num of workers` | int (max 8)                    | The number of worker instances (excluding the input and output nodes) | Uses [onyx](http://www.onyxplatform.org/) as the distributed platform |
+  | `output path`    | String                         | The path of the output csv file                              | The file may or may not already exist.                       |
+  | [`exception`]    | boolean                        | Whether an exception during calculation will cause termination | Is useful for debugging or detecting empty fields            |
+  | [`select`]       | String / Collection of strings | The name(s) of the columns to select. It is best to call `get-col-names` first to see all available names. (Similar to `SELECT` in SQL.) | Only one of `select` and `exclude` may be specified |
+  | [`exclude`]      | String / Collection of strings | The name(s) of the columns to exclude                        | Only one of `select` and `exclude` may be specified |
+
+  **Example**
+
+  ```clojure
+  (compute x 8 "../resources/test.csv" :exception true)
+  ;; computes all the pre-registered operations
+  
+  (compute x 8 "../resources/test.csv" :select "col a")
+  ;; only select column a
+  
+  (compute x 8 "../resources/test.csv" :select ["col b" "col a"])
+  ;; select two columns, column b and column a in order
+  
+  (compute x 8 "../resources/test.csv" :exclude ["col b" "col a"])
+  ;; select all columns except column b and column a, other columns are in order
+  ```
+

From 078b89b490d68a61fde22ed04c01fdcbffe687d5 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Thu, 30 Dec 2021 14:57:52 +0800
Subject: [PATCH 20/33] Amend right-join API and test file

---
 src/main/clojure/clojask/dataframe.clj | 3 ++-
 test/clojask/core_test.clj             | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 0962eee..6a41052 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -602,7 +602,8 @@
     (JoinedDataFrame. a b a-keys b-keys nil nil 2 nil col-prefix)))
 
 (defn right-join
-  [a b a-keys b-keys num-worker dist & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}]
+  [a b a-keys b-keys & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}]
+  ;[a b a-keys b-keys num-worker dist & {:keys [col-prefix] :or {col-prefix ["1" "2"]}}]
   (let [a-keys (u/proc-groupby-key a-keys)
         b-keys (u/proc-groupby-key b-keys)
         a-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info a)) (nth _ 1))]) a-keys)
diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj
index c93d012..7bfcbb0 100644
--- a/test/clojask/core_test.clj
+++ b/test/clojask/core_test.clj
@@ -75,7 +75,7 @@
     (def x (dataframe "test/clojask/Employees-example.csv"))
     (def y (dataframe "test/clojask/Employees-example.csv"))
     (is (= "success" (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
-    ;; (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
+    (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
     (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
     (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false)))
     ))
@@ -87,9 +87,9 @@
     (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false)
     (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-4.csv)" "<(sort test/clojask/correct_outputs/1-4.csv)")]
         (is (= "" (:out result))))
-    ;; (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false)
-    ;; (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")]
-    ;;     (is (= "" (:out result))))
+    (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false)
+    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")]
+        (is (= "" (:out result))))
     (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false)
     (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")]
         (is (= "" (:out result))))

From 916667c40c61cf44429c3f3c643a0ccef19b453e Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Thu, 30 Dec 2021 21:43:33 +0800
Subject: [PATCH 21/33] Amend printCol bug in groupby-aggre

---
 src/main/clojure/clojask/dataframe.clj | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 6a41052..15f2db8 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -31,6 +31,7 @@
   (getColIndex [] "get column indices, excluding deleted columns")
   (getColNames [] "get column names")
   (printCol [output-path selected-col] "print column names to output file")
+  (printColByIndex [output-path selected-index] "print column names to output file")
   (printAggreCol [output-path] "print column names to output file for aggregate")
   (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join")
   (delCol [col-to-del] "delete one or more columns in the dataframe")
@@ -155,6 +156,13 @@
       (with-open [wrtr (io/writer output-path)]
         (.write wrtr (str (str/join "," col-set) "\n")))))
 
+  (printColByIndex
+    ;; print column names, called by compute
+    [this output-path selected-index]
+    (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))]
+      (with-open [wrtr (io/writer output-path)]
+        (.write wrtr (str (str/join "," col-set) "\n")))))
+
   ;; !! deprecated
   (printAggreCol
   ;; print column names, called by computeAggre
@@ -319,7 +327,7 @@
               ;; test (println [groupby-keys aggre-keys select pre-index data-index])
               res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)]
           ;(.printAggreCol this output-dir) ;; print column names to output-dir
-          (.printCol this output-dir select) ;; todo: based on "select"
+          (.printColByIndex this output-dir select) ;; todo: based on "select"
           (if (= res "success")
           ;;  (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception))
             (let [shift-func (fn [pair]

From 79019c8743b53487f2740caeb3100d830db40352 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Thu, 30 Dec 2021 23:05:44 +0800
Subject: [PATCH 22/33] Amend test file to run zsh shell

---
 test/clojask/core_test.clj           | 49 ++++++++++++++++------------
 test/clojask/correct_outputs/1-4.csv |  6 ++--
 test/clojask/correct_outputs/1-5.csv |  6 ++--
 test/clojask/correct_outputs/1-6.csv | 10 +++---
 test/clojask/correct_outputs/1-7.csv |  8 ++---
 test/clojask/correct_outputs/1-8.csv |  8 ++---
 6 files changed, 48 insertions(+), 39 deletions(-)

diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj
index 7bfcbb0..609d8ca 100644
--- a/test/clojask/core_test.clj
+++ b/test/clojask/core_test.clj
@@ -29,24 +29,27 @@
     (set-type y "Salary" "double")
     (operate y - "Salary")
     (compute y 8 "test/clojask/test_outputs/1-1.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-1.csv)" "<(sort test/clojask/correct_outputs/1-1.csv)")]
-        (is (= "" (:out result))))
+    (let [result (sh "zsh" "-c" "diff <(sort ./test/clojask/test_outputs/1-1.csv) <(sort ./test/clojask/correct_outputs/1-1.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     ;; filter and row-operation
     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
     (set-type y "Salary" "double")
     (filter y "Salary" (fn [salary] (<= salary 800)))
     (operate y str ["Employee" "Salary"] "new-col")
     (compute y 8 "test/clojask/test_outputs/1-2.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-2.csv)" "<(sort test/clojask/correct_outputs/1-2.csv)")]
-        (is (= "" (:out result))))
+    (let [result (sh "zsh" "-c" "diff <(sort ./test/clojask/test_outputs/1-2.csv) <(sort ./test/clojask/correct_outputs/1-2.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     ;; groupby and aggregate
     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
     (set-type y "Salary" "double")
     (group-by y ["Department"])
-    (aggregate y max ["Salary"] ["new-Salary"])
+    (aggregate y max ["Salary"] ["new-salary"])
     (compute y 8 "test/clojask/test_outputs/1-3.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-3.csv)" "<(sort test/clojask/correct_outputs/1-3.csv)")]
-        (is (= "" (:out result))))
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-3.csv) <(sort test/clojask/correct_outputs/1-3.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     ))
 
 (deftest col-api-test
@@ -66,8 +69,9 @@
     (testing "Select column(s) argument"
     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
     (compute y 8 "test/clojask/test_outputs/1-9.csv" :select ["Employee", "EmployeeName"] :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-9.csv)" "<(sort test/clojask/correct_outputs/1-9.csv)")]
-        (is (= "" (:out result))))
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-9.csv) <(sort test/clojask/correct_outputs/1-9.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     ))
 
 (deftest join-api-test
@@ -85,19 +89,24 @@
     (def x (dataframe "test/clojask/Employees-example.csv"))
     (def y (dataframe "test/clojask/Employees-info-example.csv"))
     (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-4.csv)" "<(sort test/clojask/correct_outputs/1-4.csv)")]
-        (is (= "" (:out result))))
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-4.csv) <(sort test/clojask/correct_outputs/1-4.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-5.csv)" "<(sort test/clojask/correct_outputs/1-5.csv)")]
-        (is (= "" (:out result))))
-    (inner-join x y ["Employee"] ["Employee"] 8 "test/clojask/test_outputs/1-6.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-6.csv)" "<(sort test/clojask/correct_outputs/1-6.csv)")]
-        (is (= "" (:out result))))
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-5.csv) <(sort test/clojask/correct_outputs/1-5.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
+    (compute (inner-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-6.csv" :exception false)
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-6.csv) <(sort test/clojask/correct_outputs/1-6.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-7.csv)" "<(sort test/clojask/correct_outputs/1-7.csv)")]
-        (is (= "" (:out result))))
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-7.csv) <(sort test/clojask/correct_outputs/1-7.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false)
-    (let [result (sh "diff" "<(sort test/clojask/test_outputs/1-8.csv)" "<(sort test/clojask/correct_outputs/1-8.csv)")]
-        (is (= "" (:out result))))
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-8.csv) <(sort test/clojask/correct_outputs/1-8.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     ))
 
diff --git a/test/clojask/correct_outputs/1-4.csv b/test/clojask/correct_outputs/1-4.csv
index 8320b59..46ae2ba 100644
--- a/test/clojask/correct_outputs/1-4.csv
+++ b/test/clojask/correct_outputs/1-4.csv
@@ -1,8 +1,8 @@
 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate
-1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10
 5,Evelyn,13,800,2020/12/03,,,,
-3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03
-7,Amy,11,50000,2020/11/26,7,Angel,30,2020/12/11
 4,Daniel,12,1000,2020/12/05,,,,
+1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10
+3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03
 6,Ferdinand,21,700,2020/12/05,,,,
 2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05
+7,Amy,11,50000,2020/11/26,7,Angel,30,2020/12/11
diff --git a/test/clojask/correct_outputs/1-5.csv b/test/clojask/correct_outputs/1-5.csv
index e2473e3..f39148a 100644
--- a/test/clojask/correct_outputs/1-5.csv
+++ b/test/clojask/correct_outputs/1-5.csv
@@ -1,5 +1,5 @@
-1_Employee,1_EmployeeName,1_DayOff,1_UpdateDate,2_Employee,2_EmployeeName,2_Department,2_Salary,2_UpdateDate
-2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01
+2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate,1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate
 1,Alice,20,2020/12/10,1,Alice,11,300,2020/12/12
-3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03
 7,Angel,30,2020/12/11,7,Amy,11,50000,2020/11/26
+2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01
+3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03
diff --git a/test/clojask/correct_outputs/1-6.csv b/test/clojask/correct_outputs/1-6.csv
index 613eb2e..1177529 100644
--- a/test/clojask/correct_outputs/1-6.csv
+++ b/test/clojask/correct_outputs/1-6.csv
@@ -1,5 +1,5 @@
-1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate
-2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05
-3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03
-1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10
-7,Amy,11,50000,2020/11/26,7,Angel,30,2020/12/11
+2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate,1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate
+2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01
+3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03
+1,Alice,20,2020/12/10,1,Alice,11,300,2020/12/12
+7,Angel,30,2020/12/11,7,Amy,11,50000,2020/11/26
diff --git a/test/clojask/correct_outputs/1-7.csv b/test/clojask/correct_outputs/1-7.csv
index fc7f3da..ef84168 100644
--- a/test/clojask/correct_outputs/1-7.csv
+++ b/test/clojask/correct_outputs/1-7.csv
@@ -1,8 +1,8 @@
 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate
-3,Carla,12,900,2020/12/03,,,,
-6,Ferdinand,21,700,2020/12/05,,,,
+7,Amy,11,50000,2020/11/26,,,,
+3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03
+4,Daniel,12,1000,2020/12/05,,,,
 5,Evelyn,13,800,2020/12/03,,,,
 1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10
-4,Daniel,12,1000,2020/12/05,,,,
-7,Amy,11,50000,2020/11/26,,,,
 2,Bob,11,600,2020/12/01,,,,
+6,Ferdinand,21,700,2020/12/05,,,,
diff --git a/test/clojask/correct_outputs/1-8.csv b/test/clojask/correct_outputs/1-8.csv
index 4b51863..974f391 100644
--- a/test/clojask/correct_outputs/1-8.csv
+++ b/test/clojask/correct_outputs/1-8.csv
@@ -1,8 +1,8 @@
 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate
-3,Carla,12,900,2020/12/03,,,,
-7,Amy,11,50000,2020/11/26,,,,
+5,Evelyn,13,800,2020/12/03,,,,
 1,Alice,11,300,2020/12/12,,,,
 2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05
-5,Evelyn,13,800,2020/12/03,,,,
-4,Daniel,12,1000,2020/12/05,,,,
 6,Ferdinand,21,700,2020/12/05,,,,
+7,Amy,11,50000,2020/11/26,,,,
+3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03
+4,Daniel,12,1000,2020/12/05,,,,

From 22a141909dd5def90af9ca818e83080b56f378bf Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Sun, 2 Jan 2022 18:06:04 +0800
Subject: [PATCH 23/33] Change compute to return output dataframe

---
 src/main/clojure/clojask/dataframe.clj | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 15f2db8..320d920 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -678,14 +678,22 @@
     (assert (not= select []) "must select at least 1 column")
     (if (= (type this) clojask.dataframe.DataFrame)
       (if (= (.getAggreFunc (:row-info this)) [])
-        (.compute this num-worker output-dir exception order select)
+        (do ;; simple compute
+          (.compute this num-worker output-dir exception order select)
+          (dataframe output-dir :have-col true)) ;; return output dataframe
         (if (not= (.getGroupbyKeys (:row-info this)) [])
-          (.computeGroupAggre this num-worker output-dir exception select)
-          (.computeAggre this num-worker output-dir exception select)))
+          (do ;; groupby-aggre
+            (.computeGroupAggre this num-worker output-dir exception select)
+            (dataframe output-dir :have-col true))
+          (do ;; aggre
+            (.computeAggre this num-worker output-dir exception select)
+            (dataframe output-dir :have-col true))))
       (if (= (type this) clojask.dataframe.JoinedDataFrame)
-        (.compute this num-worker output-dir exception order select)
-        (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe")))))
-)
+        (do ;; join
+          (.compute this num-worker output-dir exception order select)
+          (dataframe output-dir :have-col true))
+        (throw (Clojask_TypeException. "Must compute on a clojask dataframe or joined dataframe"))))))
+
 (defn get-col-names
   "Get the names for the columns in sequence"
   [this]

From e315e11443a072bea0f5196cc89935b643ba4252 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Sun, 2 Jan 2022 18:06:25 +0800
Subject: [PATCH 24/33] Amend test file

---
 test/clojask/core_test.clj | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj
index 609d8ca..1e9cbec 100644
--- a/test/clojask/core_test.clj
+++ b/test/clojask/core_test.clj
@@ -3,6 +3,7 @@
               [clojask.dataframe :refer :all]
               [clojask.utils :refer :all]
               [clojask.groupby :refer :all]
+              [clojask.api.gb-aggregate :as gb-aggre]
               [clojask.api.aggregate :as aggre]
               [clojask.sort :refer :all]))
         
@@ -45,7 +46,7 @@
     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
     (set-type y "Salary" "double")
     (group-by y ["Department"])
-    (aggregate y max ["Salary"] ["new-salary"])
+    (aggregate y gb-aggre/max ["Salary"] ["new-Salary"])
     (compute y 8 "test/clojask/test_outputs/1-3.csv" :exception false)
     (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-3.csv) <(sort test/clojask/correct_outputs/1-3.csv)")]
         (is (= "" (:out result))) 
@@ -78,10 +79,10 @@
     (testing "Join dataframes APIs"
     (def x (dataframe "test/clojask/Employees-example.csv"))
     (def y (dataframe "test/clojask/Employees-example.csv"))
-    (is (= "success" (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
-    (is (= "success" (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
-    (is (= "success" (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false)))
-    (is (= "success" (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false)))
+    (is (= clojask.dataframe.DataFrame (type (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
+    (is (= clojask.dataframe.DataFrame (type (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
+    (is (= clojask.dataframe.DataFrame (type (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
+    (is (= clojask.dataframe.DataFrame (type (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false))))
     ))
 
 (deftest join-api-output-test

From b8e5f989c7125aaf4714326a1cf7a208e67dc2cd Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Wed, 5 Jan 2022 18:08:35 +0800
Subject: [PATCH 25/33] allow only group by

---
 src/main/clojure/clojask/dataframe.clj | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 0962eee..987734f 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -319,6 +319,8 @@
               ;; test (println [groupby-keys aggre-keys select pre-index data-index])
               res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)]
           ;(.printAggreCol this output-dir) ;; print column names to output-dir
+          (println (str "Since the dataframe is only grouped by but not aggregated, the result will be the same as to choose the distinct values of "
+                        "the groupby keys."))
           (.printCol this output-dir select) ;; todo: based on "select"
           (if (= res "success")
           ;;  (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception))
@@ -573,6 +575,8 @@
         b-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info b)) (nth _ 1))]) b-keys)]
     (cond (not (and (= (type a) clojask.dataframe.DataFrame) (= (type b) clojask.dataframe.DataFrame))) 
       (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes.")))
+    (cond (or (not= (.getAggreFunc (:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) []))
+          (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join.")))
     (cond (not (= (count a-keys) (count b-keys))) 
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) 
@@ -593,6 +597,8 @@
         b-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info b)) (nth _ 1))]) b-keys)]
     (cond (not (and (= (type a) clojask.dataframe.DataFrame) (= (type b) clojask.dataframe.DataFrame))) 
       (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes.")))
+    (cond (or (not= (.getAggreFunc (:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) []))
+          (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join.")))
     (cond (not (= (count a-keys) (count b-keys))) 
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (= (count col-prefix) 2)) 
@@ -609,6 +615,8 @@
         b-keys (mapv (fn [_] [(nth _ 0) (get (.getKeyIndex (.col-info b)) (nth _ 1))]) b-keys)]
     (cond (not (and (= (type a) clojask.dataframe.DataFrame) (= (type b) clojask.dataframe.DataFrame))) 
       (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes.")))
+    (cond (or (not= (.getAggreFunc (:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) []))
+          (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join.")))
     (cond (not (= (count a-keys) (count b-keys))) 
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) 
@@ -630,6 +638,8 @@
       (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b))) 
       (throw (Clojask_TypeException. "Input includes non-existent column name(s).")))
+    (cond (or (not= (.getAggreFunc(:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) []))
+          (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join.")))
     (let [[a-roll b-roll] [(get (.getKeyIndex (:col-info a)) a-roll) (get (.getKeyIndex (:col-info b)) b-roll)]]
       (do
         (cond (not (and (not= a-roll nil) (not= b-roll nil)))
@@ -648,6 +658,8 @@
           (throw (Clojask_TypeException. "Rolling keys should be strings")))
     (cond (not (and (= (type a) clojask.dataframe.DataFrame) (= (type b) clojask.dataframe.DataFrame)))
           (throw (Clojask_TypeException. "First two arguments should be Clojask dataframes.")))
+    (cond (or (not= (.getAggreFunc (:row-info a)) []) (not= (.getGroupbyKeys (:row-info a)) []) (not= (.getAggreFunc (:row-info b)) []) (not= (.getGroupbyKeys (:row-info b)) []))
+          (throw (Clojask_TypeException. "Cannot join on a dataframe that has been grouped by or aggregated. Try to first compute, then use the new one to join.")))
     (cond (not (= (count a-keys) (count b-keys)))
           (throw (Clojask_TypeException. "The length of left keys and right keys should be equal.")))
     (cond (not (and (u/are-in a-keys a) (u/are-in b-keys b)))
@@ -668,7 +680,7 @@
         select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))]
     (assert (not= select []) "must select at least 1 column")
     (if (= (type this) clojask.dataframe.DataFrame)
-      (if (= (.getAggreFunc (:row-info this)) [])
+      (if (and (= (.getGroupbyKeys (:row-info this)) []) (= (.getAggreFunc (:row-info this)) []))
         (.compute this num-worker output-dir exception order select)
         (if (not= (.getGroupbyKeys (:row-info this)) [])
           (.computeGroupAggre this num-worker output-dir exception select)

From 60c2905fcc93819bc0830c3b9ab0a861f44b1048 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Wed, 5 Jan 2022 22:12:22 +0800
Subject: [PATCH 26/33] Delete deprecated printCol functions

---
 src/main/clojure/clojask/dataframe.clj | 28 +-------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 320d920..06b8611 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -140,7 +140,6 @@
         ;; not aggregate
         (let [index-key (.getIndexKey (:col-info this))
               index (.getColIndex this)]
-              ;(mapv (fn [i] (get {0 "Employee", 1 "EmployeeName", 2 "Department", 3 "Salary"} i)) [0 2 2 2])
               (mapv (fn [i] (get index-key i)) index))
         ;; if aggregate
         (let [groupby-key-index (.getGroupbyKeys (:row-info this))
@@ -157,36 +156,11 @@
         (.write wrtr (str (str/join "," col-set) "\n")))))
 
   (printColByIndex
-    ;; print column names, called by compute
+    ;; print column names, called by computeGroupByAggre
     [this output-path selected-index]
     (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))]
       (with-open [wrtr (io/writer output-path)]
         (.write wrtr (str (str/join "," col-set) "\n")))))
-
-  ;; !! deprecated
-  (printAggreCol
-  ;; print column names, called by computeAggre
-    [this output-path]
-    (.checkOutputPath this output-path)
-    (let [groupby-key-index (.getGroupbyKeys (:row-info this))
-          groupby-keys (vec (map (.getIndexKey (.col-info this)) (vec (map #(last %) groupby-key-index))))
-          aggre-new-keys (.getAggreNewKeys (:row-info this))]
-      (with-open [wrtr (io/writer output-path)]
-        (.write wrtr (str (str/join "," (concat groupby-keys aggre-new-keys)) "\n")))))
-
-  ;; !! deprecated
-  (printJoinCol
-  ;; print column names, called by join APIs
-    [this b-df this-keys b-keys output-path col-prefix]
-    (.checkOutputPath this output-path)
-    (let [a-col-prefix (first col-prefix)
-          b-col-prefix (last col-prefix)
-          a-col-set (.getColNames this)
-          b-col-set (.getColNames b-df)
-          a-col-header (map #(str a-col-prefix "_" %) a-col-set)
-          b-col-header (map #(str b-col-prefix "_" %) b-col-set)]
-        (with-open [wrtr (io/writer output-path)]
-          (.write wrtr (str (str/join "," (concat a-col-header b-col-header)) "\n")))))
   
   (delCol
     [this col-to-del]

From e91560ebc7f321b0a6f4994bab9e9baa3a96c14e Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Wed, 5 Jan 2022 22:13:15 +0800
Subject: [PATCH 27/33] Delete deprecated function declarations

---
 src/main/clojure/clojask/dataframe.clj | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 5878788..20a4e54 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -32,8 +32,6 @@
   (getColNames [] "get column names")
   (printCol [output-path selected-col] "print column names to output file")
   (printColByIndex [output-path selected-index] "print column names to output file")
-  (printAggreCol [output-path] "print column names to output file for aggregate")
-  (printJoinCol [b-df a-keys b-keys output-path col-prefix] "print column names to output file for join")
   (delCol [col-to-del] "delete one or more columns in the dataframe")
   (reorderCol [new-col-order] "reorder columns in the dataframe")
   (renameCol [new-col-names] "rename columns in the dataframe")

From 50816100afe726f10c4243e0a497cc25daecba08 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Wed, 5 Jan 2022 22:28:27 +0800
Subject: [PATCH 28/33] Unify printCol functions as one

---
 src/main/clojure/clojask/dataframe.clj | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 20a4e54..6208d17 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -30,8 +30,7 @@
   (colTypes [] "get column type in ColInfo")
   (getColIndex [] "get column indices, excluding deleted columns")
   (getColNames [] "get column names")
-  (printCol [output-path selected-col] "print column names to output file")
-  (printColByIndex [output-path selected-index] "print column names to output file")
+  (printCol [output-path selected-index] "print column names to output file")
   (delCol [col-to-del] "delete one or more columns in the dataframe")
   (reorderCol [new-col-order] "reorder columns in the dataframe")
   (renameCol [new-col-names] "rename columns in the dataframe")
@@ -145,16 +144,17 @@
               aggre-new-keys (.getAggreNewKeys (:row-info this))]
               (concat groupby-keys aggre-new-keys))))
 
-  (printCol
-  ;; print column names, called by compute and computeAggre
-    [this output-path selected-col]
-    (.checkOutputPath this output-path)
-    (let [col-set (if (= selected-col [nil]) (.getColNames this) selected-col)]
-      (with-open [wrtr (io/writer output-path)]
-        (.write wrtr (str (str/join "," col-set) "\n")))))
+  ;; !! deprecated
+  ;; (printCol
+  ;; ;; print column names, called by compute and computeAggre
+  ;;   [this output-path selected-col]
+  ;;   (.checkOutputPath this output-path)
+  ;;   (let [col-set (if (= selected-col [nil]) (.getColNames this) selected-col)]
+  ;;     (with-open [wrtr (io/writer output-path)]
+  ;;       (.write wrtr (str (str/join "," col-set) "\n")))))
 
-  (printColByIndex
-    ;; print column names, called by computeGroupByAggre
+  (printCol
+    ;; print column names, called by compute, computeAggre and computeGroupByAggre
     [this output-path selected-index]
     (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))]
       (with-open [wrtr (io/writer output-path)]
@@ -251,7 +251,7 @@
       (if (<= num-worker 8)
         (try
           (.final this)
-          (.printCol this output-dir select) ;; to-do: based on the index => Done
+          (.printCol this output-dir index) ;; to-do: based on the index => Done
           (let [res (start-onyx num-worker batch-size this output-dir exception order index)]
             (if (= res "success")
               "success"
@@ -302,7 +302,7 @@
           (println (str "Since the dataframe is only grouped by but not aggregated, the result will be the same as to choose the distinct values of "
                         "the groupby keys."))
           ;; (.printCol this output-dir select) ;; todo: based on "select"
-          (.printColByIndex this output-dir select) ;; todo: based on "select"
+          (.printCol this output-dir select) ;; todo: based on "select"
           (if (= res "success")
           ;;  (if (= "success" (start-onyx-aggre num-worker batch-size this output-dir (.getGroupbyKeys (:row-info this)) exception))
             (let [shift-func (fn [pair]

From 6978df5ed74fb2ece62140ae4d629670b680dad4 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Wed, 5 Jan 2022 22:29:08 +0800
Subject: [PATCH 29/33] Update test file to include only calling group-by and
 only calling aggregate

---
 test/clojask/core_test.clj            | 15 +++++++++++++++
 test/clojask/correct_outputs/1-10.csv |  2 ++
 test/clojask/correct_outputs/1-11.csv |  5 +++++
 3 files changed, 22 insertions(+)
 create mode 100644 test/clojask/correct_outputs/1-10.csv
 create mode 100644 test/clojask/correct_outputs/1-11.csv

diff --git a/test/clojask/core_test.clj b/test/clojask/core_test.clj
index 1e9cbec..e60a92e 100644
--- a/test/clojask/core_test.clj
+++ b/test/clojask/core_test.clj
@@ -51,6 +51,21 @@
     (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-3.csv) <(sort test/clojask/correct_outputs/1-3.csv)")]
         (is (= "" (:out result))) 
         (is (= "" (:err result))))
+    ;; aggregate only
+    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
+    (set-type y "Salary" "double")
+    (aggregate y aggre/max ["Salary"] ["new-Salary"])
+    (compute y 8 "test/clojask/test_outputs/1-10.csv" :exception false)
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-10.csv) <(sort test/clojask/correct_outputs/1-10.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
+    ;; groupby only
+    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
+    (group-by y ["Department"])
+    (compute y 8 "test/clojask/test_outputs/1-11.csv" :exception false)
+    (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-11.csv) <(sort test/clojask/correct_outputs/1-11.csv)")]
+        (is (= "" (:out result))) 
+        (is (= "" (:err result))))
     ))
 
 (deftest col-api-test
diff --git a/test/clojask/correct_outputs/1-10.csv b/test/clojask/correct_outputs/1-10.csv
new file mode 100644
index 0000000..7bdcd99
--- /dev/null
+++ b/test/clojask/correct_outputs/1-10.csv
@@ -0,0 +1,2 @@
+new-Salary
+50000.0
diff --git a/test/clojask/correct_outputs/1-11.csv b/test/clojask/correct_outputs/1-11.csv
new file mode 100644
index 0000000..f1d7b1d
--- /dev/null
+++ b/test/clojask/correct_outputs/1-11.csv
@@ -0,0 +1,5 @@
+Department
+12
+21
+13
+11

From ad9f5aee2168b52b24d9b68e576bb32312a5eaaa Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Sat, 8 Jan 2022 11:45:14 +0800
Subject: [PATCH 30/33] Added checkInputPathClash for basic compute function

---
 src/main/clojure/clojask/dataframe.clj | 27 +++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 6208d17..5ce89d1 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -21,7 +21,9 @@
 
 (definterface DFIntf
   (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select] "final evaluatation")
+  (getPath [] "get input path of dataframe")
   (checkOutputPath [output-path] "check if output path is of string type")
+  (checkInputPathClash [path] "check if path clashs with dataframe input path")
   (operate [operation colName] "operate an operation to column and replace in place")
   (operate [operation colName newCol] "operate an operation to column and add the result as new column")
   (setType [type colName] "types supported: int double string date")
@@ -58,11 +60,33 @@
             ^Boolean have-col]
   DFIntf
 
+  (getPath
+    [this]
+    path) ;; returns `path` unchanged; DFIntf documents it as the dataframe's input path
+
   (checkOutputPath
     [this output-path]
     (cond (not (= java.lang.String (type output-path)))
           (throw (Clojask_TypeException. "Output path should be a string."))))
 
+  (checkInputPathClash
+    ;; Guard against overwriting the dataframe's own source file.
+    ;; path: candidate output path, compared with the input path from getPath.
+    ;; Throws Clojask_OperationException when both name the same file; else nil.
+    [this path]
+    ;; Compare canonical paths so that spellings such as "a.csv", "./a.csv"
+    ;; and an absolute path to the same file are all recognised as a clash.
+    ;; java.io.File is used instead of hand-built "file:///" URIs, which threw
+    ;; URISyntaxException for paths with spaces and ignored the working dir.
+    ;; A local fn replaces the former nested defn, which re-interned a
+    ;; namespace-level var on every call.
+    (let [canonical (fn [p] (.getCanonicalPath (io/file p)))
+          output-path-str (canonical path)
+          input-path-str (canonical (.getPath this))]
+      (when (= output-path-str input-path-str)
+        (throw (Clojask_OperationException.
+                "Output path should be different from input path of dataframe argument.")))))
+
   (operate ;; has assert
     [this operation colName]
     (if (nil? (.operate col-info operation colName))
@@ -656,7 +680,8 @@
 (defn compute
   [this num-worker output-dir & {:keys [exception order select exclude] :or {exception false order true select nil exclude nil}}]
   (assert (or (nil? select) (nil? exclude)) "can only specify either of them")
-  (u/init-file output-dir)
+  ;; check if output-dir clashes with input file path
+  (.checkInputPathClash this output-dir)
   ;; check which type of dataframe this is
   (let [exclude (if (coll? exclude) exclude [exclude])
         select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))]

From d949f9f45b342bc189a043a84f277c1e38de3714 Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Sat, 8 Jan 2022 13:14:43 +0800
Subject: [PATCH 31/33] Added checkInputPathClash for JoinDF

---
 src/main/clojure/clojask/dataframe.clj | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 5ce89d1..7734db8 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -509,7 +509,7 @@
 
 ;; ============= Below is the definition for the joineddataframe ================
 (definterface JDFIntf
-  (checkOutputPath [output-path] "check if output path is of string type")
+  (checkInputPathClash [path] "check if paths clashes with dataframes a/b input path")
   (getColNames [] "get the names of all the columns")
   (printCol [output-path selected-col] "print column names to output file")
   (preview [] "preview the column names")
@@ -526,6 +526,11 @@
             limit
             prefix]
   JDFIntf
+  
+  (checkInputPathClash ;; check `path` against the input paths of both joined dataframes
+    [this path]
+    (.checkInputPathClash a path)  ;; delegate to a's own check (presumably throws on a clash - confirm)
+    (.checkInputPathClash b path)) ;; then to b's; the result of this second call is returned
 
   (getColNames
     [this]

From 8e946c698cb4585fdaa3a219d0c3efe44fde097b Mon Sep 17 00:00:00 2001
From: Yuchen Liu <43634213+hkulyc@users.noreply.github.com>
Date: Sat, 8 Jan 2022 17:41:49 +0800
Subject: [PATCH 32/33] minor

---
 src/main/clojure/clojask/dataframe.clj | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index 6208d17..c564686 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -299,8 +299,9 @@
               ;; test (println [groupby-keys aggre-keys select pre-index data-index])
               res (start-onyx-groupby num-worker batch-size this "_clojask/grouped/" groupby-keys groupby-index exception)]
           ;(.printAggreCol this output-dir) ;; print column names to output-dir
-          (println (str "Since the dataframe is only grouped by but not aggregated, the result will be the same as to choose the distinct values of "
-                        "the groupby keys."))
+          (if (= aggre-keys [])
+            (println (str "Since the dataframe is only grouped by but not aggregated, the result will be the same as to choose the distinct values of "
+                          "the groupby keys.")))
           ;; (.printCol this output-dir select) ;; todo: based on "select"
           (.printCol this output-dir select) ;; todo: based on "select"
           (if (= res "success")

From 2b5a725f958212f8e7db9a2e56031c6000e53d6d Mon Sep 17 00:00:00 2001
From: Angel Woo <awoo424@gmail.com>
Date: Sat, 8 Jan 2022 22:11:35 +0800
Subject: [PATCH 33/33] Solve bug of mis-deleting init-file line in compute

---
 src/main/clojure/clojask/dataframe.clj | 2 ++
 src/main/clojure/clojask/groupby.clj   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/main/clojure/clojask/dataframe.clj b/src/main/clojure/clojask/dataframe.clj
index d0c88c9..48bd325 100644
--- a/src/main/clojure/clojask/dataframe.clj
+++ b/src/main/clojure/clojask/dataframe.clj
@@ -688,6 +688,8 @@
   (assert (or (nil? select) (nil? exclude)) "can only specify either of them")
   ;; check if output-dir clashes with input file path
   (.checkInputPathClash this output-dir)
+  ;; initialise file
+  (u/init-file output-dir)
   ;; check which type of dataframe this is
   (let [exclude (if (coll? exclude) exclude [exclude])
         select (if select select (if (not= [nil] exclude) (doall (remove (fn [item] (.contains exclude item)) (.getColNames this))) nil))]
diff --git a/src/main/clojure/clojask/groupby.clj b/src/main/clojure/clojask/groupby.clj
index 8d2c7b2..8805af2 100644
--- a/src/main/clojure/clojask/groupby.clj
+++ b/src/main/clojure/clojask/groupby.clj
@@ -1,6 +1,6 @@
 (ns clojask.groupby
   (:require [clojure.java.io :as io]
-            [clojure-csv.core :as csv]
+            ;[clojure-csv.core :as csv]
             [clojask.utils :as u]
             [clojure.core.async :as async]))
 "contains the utility functions to group by and aggregate"