rapidsai · wmalpica · Aug 8, 2018 · Aug 17, 2018 · Aug 20, 2018 · Aug 21, 2018
diff --git a/.gitmodules b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "thirdparty/moderngpu"]
 	path = thirdparty/moderngpu
 	url = https://github.com/moderngpu/moderngpu.git
+[submodule "thirdparty/jitify"]
+	path = thirdparty/jitify
+	url = git@github.com:NVIDIA/jitify.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -73,8 +73,10 @@ else()
 endif()
 
 include_directories(
+    "${CMAKE_CURRENT_SOURCE_DIR}/src"
     "${CMAKE_CURRENT_SOURCE_DIR}/include"
     "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/cub"
+    "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/jitify"
     "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/moderngpu/src"
     "${CUDA_INCLUDE_DIRS}"
     "${ARROW_INCLUDEDIR}"
@@ -107,6 +109,12 @@ if(HT_LEGACY_ALLOCATOR)
 endif()
 
 
+## Binary Operators
+# src/binary/CMakeLists.txt fills `gdfs_source_files` inside its own directory
+# scope; read that definition back so it can be passed to the gdf library
+# target. The directory is given relative to the current source dir (the same
+# base add_subdirectory() uses) and quoted so paths with spaces survive.
+add_subdirectory(src/binary)
+get_directory_property(gdfs_source_files
+    DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/src/binary"
+    DEFINITION gdfs_source_files)
+##
+
+
 cuda_add_library(gdf SHARED
     src/binaryops.cu
     src/column.cpp
@@ -121,6 +129,7 @@ cuda_add_library(gdf SHARED
     src/segmented_sorting.cu
     src/datetimeops.cu
     src/sqls_ops.cu
+    ${gdfs_source_files}
 )
 
 target_link_libraries(gdf arrow)

diff --git a/include/gdf/cffi/functions.h b/include/gdf/cffi/functions.h
@@ -321,8 +321,25 @@ gdf_error gdf_extract_datetime_minute(gdf_column *input, gdf_column *output);
 gdf_error gdf_extract_datetime_second(gdf_column *input, gdf_column *output);
 
 
-/* binary operators */
+/**
+ * Binary Operations
+ *
+ * Generic binary operations whose operator is chosen at run time through `ope`.
+ * The name suffix lists the operand kinds in parameter order (out, vax, vay):
+ * `v` is a gdf_column* parameter and `s` is a gdf_scalar* parameter.
+ */
+/* out: column, vax: scalar, vay: column. */
+gdf_error gdf_binary_operation_v_s_v(gdf_column* out, gdf_scalar* vax, gdf_column* vay, gdf_binary_operator ope);
+
+/* out: column, vax: column, vay: scalar. */
+gdf_error gdf_binary_operation_v_v_s(gdf_column* out, gdf_column* vax, gdf_scalar* vay, gdf_binary_operator ope);
+
+/* out: column, vax: column, vay: column. */
+gdf_error gdf_binary_operation_v_v_v(gdf_column* out, gdf_column* vax, gdf_column* vay, gdf_binary_operator ope);
+
+
+/* Same operand kinds as gdf_binary_operation_v_s_v plus a scalar `def`.
+ * NOTE(review): `def` looks like the value used in place of null column
+ * elements - confirm against the selected implementation. */
+gdf_error gdf_binary_operation_v_s_v_d(gdf_column* out, gdf_scalar* vax, gdf_column* vay, gdf_scalar* def, gdf_binary_operator ope);
 
+/* out: column, vax: column, vay: scalar, def: scalar default value. */
+gdf_error gdf_binary_operation_v_v_s_d(gdf_column* out, gdf_column* vax, gdf_scalar* vay, gdf_scalar* def, gdf_binary_operator ope);
+
+/* out: column, vax: column, vay: column, def: scalar default value. */
+gdf_error gdf_binary_operation_v_v_v_d(gdf_column* out, gdf_column* vax, gdf_column* vay, gdf_scalar* def, gdf_binary_operator ope);
+
+
+
+/* binary operators */
 /* arith */
 
 gdf_error gdf_add_generic(gdf_column *lhs, gdf_column *rhs, gdf_column *output);

diff --git a/include/gdf/cffi/types.h b/include/gdf/cffi/types.h
@@ -8,6 +8,10 @@ typedef enum {
     GDF_INT16,
     GDF_INT32,
     GDF_INT64,
+    GDF_UINT8,
+    GDF_UINT16,
+    GDF_UINT32,
+    GDF_UINT64,
     GDF_FLOAT32,
     GDF_FLOAT64,
     GDF_DATE32,   // int32_t days since the UNIX epoch
@@ -16,6 +20,23 @@ typedef enum {
     N_GDF_TYPES, /* additional types should go BEFORE N_GDF_TYPES */
 } gdf_dtype;
 
+/**
+ * Storage for one scalar value; each member views the same bytes as a
+ * different C type. The union itself does not record which member is in use
+ * (gdf_scalar pairs it with a gdf_dtype tag).
+ *
+ * Declared through a typedef so the plain name `gdf_data` is a type in C
+ * (this header is consumed as C), not only in C++. The union tag is kept, so
+ * `union gdf_data` continues to work as well.
+ */
+typedef union gdf_data {
+    void*    invd;  /* untyped pointer view - NOTE(review): intended use not visible here */
+    int8_t   si08;  /* signed 8-bit integer */
+    int16_t  si16;  /* signed 16-bit integer */
+    int32_t  si32;  /* signed 32-bit integer */
+    int64_t  si64;  /* signed 64-bit integer */
+    uint8_t  ui08;  /* unsigned 8-bit integer */
+    uint16_t ui16;  /* unsigned 16-bit integer */
+    uint32_t ui32;  /* unsigned 32-bit integer */
+    uint64_t ui64;  /* unsigned 64-bit integer */
+    float    fp32;  /* single-precision float */
+    double   fp64;  /* double-precision float */
+    int32_t  dt32;  // GDF_DATE32
+    int64_t  dt64;  // GDF_DATE64
+    int64_t  tmst;  // GDF_TIMESTAMP
+} gdf_data;
+
+
 typedef enum {
     GDF_SUCCESS=0,
     GDF_CUDA_ERROR,
@@ -47,6 +68,11 @@ typedef struct {
 	// here we can also hold info for decimal datatype or any other datatype that requires additional information
 } gdf_dtype_extra_info;
 
+/**
+ * A single typed value: `data` holds the bits and `dtype` names the type.
+ *
+ * Declared through a typedef so `gdf_scalar` can be used as a bare type name
+ * from C, as the declarations in functions.h do. The member is spelled
+ * `union gdf_data` so the definition is valid C whether or not the union has
+ * a typedef of its own.
+ */
+typedef struct gdf_scalar {
+    union gdf_data data;   /* the value */
+    gdf_dtype      dtype;  /* type tag for `data` */
+} gdf_scalar;
+
 typedef struct gdf_column_{
     void *data;
     gdf_valid_type *valid;
@@ -71,6 +97,29 @@ typedef enum {
   N_GDF_AGG_OPS, /* additional aggregation ops should go BEFORE N_GDF_... */
 } gdf_agg_op;
 
+
+/**
+ * Operator selector for the gdf_binary_operation_* functions.
+ *
+ * Declared through a typedef so `gdf_binary_operator` can be used as a bare
+ * type name from C, matching the other enums in this header. The enum tag is
+ * kept, so `enum gdf_binary_operator` still works.
+ */
+typedef enum gdf_binary_operator {
+    /* arithmetic */
+    GDF_ADD,
+    GDF_SUB,
+    GDF_MUL,
+    GDF_DIV,
+    GDF_TRUE_DIV,
+    GDF_FLOOR_DIV,
+    GDF_MOD,
+    GDF_POW,
+    //GDF_COMBINE,
+    //GDF_COMBINE_FIRST,
+    //GDF_ROUND,
+    /* comparison */
+    GDF_EQUAL,
+    GDF_NOT_EQUAL,
+    GDF_LESS,
+    GDF_GREATER,
+    GDF_LESS_EQUAL,
+    GDF_GREATER_EQUAL,
+    //GDF_PRODUCT,
+    //GDF_DOT
+} gdf_binary_operator;
+
 /* additonal flags */
 typedef struct gdf_context_{
   int flag_sorted;        /* 0 = No, 1 = yes */

diff --git a/src/binary/CMakeLists.txt b/src/binary/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Chooses which binary-operation sources are added to `gdfs_source_files`,
+# the list the parent CMakeLists.txt reads back from this directory.
+#
+#   BINARY_OPERATION_VERSION  not defined -> no sources are added
+#                             V1          -> sources under common/
+#                             V2          -> sources under binary2/
+#
+# Example:
+#   cmake -DCMAKE_BUILD_TYPE=Release -DBINARY_OPERATION_VERSION:STRING=V1 ../../code/libgdf
+
+if (NOT DEFINED BINARY_OPERATION_VERSION)
+    # Leave the variable defined (and empty) so the parent's lookup succeeds.
+    list(APPEND gdfs_source_files "")
+    return()
+endif()
+
+
+# The expansion is quoted so an empty or space-containing value cannot break
+# the if() argument list.
+if ("${BINARY_OPERATION_VERSION}" STREQUAL "V1")
+    message(STATUS "BINARY_OPERATION_VERSION: V1 Selected")
+    list(APPEND gdfs_source_files
+         "${CMAKE_CURRENT_LIST_DIR}/common/types.cpp"
+         "${CMAKE_CURRENT_LIST_DIR}/common/mediator.cu"
+    )
+elseif ("${BINARY_OPERATION_VERSION}" STREQUAL "V2")
+    message(STATUS "BINARY_OPERATION_VERSION: V2 Selected")
+    list(APPEND gdfs_source_files
+         "${CMAKE_CURRENT_LIST_DIR}/binary2/binary.cpp"
+         "${CMAKE_CURRENT_LIST_DIR}/binary2/kernel_gdf_data.cpp"
+         "${CMAKE_CURRENT_LIST_DIR}/binary2/kernel.cpp"
+         "${CMAKE_CURRENT_LIST_DIR}/binary2/launcher.cpp"
+         "${CMAKE_CURRENT_LIST_DIR}/binary2/operation.cpp"
+         "${CMAKE_CURRENT_LIST_DIR}/binary2/traits.cpp"
+         "${CMAKE_CURRENT_LIST_DIR}/binary2/type.cpp"
+    )
+else()
+    # Previously an unrecognised value was ignored without any diagnostic.
+    # Configuration still continues with no sources added, but now says so.
+    message(WARNING
+        "BINARY_OPERATION_VERSION: unsupported value '${BINARY_OPERATION_VERSION}' "
+        "(expected V1 or V2); no binary operation sources were added")
+endif()
diff --git a/src/binary/binary2/binary.cpp b/src/binary/binary2/binary.cpp
@@ -0,0 +1,61 @@
+#include "gdf/gdf.h"
+#include "binary/binary2/launcher.h"
+
+namespace gdf {
+    // Overloads of binary_operation, one per operand combination. Each one picks a
+    // kernel by name through gdf::Launcher, instantiates it for the given
+    // operands/operator and launches it.
+    // NOTE(review): every overload returns GDF_SUCCESS unconditionally; nothing
+    // produced by the launcher chain is inspected here - confirm how failures
+    // are meant to surface.
+
+    // Column `vax` with scalar `vay`, kernel "kernel_v_s".
+    gdf_error binary_operation(gdf_column* out, gdf_column* vax, gdf_scalar* vay, gdf_binary_operator ope) {
+        gdf::Launcher::launch().kernel("kernel_v_s")
+                               .instantiate(out, vax, vay, ope)
+                               .launch(out, vax, vay);
+
+        return GDF_SUCCESS;
+    }
+
+    // Column `vax` with column `vay`, kernel "kernel_v_v".
+    gdf_error binary_operation(gdf_column* out, gdf_column* vax, gdf_column* vay, gdf_binary_operator ope) {
+        gdf::Launcher::launch().kernel("kernel_v_v")
+                               .instantiate(out, vax, vay, ope)
+                               .launch(out, vax, vay);
+
+        return GDF_SUCCESS;
+    }
+
+    // Column `vax` with scalar `vay` and default scalar `def`, kernel "kernel_v_s_d".
+    gdf_error binary_operation(gdf_column* out, gdf_column* vax, gdf_scalar* vay, gdf_scalar* def, gdf_binary_operator ope) {
+        gdf::Launcher::launch().kernel("kernel_v_s_d")
+                               .instantiate(out, vax, vay, def, ope)
+                               .launch(out, vax, vay, def);
+
+        return GDF_SUCCESS;
+    }
+
+    // Column `vax` with column `vay` and default scalar `def`, kernel "kernel_v_v_d".
+    gdf_error binary_operation(gdf_column* out, gdf_column* vax, gdf_column* vay, gdf_scalar* def, gdf_binary_operator ope) {
+        gdf::Launcher::launch().kernel("kernel_v_v_d")
+                               .instantiate(out, vax, vay, def, ope)
+                               .launch(out, vax, vay, def);
+
+        return GDF_SUCCESS;
+    }
+}
+
+
+// C entry point taking a scalar first operand and a column second operand.
+// NOTE(review): the operands are forwarded swapped (column first, scalar
+// second) to the column/scalar overload, so a non-commutative operator is
+// evaluated as `column <ope> scalar` - confirm this is intended.
+gdf_error gdf_binary_operation_v_s_v(gdf_column* out, gdf_scalar* vax, gdf_column* vay, gdf_binary_operator ope) {
+    gdf_column* column_operand = vay;
+    gdf_scalar* scalar_operand = vax;
+    return gdf::binary_operation(out, column_operand, scalar_operand, ope);
+}
+
+// C entry point for a column first operand and a scalar second operand;
+// forwards to the matching gdf::binary_operation overload.
+gdf_error gdf_binary_operation_v_v_s(gdf_column* out, gdf_column* vax, gdf_scalar* vay, gdf_binary_operator ope) {
+    const gdf_error status = gdf::binary_operation(out, vax, vay, ope);
+    return status;
+}
+
+// C entry point for two column operands; forwards to the matching
+// gdf::binary_operation overload.
+gdf_error gdf_binary_operation_v_v_v(gdf_column* out, gdf_column* vax, gdf_column* vay, gdf_binary_operator ope) {
+    const gdf_error status = gdf::binary_operation(out, vax, vay, ope);
+    return status;
+}
+
+// C entry point taking a scalar first operand, a column second operand and a
+// default scalar.
+// NOTE(review): the operands are forwarded swapped (column first, scalar
+// second), so a non-commutative operator is evaluated as
+// `column <ope> scalar` - confirm this is intended.
+gdf_error gdf_binary_operation_v_s_v_d(gdf_column* out, gdf_scalar* vax, gdf_column* vay, gdf_scalar* def, gdf_binary_operator ope) {
+    gdf_column* column_operand = vay;
+    gdf_scalar* scalar_operand = vax;
+    return gdf::binary_operation(out, column_operand, scalar_operand, def, ope);
+}
+
+// C entry point for a column first operand, a scalar second operand and a
+// default scalar; forwards to the matching gdf::binary_operation overload.
+gdf_error gdf_binary_operation_v_v_s_d(gdf_column* out, gdf_column* vax, gdf_scalar* vay, gdf_scalar* def, gdf_binary_operator ope) {
+    const gdf_error status = gdf::binary_operation(out, vax, vay, def, ope);
+    return status;
+}
+
+// C entry point for two column operands and a default scalar; forwards to the
+// matching gdf::binary_operation overload.
+gdf_error gdf_binary_operation_v_v_v_d(gdf_column* out, gdf_column* vax, gdf_column* vay, gdf_scalar* def, gdf_binary_operator ope) {
+    const gdf_error status = gdf::binary_operation(out, vax, vay, def, ope);
+    return status;
+}
diff --git a/src/binary/binary2/cuda.h b/src/binary/binary2/cuda.h
@@ -0,0 +1,15 @@
+#ifndef GDF_BINARY_CUDA_H
+#define GDF_BINARY_CUDA_H
+
+// Declarations of the C strings defined by the sibling .cpp files of the
+// binary2 implementation.
+// NOTE(review): these presumably hold CUDA source text compiled at run time
+// (kernel.cpp defines `kernel` that way) - confirm for the other three.
+namespace gdf {
+namespace cuda {
+
+    extern const char* kernel;
+    extern const char* kernel_gdf_data;
+    extern const char* operation;
+    extern const char* traits;
+
+}  // namespace cuda
+}  // namespace gdf
+
+#endif  // GDF_BINARY_CUDA_H
diff --git a/src/binary/binary2/kernel.cpp b/src/binary/binary2/kernel.cpp
@@ -0,0 +1,141 @@
+namespace gdf {
+namespace cuda {
+
+const char* kernel =
+R"***(
+    #include <cstdint>
+    #include "traits.h"
+    #include "operation.h"
+    #include "kernel_gdf_data.h"
+
+    #define WARP_SIZE 32
+    #define WARP_MASK 0xFFFFFFFF
+
+    // Reads the 32-bit validity word that contains element `tid`
+    // (word index tid / WARP_SIZE) and returns the bits of it selected by `mask`.
+    // A zero result means none of the selected bits is set.
+    __device__ __forceinline__
+    uint32_t isValid(int tid, uint32_t* valid, uint32_t mask) {
+        const uint32_t word = valid[tid / WARP_SIZE];
+        return word & mask;
+    }
+
+    // Warp-level sum of `mask`: for distances 16, 8, 4, 2, 1 every lane adds the
+    // value held by the lane that many positions above it, so lane 0 finishes
+    // with the sum over the whole warp. The shuffle names all 32 lanes
+    // (WARP_MASK), so every lane of the warp has to make this call.
+    __device__ __forceinline__
+    void shiftMask(uint32_t& mask) {
+        #pragma unroll
+        for (int delta = WARP_SIZE / 2; delta >= 1; delta >>= 1) {
+            const uint32_t incoming = __shfl_down_sync(WARP_MASK, mask, delta);
+            mask += incoming;
+        }
+    }
+
+    // out_data[i] = TypeOpe(vax_data[i], vay_data converted to TypeVay) for every
+    // i in [0, size). Each thread starts at its global index and advances by the
+    // total number of launched threads, so any grid size covers the whole range.
+    template <typename TypeOut, typename TypeVax, typename TypeVay, typename TypeOpe>
+    __global__
+    void kernel_v_s(int size, TypeOut* out_data, TypeVax* vax_data, gdf_data vay_data) {
+        const int lane_in_block = threadIdx.x;
+        const int block_index = blockIdx.x;
+        const int block_width = blockDim.x;
+        const int grid_width = gridDim.x;
+
+        const int stride = block_width * grid_width;
+        int idx = lane_in_block + block_index * block_width;
+
+        while (idx < size) {
+            AbstractOperation<TypeOpe> operation;
+            out_data[idx] = operation.template operate<TypeOut, TypeVax, TypeVay>(vax_data[idx], (TypeVay)vay_data);
+            idx += stride;
+        }
+    }
+
+    // out_data[i] = TypeOpe(vax_data[i], vay_data[i]) for every i in [0, size).
+    // Each thread starts at its global index and advances by the total number of
+    // launched threads, so any grid size covers the whole range.
+    template <typename TypeOut, typename TypeVax, typename TypeVay, typename TypeOpe>
+    __global__
+    void kernel_v_v(int size, TypeOut* out_data, TypeVax* vax_data, TypeVay* vay_data) {
+        const int lane_in_block = threadIdx.x;
+        const int block_index = blockIdx.x;
+        const int block_width = blockDim.x;
+        const int grid_width = gridDim.x;
+
+        const int stride = block_width * grid_width;
+        int idx = lane_in_block + block_index * block_width;
+
+        while (idx < size) {
+            AbstractOperation<TypeOpe> operation;
+            out_data[idx] = operation.template operate<TypeOut, TypeVax, TypeVay>(vax_data[idx], vay_data[idx]);
+            idx += stride;
+        }
+    }
+
+    // Column <ope> scalar with a default value.
+    //
+    // For every i in [0, size): if the validity bit of vax element i is clear,
+    // def_data (converted through TypeDef) is used in its place; the result is
+    // written to out_data[i] and bit (i % 32) of out_valid[i / 32] is set.
+    //
+    // Launch requirement (already implied by the one-word-per-warp layout of the
+    // original code): 1-D blocks whose size is a multiple of WARP_SIZE, so that
+    // the 32 lanes of a warp map onto exactly one validity word.
+    //
+    // Fix over the previous version: the warp-wide primitives used to run inside
+    // `for (i < size)`, so when size is not a multiple of 32 the lanes past the
+    // end never reached them although the full-warp mask named them. The loop
+    // bound below is the first element of the lane's validity word, which is
+    // identical for all lanes of a warp; every lane therefore executes
+    // __ballot_sync, and out-of-range lanes simply vote 0.
+    template <typename TypeOut, typename TypeVax, typename TypeVay, typename TypeDef, typename TypeOpe>
+    __global__
+    void kernel_v_s_d(int size, gdf_data def_data,
+                      TypeOut* out_data, TypeVax* vax_data, gdf_data vay_data,
+                      uint32_t* out_valid, uint32_t* vax_valid) {
+        int tid = threadIdx.x;
+        int blkid = blockIdx.x;
+        int blksz = blockDim.x;
+        int gridsz = gridDim.x;
+
+        int start = tid + blkid * blksz;
+        int step = blksz * gridsz;
+
+        for (int i=start; (i - (i % WARP_SIZE)) < size; i+=step) {
+            const bool in_range = i < size;
+
+            if (in_range) {
+                // 1u: shifting a signed 1 into bit 31 is not well defined.
+                const uint32_t mask = 1u << (i % WARP_SIZE);
+
+                TypeVax vax_data_aux = vax_data[i];
+                if (isValid(i, vax_valid, mask) == 0) {
+                    vax_data_aux = (TypeDef)def_data;
+                }
+
+                AbstractOperation<TypeOpe> operation;
+                out_data[i] = operation.template operate<TypeOut, TypeVax, TypeVay>(vax_data_aux, (TypeVay)vay_data);
+            }
+
+            // Every in-range element has a value (real or default), so its
+            // output bit is set; the vote assembles the word in one step.
+            const uint32_t out_word = __ballot_sync(WARP_MASK, in_range);
+
+            if ((i % WARP_SIZE) == 0) {
+                out_valid[i / WARP_SIZE] = out_word;
+            }
+        }
+    }
+
+
+    // Column <ope> column with a default value.
+    //
+    // For every i in [0, size):
+    //   - both operands valid      -> out_data[i] = ope(vax[i], vay[i]), bit set
+    //   - exactly one operand null -> the null one is replaced by def_data
+    //                                 (converted through TypeDef), bit set
+    //   - both operands null       -> out_data[i] is left untouched, bit clear
+    //
+    // Launch requirement (already implied by the one-word-per-warp layout of the
+    // original code): 1-D blocks whose size is a multiple of WARP_SIZE, so that
+    // the 32 lanes of a warp map onto exactly one validity word.
+    //
+    // Fix over the previous version: the warp-wide primitives used to run inside
+    // `for (i < size)`, so when size is not a multiple of 32 the lanes past the
+    // end never reached them although the full-warp mask named them. The loop
+    // bound below is the first element of the lane's validity word, which is
+    // identical for all lanes of a warp; every lane therefore executes
+    // __ballot_sync, and out-of-range lanes simply vote 0.
+    template <typename TypeOut, typename TypeVax, typename TypeVay, typename TypeDef, typename TypeOpe>
+    __global__
+    void kernel_v_v_d(int size, gdf_data def_data,
+                      TypeOut* out_data, TypeVax* vax_data, TypeVay* vay_data,
+                      uint32_t* out_valid, uint32_t* vax_valid, uint32_t* vay_valid) {
+        int tid = threadIdx.x;
+        int blkid = blockIdx.x;
+        int blksz = blockDim.x;
+        int gridsz = gridDim.x;
+
+        int start = tid + blkid * blksz;
+        int step = blksz * gridsz;
+
+        for (int i=start; (i - (i % WARP_SIZE)) < size; i+=step) {
+            bool is_out_valid = false;
+
+            if (i < size) {
+                // 1u: shifting a signed 1 into bit 31 is not well defined.
+                const uint32_t mask = 1u << (i % WARP_SIZE);
+                const bool is_vax_valid = isValid(i, vax_valid, mask) != 0;
+                const bool is_vay_valid = isValid(i, vay_valid, mask) != 0;
+
+                // The result exists as soon as at least one operand is valid.
+                is_out_valid = is_vax_valid || is_vay_valid;
+
+                if (is_out_valid) {
+                    TypeVax vax_data_aux = vax_data[i];
+                    TypeVay vay_data_aux = vay_data[i];
+                    if (!is_vax_valid) {
+                        vax_data_aux = (TypeDef)def_data;
+                    }
+                    else if (!is_vay_valid) {
+                        vay_data_aux = (TypeDef)def_data;
+                    }
+
+                    AbstractOperation<TypeOpe> operation;
+                    out_data[i] = operation.template operate<TypeOut, TypeVax, TypeVay>(vax_data_aux, vay_data_aux);
+                }
+            }
+
+            // Bit k of the vote is lane k's flag, i.e. the validity of element
+            // (word base + k); out-of-range lanes contribute 0.
+            const uint32_t out_word = __ballot_sync(WARP_MASK, is_out_valid);
+
+            if ((i % WARP_SIZE) == 0) {
+                out_valid[i / WARP_SIZE] = out_word;
+            }
+        }
+    }
+)***";
+}
+}