Trying to reproduce #4

owensgroup · Feb 7, 2023 · 8a8a042 · 8a8a042
1 parent f30e750
commit 8a8a042
Show file tree

Hide file tree

Showing 7 changed files with 121 additions and 13 deletions.
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -6,21 +6,31 @@ RUN apt-get clean && apt-get update -y -qq
 RUN apt-get install -y wget git build-essential
 
 # Install conda
-ENV PATH="/root/anaconda3/bin:${PATH}"
-RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh &&\
-    chmod +x Miniconda3-latest-Linux-x86_64.sh &&\
-    ./Miniconda3-latest-Linux-x86_64.sh -b -p /root/anaconda3
+#ENV PATH="/root/anaconda3/bin:${PATH}"
+#RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh &&\
+#    chmod +x Miniconda3-latest-Linux-x86_64.sh &&\
+#    ./Miniconda3-latest-Linux-x86_64.sh -b -p /root/anaconda3
 
 # Create conda environment with dependencies
-COPY environment.yml .
-RUN conda env create -f environment.yml
-SHELL ["conda", "run", "-n", "mvgpubtree", "/bin/bash", "-c"]
+#COPY environment.yml .
+#RUN conda env create -f environment.yml
+#SHELL ["conda", "run", "-n", "mvgpubtree", "/bin/bash", "-c"]
 
 # Clone MVGpuBTree code from github
 RUN git clone https://github.com/owensgroup/MVGpuBTree.git
 
+#RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y hip-runtime-nvidia hip-dev
+
+# Install CMake
+ARG CMAKE_VERSION=3.18.0
+RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh &&\
+    chmod +x cmake-${CMAKE_VERSION}-linux-x86_64.sh &&\
+    mkdir /opt/cmake && \
+    ./cmake-${CMAKE_VERSION}-linux-x86_64.sh  --skip-license --prefix=/opt/cmake  &&\
+    ln -s /opt/cmake/bin/* /usr/local/bin/
+
 # Activate conda
-ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "mvgpubtree"]
+#ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "mvgpubtree"]
 
 # Building the image:
 # docker build -t mvgpubtree .

diff --git a/docker/build.sh b/docker/build.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+# Build the "mvgpubtree" image from docker/Dockerfile, using the current
+# directory as the build context. Both paths are relative, so this must be
+# run from the directory that contains docker/ (presumably the repository root).
+docker build -t mvgpubtree -f docker/Dockerfile .
diff --git a/docker/run.sh b/docker/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Start an interactive, auto-removed container named "trees" from the
+# mvgpubtree image with all GPUs exposed. The current directory is
+# bind-mounted at the same absolute path inside the container and used as
+# the container's working directory.
+
+# Physical (symlink-resolved) path of the directory the script is run from.
+git_directory=$(pwd -P)
+
+# Every expansion is quoted so that a path containing spaces or glob
+# characters reaches docker as a single argument.
+docker run -it --rm --name trees --gpus all \
+  -v "${git_directory}:${git_directory}" \
+  -w "${git_directory}" \
+  mvgpubtree /bin/bash
diff --git a/include/device_bump_allocator.hpp b/include/device_bump_allocator.hpp
@@ -18,7 +18,7 @@
 #include <cstdint>
 
 // 67108864 is 8 Gibs when sizeof(T) = 128
-template <class T, std::size_t MaxTCount = 67108864>
+template <class T, std::size_t MaxTCount = 67108864 / 2>
 struct device_bump_allocator {
   using size_type       = uint32_t;
   using difference_type = uint32_t;

diff --git a/include/gpu_blink_tree.hpp b/include/gpu_blink_tree.hpp
@@ -35,8 +35,8 @@
 #include <device_bump_allocator.hpp>
 #include <slab_alloc.hpp>
 
-//#define DEBUG_LOCKS
-// #define DEBUG_STRUCTURE
+// #define DEBUG_LOCKS
+//  #define DEBUG_STRUCTURE
 
 #ifdef DEBUG_STRUCTURE
 #define DEBUG_STRUCTURE_PRINT(fmt, ...)         \
@@ -1024,6 +1024,7 @@ struct gpu_blink_tree {
     double tree_size_gbs = double(num_nodes) * sizeof(node_type<Key, Value, B>) / (1ull << 30);
     return tree_size_gbs;
   }
+  allocator_type allocator_;
 
  private:
   template <typename key_type,
@@ -1124,7 +1125,5 @@ struct gpu_blink_tree {
   size_type* h_node_count_;
 
   size_type* d_root_index_;
-
-  allocator_type allocator_;
 };
 }  // namespace GpuBTree
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -1,5 +1,12 @@
 project (test)
 
+# Reproducer for issue #4 (built from repro_4.cu)
+add_executable(repro_4 repro_4.cu)
+set_target_properties(repro_4 PROPERTIES
+								FOLDER "tests")
+target_link_libraries(repro_4 PRIVATE gpu_btrees)
+
+
 
 # concurrent insert range query
 add_executable(concurrent_insert_rq_vtree concurrent_insert_rq_vtree.cu)

diff --git a/test/repro_4.cu b/test/repro_4.cu
@@ -0,0 +1,85 @@
+#include <gpu_btree.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <unordered_set>
+#include <vector>
+
+#include <cooperative_groups.h>
+namespace cg = cooperative_groups;
+
+// Inserts the pair (keys[i], i) into `tree` for every i in [0, keys_count).
+//
+// Each thread owns at most one key. Threads are grouped into cooperative-group
+// tiles of btree::branching_factor lanes; a tile inserts the keys of its lanes
+// one at a time, with all lanes calling tree.cooperative_insert together.
+//
+// keys        pointer to at least keys_count keys, read from device code
+// keys_count  number of keys to insert
+// tree        tree object passed by value; its allocator_ member is used to
+//             build the per-tile allocator context
+//
+// The indexing below is one-dimensional (threadIdx.x / blockIdx.x only).
+template <typename key_type, typename size_type, typename btree>
+__global__ void modified_insert_kernel(const key_type* keys,
+                                       const size_type keys_count,
+                                       btree tree) {
+  auto thread_id = threadIdx.x + blockIdx.x * blockDim.x;
+  auto block     = cg::this_thread_block();
+  auto tile      = cg::tiled_partition<btree::branching_factor>(block);
+
+  // thread_id - tile.thread_rank() is the global index of the tile's first
+  // lane. If even that lane is past the end, no lane of this tile owns a key,
+  // so the whole tile leaves together.
+  if ((thread_id - tile.thread_rank()) >= keys_count) { return; }
+
+  // Lanes past the end of the input stay in the tile to take part in the
+  // collective calls below, but keep the invalid key/value and never request
+  // an insertion.
+  auto key       = btree::invalid_key;
+  auto value     = btree::invalid_value;
+  bool to_insert = false;
+  if (thread_id < keys_count) {
+    key       = keys[thread_id];
+    value     = thread_id;
+    to_insert = true;
+  }
+  using allocator_type = typename btree::device_allocator_context_type;
+  allocator_type allocator{tree.allocator_, tile};
+
+  // Bit r of work_queue is set while lane r still has a key to insert.
+  auto work_queue = tile.ballot(to_insert);
+  while (work_queue) {
+    // Serve the lowest-ranked pending lane: broadcast its pair to the tile.
+    auto cur_rank  = __ffs(work_queue) - 1;
+    auto cur_key   = tile.shfl(key, cur_rank);
+    auto cur_value = tile.shfl(value, cur_rank);
+
+    tree.cooperative_insert(cur_key, cur_value, tile, allocator);
+
+    // Only the lane whose key was just inserted clears its request.
+    if (tile.thread_rank() == cur_rank) { to_insert = false; }
+    work_queue = tile.ballot(to_insert);
+  }
+}
+
+// Host driver for the issue #4 reproduction.
+//
+// Generates 2^25 distinct random 32-bit keys (fixed seed 42), sorts them and
+// uploads them once. Then, for 10000 rounds, it constructs a fresh
+// gpu_blink_tree<uint32_t, uint32_t, 16>, inserts every key with
+// modified_insert_kernel and prints the tree's reported memory usage.
+//
+// Every CUDA runtime call and every kernel launch is checked; on failure the
+// error string is written to stderr and the process exits with EXIT_FAILURE.
+void investigate_tree_deadlock() {
+  using key_type   = uint32_t;
+  using value_type = uint32_t;
+
+  // Report a failed CUDA call and stop, instead of carrying on with an
+  // invalid pointer or an unnoticed kernel fault.
+  auto check_cuda = [](cudaError_t status, const char* what) {
+    if (status != cudaSuccess) {
+      std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
+      std::exit(EXIT_FAILURE);
+    }
+  };
+
+  size_t build_size       = size_t{1} << 25;
+  key_type min_usable_key = 1;
+  // NOTE(review): the two largest values are excluded, presumably because the
+  // tree reserves them as sentinels -- confirm against the tree's definition.
+  key_type max_usable_key = std::numeric_limits<key_type>::max() - 2;
+
+  std::mt19937_64 gen(42);
+  std::uniform_int_distribution<key_type> key_dist(min_usable_key, max_usable_key);
+  std::vector<key_type> build_keys(build_size);
+  std::unordered_set<key_type> build_keys_set;
+  std::cout << "Generating keys.." << std::endl;
+
+  // Keep drawing until the set holds build_size distinct keys.
+  while (build_keys_set.size() < build_size) {
+    key_type key = key_dist(gen);
+    build_keys_set.insert(key);
+  }
+  std::copy(build_keys_set.begin(), build_keys_set.end(), build_keys.begin());
+  std::sort(build_keys.begin(), build_keys.end());
+
+  key_type* keys_on_gpu = nullptr;
+  check_cuda(cudaMalloc(&keys_on_gpu, build_size * sizeof(key_type)), "cudaMalloc(keys)");
+  check_cuda(cudaMemcpy(keys_on_gpu,
+                        build_keys.data(),
+                        build_size * sizeof(key_type),
+                        cudaMemcpyHostToDevice),
+             "cudaMemcpy(keys)");
+
+  // One thread per key, rounded up to whole blocks.
+  constexpr unsigned block_size = 512;
+  const auto num_blocks = static_cast<unsigned>((build_size + block_size - 1) / block_size);
+
+  for (size_t i = 0; i < 10000; ++i) {
+    std::cout << "round " << i << " starting" << std::endl;
+
+    GpuBTree::gpu_blink_tree<key_type, value_type, 16> tree;
+    modified_insert_kernel<<<num_blocks, block_size>>>(keys_on_gpu, build_size, tree);
+    // Launch-configuration errors surface here ...
+    check_cuda(cudaGetLastError(), "modified_insert_kernel launch");
+    // ... and faults raised while the kernel runs surface at the sync.
+    check_cuda(cudaDeviceSynchronize(), "modified_insert_kernel execution");
+
+    std::cout << "tree uses " << tree.compute_memory_usage() << " GB" << std::endl;
+    std::cout << "round " << i << " done" << std::endl;
+  }
+
+  check_cuda(cudaFree(keys_on_gpu), "cudaFree(keys)");
+}
+
+// Entry point: runs the issue #4 reproduction to completion. Falling off the
+// end of main yields exit status 0.
+int main() {
+  investigate_tree_deadlock();
+}