Add llama-gpu pipeline in cmake.
SForeKeeper committed Dec 4, 2023
1 parent dd98199 commit 8c4ffa2
Showing 3 changed files with 250 additions and 1 deletion.
59 changes: 59 additions & 0 deletions examples/BuddyLlama/CMakeLists.txt
@@ -50,6 +50,47 @@ add_custom_command(

add_library(LLAMA STATIC llama.o)

add_custom_command(
OUTPUT llama-gpu.o
COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/llama.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-linalg-bufferize
-matmul-paralell-vectorization-optimize
-batchmatmul-optimize
-convert-linalg-to-affine-loops
-affine-loop-fusion
-affine-parallelize
-lower-affine
-canonicalize
-func-bufferize
-arith-bufferize
-tensor-bufferize
-buffer-deallocation
-finalizing-bufferize
-gpu-map-parallel-loops
-convert-parallel-loops-to-gpu
-canonicalize
-gpu-kernel-outlining
-convert-scf-to-cf
-memref-expand
-finalize-memref-to-llvm
-convert-arith-to-llvm
-convert-gpu-to-nvvm='has-redux=1'
-llvm-request-c-wrappers
--test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" |
${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
${LLVM_MLIR_BINARY_DIR}/llvm-as |
${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
-o ${BUDDY_BINARY_DIR}/../examples/BuddyLlama/llama-gpu.o
DEPENDS buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/llama.mlir
COMMENT "Building llama-gpu.o "
VERBATIM)
add_library(LLAMA_GPU STATIC llama-gpu.o)

SET_SOURCE_FILES_PROPERTIES(
template.o
PROPERTIES
@@ -74,3 +74,21 @@ if(BUDDY_MLIR_USE_MIMALLOC)
endif()

target_link_libraries(buddy-llama-run ${BUDDY_LLAMA_LIBS})

SET_TARGET_PROPERTIES(
LLAMA_GPU
PROPERTIES
LINKER_LANGUAGE C)

set(BUDDY_LLAMA_GPU_LIBS
LLAMA_GPU
mlir_c_runner_utils
omp
)
if(BUDDY_MLIR_USE_MIMALLOC)
list(APPEND BUDDY_LLAMA_GPU_LIBS mimalloc)
endif()

add_executable(buddy-llama-gpu-run llama-gpu.cpp)
target_link_directories(buddy-llama-gpu-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
target_link_libraries(buddy-llama-gpu-run ${BUDDY_LLAMA_GPU_LIBS})
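
For reference, a minimal sketch of building and launching the new target, assuming a Ninja-based buddy-mlir build tree; the working directory and data-file locations below are assumptions chosen to match the relative paths hard-coded in llama-gpu.cpp:

```
# Build the GPU example executable added by this commit.
cd buddy-mlir/build
ninja buddy-llama-gpu-run

# Run from build/bin so that ../../examples/BuddyLlama/vocab.txt
# and ../../examples/BuddyLlama/arg0.data resolve as expected.
cd bin
./buddy-llama-gpu-run
```
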
3 changes: 2 additions & 1 deletion examples/BuddyLlama/README-gpu.md
@@ -107,7 +107,8 @@ gpu.module @forward_kernel_753 {
This step first registers host buffers for device access (the new `-gpu-host-register` pass), then converts the operations to LLVM dialect operations and converts some math functions to NVVM intrinsics.

```
mlir-opt llama-outlined.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o llama-nvvm.mlir
buddy-opt llama-outlined.mlir -gpu-host-register -o llama-host-registered.mlir
mlir-opt llama-host-registered.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o llama-nvvm.mlir
```

Why do we need the `convert-gpu-to-nvvm` step? If it is omitted and the unmodified Torch-to-Linalg lowering pipeline is used, the generated LLVM IR would look like this:
189 changes: 189 additions & 0 deletions examples/BuddyLlama/llama-gpu.cpp
@@ -0,0 +1,189 @@
//===- llama-gpu.cpp -------------------------------------------------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#include <buddy/Core/Container.h>
#include <buddy/LLM/TextContainer.h>
#include <algorithm> // std::max_element
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdlib> // free
#include <filesystem>
#include <fstream>
#include <iostream>
#include <limits>
#include <type_traits>

using namespace buddy;

constexpr size_t ParamsSize = 6755192832;
constexpr size_t MaxVocabSize = 32000;
constexpr size_t MaxTokenLength = 40;
constexpr size_t HiddenSize = 4096;

/// Declare LLaMA forward function.
extern "C" void _mlir_ciface_forward(MemRef<float, 3> *, MemRef<float, 1> *,
Text<size_t, 2> *);

// -----------------------------------------------------------------------------
// Helper Functions
// -----------------------------------------------------------------------------

/// Capture input message.
void getUserInput(std::string &inputStr) {
std::cout << "\nPlease send a message:" << std::endl;
std::cout << ">>> ";
getline(std::cin, inputStr);
std::cout << std::endl;
}

/// Print [Log] label in bold blue format.
void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }

/// Print information for each iteration.
void printIterInfo(size_t iterIdx, std::string str, double time) {
std::cout << "\033[32;1m[Iteration " << iterIdx << "] \033[0m";
std::cout << "Token: " << str << " | "
<< "Time: " << time << "s" << std::endl;
}

/// Tokenize input data in the container.
void tokenizeInput(const std::string &vocabFile,
Text<size_t, 2> &inputContainer) {
printLogLabel();
std::cout << "Vocab file: " << std::filesystem::canonical(vocabFile)
<< std::endl;
const auto buddyTokenizeStart = std::chrono::high_resolution_clock::now();
inputContainer.tokenizeLlama(vocabFile, MaxTokenLength);
const auto buddyTokenizeEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> buddyTokenizeTime =
buddyTokenizeEnd - buddyTokenizeStart;
printLogLabel();
std::cout << "Tokenize time: " << buddyTokenizeTime.count() << "ms"
<< std::endl;
}

/// Load parameters into data container.
void loadParameters(const std::string &paramFilePath,
MemRef<float, 1> &params) {
const auto loadStart = std::chrono::high_resolution_clock::now();
std::ifstream paramFile(paramFilePath, std::ios::in | std::ios::binary);
if (!paramFile.is_open()) {
throw std::runtime_error("[Error] Failed to open params file!");
}
printLogLabel();
std::cout << "Loading params..." << std::endl;
printLogLabel();
std::cout << "Params file: " << std::filesystem::canonical(paramFilePath)
<< std::endl;
paramFile.read(reinterpret_cast<char *>(params.getData()),
sizeof(float) * (params.getSize()));
if (paramFile.fail()) {
throw std::runtime_error("Error occurred while reading params file!");
}
paramFile.close();
const auto loadEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> loadTime =
loadEnd - loadStart;
printLogLabel();
std::cout << "Params load time: " << (double)(loadTime.count()) / 1000
<< "s\n"
<< std::endl;
}

/// Find the index of the max value.
int findMaxIndex(const float *start, const float *end) {
return std::distance(start, std::max_element(start, end));
}

// -----------------------------------------------------------------------------
// LLaMA Inference Main Entry
// -----------------------------------------------------------------------------

int main() {
/// Print the title of this example.
const std::string title = "LLaMA 2 Inference Powered by Buddy Compiler";
std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;

/// Define paths of the vocabulary and parameter files.
const std::string vocabDir = "../../examples/BuddyLlama/vocab.txt";
const std::string paramsDir = "../../examples/BuddyLlama/arg0.data";

/// Get user message.
std::string inputStr;
getUserInput(inputStr);

/// Initialize data containers
// - Input container.
// - Result container
// - Output container.
// - Parameters container.
Text<size_t, 2> outputContainer;
MemRef<float, 3> resultContainer[2] = {
MemRef<float, 3>({1, MaxTokenLength, MaxVocabSize}, false, 0),
MemRef<float, 3>({1, MaxTokenLength, HiddenSize}, false, 0)};
Text<size_t, 2> inputContainer(inputStr);
MemRef<float, 1> paramsContainer({ParamsSize});

/// Fill data into containers
// - Input: register vocabulary and tokenize the input string.
// - Output: register vocabulary.
// - Parameters: load parameters from the `arg0` file into the container.
tokenizeInput(vocabDir, inputContainer);
outputContainer.loadVocab(vocabDir);
loadParameters(paramsDir, paramsContainer);

/// Run LLaMA Inference
// - Perform the forward function.
// - Find and append the generated token.
// - Continue iterating until the terminal condition is met.
int generateLen = MaxTokenLength - inputContainer.getTokenCnt();
for (int i = 0; i < generateLen; i++) {
const auto inferenceStart = std::chrono::high_resolution_clock::now();
// Execute the forward pass of the model.
_mlir_ciface_forward(resultContainer, &paramsContainer, &inputContainer);

const auto inferenceEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> inferenceTime =
inferenceEnd - inferenceStart;

// Determine the generated token.
int tokenIndex = inputContainer.getTokenCnt() - 1;
const float *startPtr =
resultContainer[0].getData() + tokenIndex * MaxVocabSize;
const float *endPtr = startPtr + MaxVocabSize;
int maxIndex = findMaxIndex(startPtr, endPtr);
std::string tok = inputContainer.getStr(maxIndex);
// Print the generated token and inference time.
printIterInfo(i, tok, inferenceTime.count() / 1000);

// Stop if a separator token (2, </s>) or a line-break token (13, <0x0A>) is
// generated.
if (maxIndex == 2) {
break;
}
// Append the generated token into the input and output container.
inputContainer.appendTokenIdx(maxIndex);
outputContainer.appendTokenIdx(maxIndex);
free(resultContainer[0].release());
free(resultContainer[1].release());
}

/// Print the final result
std::cout << "\n\033[33;1m[Input]\033[0m " << inputStr << std::endl;
std::cout << "\033[33;1m[Output]\033[0m " << outputContainer.revertLlama()
<< std::endl;

return 0;
}
