Add llama-gpu pipeline in cmake.
SForeKeeper committed Dec 4, 2023
1 parent dd98199 commit 8c4ffa2
Showing 3 changed files with 250 additions and 1 deletion.
59 changes: 59 additions & 0 deletions examples/BuddyLlama/CMakeLists.txt
@@ -50,6 +50,47 @@ add_custom_command(

add_library(LLAMA STATIC llama.o)

add_custom_command(
OUTPUT llama-gpu.o
COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/llama.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-linalg-bufferize
-matmul-paralell-vectorization-optimize
-batchmatmul-optimize
-convert-linalg-to-affine-loops
-affine-loop-fusion
-affine-parallelize
-lower-affine
-canonicalize
-func-bufferize
-arith-bufferize
-tensor-bufferize
-buffer-deallocation
-finalizing-bufferize
-gpu-map-parallel-loops
-convert-parallel-loops-to-gpu
-canonicalize
-gpu-kernel-outlining
-convert-scf-to-cf
-memref-expand
-finalize-memref-to-llvm
-convert-arith-to-llvm
-convert-gpu-to-nvvm='has-redux=1'
-llvm-request-c-wrappers
--test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" |
${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
${LLVM_MLIR_BINARY_DIR}/llvm-as |
${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
-o ${BUDDY_BINARY_DIR}/../examples/BuddyLlama/llama-gpu.o
DEPENDS buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/llama.mlir
COMMENT "Building llama-gpu.o "
VERBATIM)
add_library(LLAMA_GPU STATIC llama-gpu.o)

SET_SOURCE_FILES_PROPERTIES(
template.o
PROPERTIES
@@ -74,3 +74,21 @@ if(BUDDY_MLIR_USE_MIMALLOC)
endif()

target_link_libraries(buddy-llama-run ${BUDDY_LLAMA_LIBS})

SET_TARGET_PROPERTIES(
LLAMA_GPU
PROPERTIES
LINKER_LANGUAGE C)

set(BUDDY_LLAMA_GPU_LIBS
LLAMA_GPU
mlir_c_runner_utils
omp
)
if(BUDDY_MLIR_USE_MIMALLOC)
list(APPEND BUDDY_LLAMA_GPU_LIBS mimalloc)
endif()

add_executable(buddy-llama-gpu-run llama-gpu.cpp)
target_link_directories(buddy-llama-gpu-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
target_link_libraries(buddy-llama-gpu-run ${BUDDY_LLAMA_GPU_LIBS})
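
For reference, a minimal sketch of building and launching the new target, assuming a Ninja-based buddy-mlir build tree; the working directory and data-file locations below are assumptions chosen to match the relative paths hard-coded in llama-gpu.cpp:

```
# Build the GPU example executable added by this commit.
cd buddy-mlir/build
ninja buddy-llama-gpu-run

# Run from build/bin so that ../../examples/BuddyLlama/vocab.txt
# and ../../examples/BuddyLlama/arg0.data resolve as expected.
cd bin
./buddy-llama-gpu-run
```
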
3 changes: 2 additions & 1 deletion examples/BuddyLlama/README-gpu.md
@@ -107,7 +107,8 @@ gpu.module @forward_kernel_753 {
This step first registers host buffers for device access (the new `-gpu-host-register` pass), then converts the operations to LLVM dialect operations and converts some math functions to NVVM intrinsics.

```
mlir-opt llama-outlined.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o llama-nvvm.mlir
buddy-opt llama-outlined.mlir -gpu-host-register -o llama-host-registered.mlir
mlir-opt llama-host-registered.mlir -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm -convert-gpu-to-nvvm='has-redux=1' -o llama-nvvm.mlir
```

Why do we need the `convert-gpu-to-nvvm` step? If it is omitted and the unmodified Torch-to-Linalg lowering pipeline is used, the generated LLVM IR would look like this:
189 changes: 189 additions & 0 deletions examples/BuddyLlama/llama-gpu.cpp
@@ -0,0 +1,189 @@
//===- llama-gpu.cpp -------------------------------------------------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//

#include <buddy/Core/Container.h>
#include <buddy/LLM/TextContainer.h>
#include <algorithm> // std::max_element
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdlib> // free
#include <filesystem>
#include <fstream>
#include <iostream>
#include <limits>
#include <type_traits>

using namespace buddy;

constexpr size_t ParamsSize = 6755192832;
constexpr size_t MaxVocabSize = 32000;
constexpr size_t MaxTokenLength = 40;
constexpr size_t HiddenSize = 4096;

/// Declare LLaMA forward function.
extern "C" void _mlir_ciface_forward(MemRef<float, 3> *, MemRef<float, 1> *,
Text<size_t, 2> *);

// -----------------------------------------------------------------------------
// Helper Functions
// -----------------------------------------------------------------------------

/// Capture input message.
void getUserInput(std::string &inputStr) {
std::cout << "\nPlease send a message:" << std::endl;
std::cout << ">>> ";
getline(std::cin, inputStr);
std::cout << std::endl;
}

/// Print [Log] label in bold blue format.
void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }

/// Print information for each iteration.
void printIterInfo(size_t iterIdx, std::string str, double time) {
std::cout << "\033[32;1m[Iteration " << iterIdx << "] \033[0m";
std::cout << "Token: " << str << " | "
<< "Time: " << time << "s" << std::endl;
}

/// Tokenize input data in the container.
void tokenizeInput(const std::string &vocabFile,
Text<size_t, 2> &inputContainer) {
printLogLabel();
std::cout << "Vocab file: " << std::filesystem::canonical(vocabFile)
<< std::endl;
const auto buddyTokenizeStart = std::chrono::high_resolution_clock::now();
inputContainer.tokenizeLlama(vocabFile, MaxTokenLength);
const auto buddyTokenizeEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> buddyTokenizeTime =
buddyTokenizeEnd - buddyTokenizeStart;
printLogLabel();
std::cout << "Tokenize time: " << buddyTokenizeTime.count() << "ms"
<< std::endl;
}

/// Load parameters into data container.
void loadParameters(const std::string &paramFilePath,
MemRef<float, 1> &params) {
const auto loadStart = std::chrono::high_resolution_clock::now();
std::ifstream paramFile(paramFilePath, std::ios::in | std::ios::binary);
if (!paramFile.is_open()) {
throw std::runtime_error("[Error] Failed to open params file!");
}
printLogLabel();
std::cout << "Loading params..." << std::endl;
printLogLabel();
std::cout << "Params file: " << std::filesystem::canonical(paramFilePath)
<< std::endl;
paramFile.read(reinterpret_cast<char *>(params.getData()),
sizeof(float) * (params.getSize()));
if (paramFile.fail()) {
throw std::runtime_error("Error occurred while reading params file!");
}
paramFile.close();
const auto loadEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> loadTime =
loadEnd - loadStart;
printLogLabel();
std::cout << "Params load time: " << (double)(loadTime.count()) / 1000
<< "s\n"
<< std::endl;
}

/// Find the index of the max value.
int findMaxIndex(const float *start, const float *end) {
return std::distance(start, std::max_element(start, end));
}

// -----------------------------------------------------------------------------
// LLaMA Inference Main Entry
// -----------------------------------------------------------------------------

int main() {
/// Print the title of this example.
const std::string title = "LLaMA 2 Inference Powered by Buddy Compiler";
std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;

/// Define paths of the vocabulary and parameter files.
const std::string vocabDir = "../../examples/BuddyLlama/vocab.txt";
const std::string paramsDir = "../../examples/BuddyLlama/arg0.data";

/// Get user message.
std::string inputStr;
getUserInput(inputStr);

/// Initialize data containers
// - Input container.
// - Result container
// - Output container.
// - Parameters container.
Text<size_t, 2> outputContainer;
MemRef<float, 3> resultContainer[2] = {
MemRef<float, 3>({1, MaxTokenLength, MaxVocabSize}, false, 0),
MemRef<float, 3>({1, MaxTokenLength, HiddenSize}, false, 0)};
Text<size_t, 2> inputContainer(inputStr);
MemRef<float, 1> paramsContainer({ParamsSize});

/// Fill data into containers
// - Input: register vocabulary and tokenize the input string.
// - Output: register vocabulary.
// - Parameters: load parameters from the `arg0` file into the container.
tokenizeInput(vocabDir, inputContainer);
outputContainer.loadVocab(vocabDir);
loadParameters(paramsDir, paramsContainer);

/// Run LLaMA Inference
// - Perform the forward function.
// - Find and append the generated token.
// - Continue iterating until the terminal condition is met.
int generateLen = MaxTokenLength - inputContainer.getTokenCnt();
for (int i = 0; i < generateLen; i++) {
const auto inferenceStart = std::chrono::high_resolution_clock::now();
// Execute the forward pass of the model.
_mlir_ciface_forward(resultContainer, &paramsContainer, &inputContainer);

const auto inferenceEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> inferenceTime =
inferenceEnd - inferenceStart;

// Determine the generated token.
int tokenIndex = inputContainer.getTokenCnt() - 1;
const float *startPtr =
resultContainer[0].getData() + tokenIndex * MaxVocabSize;
const float *endPtr = startPtr + MaxVocabSize;
int maxIndex = findMaxIndex(startPtr, endPtr);
std::string tok = inputContainer.getStr(maxIndex);
// Print the generated token and inference time.
printIterInfo(i, tok, inferenceTime.count() / 1000);

// Stop if a separator token (2, </s>) or a line-break token (13, <0x0A>) is
// generated.
if (maxIndex == 2) {
break;
}
// Append the generated token into the input and output container.
inputContainer.appendTokenIdx(maxIndex);
outputContainer.appendTokenIdx(maxIndex);
free(resultContainer[0].release());
free(resultContainer[1].release());
}

/// Print the final result
std::cout << "\n\033[33;1m[Input]\033[0m " << inputStr << std::endl;
std::cout << "\033[33;1m[Output]\033[0m " << outputContainer.revertLlama()
<< std::endl;

return 0;
}
