Commit

[examples] Update LLaMA inference example.
zhanghb97 committed Nov 10, 2023
1 parent 1a3b616 commit 7d6074d
Showing 6 changed files with 32,215 additions and 100 deletions.
13 changes: 11 additions & 2 deletions examples/BuddyLlama/README.md
@@ -63,9 +63,18 @@ $ cmake -G Ninja .. \
$ ninja
$ ninja check-buddy
```
Set environment variable.

Set the `PYTHONPATH` environment variable. Make sure that `PYTHONPATH` includes both the directory of the LLVM/MLIR Python bindings and the directory of the Buddy MLIR Python packages.

```
export PYTHONPATH=/path-to-buddy-mlir/llvm/build/tools/mlir/python_packages/mlir_core:/path-to-buddy-mlir/build/python_packages:${PYTHONPATH}
$ export PYTHONPATH=/path-to-buddy-mlir/llvm/build/tools/mlir/python_packages/mlir_core:/path-to-buddy-mlir/build/python_packages:${PYTHONPATH}
# For example:
# Navigate to your buddy-mlir/build directory
$ cd buddy-mlir/build
$ export BUDDY_MLIR_BUILD_DIR=$PWD
$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build
$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH}
```
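
To confirm the variable is set correctly, a quick import check can help. The snippet below is a minimal sketch and not part of the repository: it assumes the standard `mlir.ir` module of the LLVM/MLIR Python bindings and reuses the `buddy.compiler.ops.tosa` import from `import-llama2.py`; the file name `check_env.py` is hypothetical.

```python
# check_env.py -- hypothetical sanity check, not part of the repository.
# It should only import cleanly when PYTHONPATH contains both package directories.
import mlir.ir  # LLVM/MLIR Python bindings (mlir_core package)
from buddy.compiler.ops import tosa  # Buddy MLIR Python package, as used by import-llama2.py

print("MLIR and Buddy Python packages are importable.")
```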

6. Build and run LLaMA example
18 changes: 14 additions & 4 deletions examples/BuddyLlama/import-llama2.py
@@ -31,26 +31,36 @@
from buddy.compiler.ops import tosa


model_path = os.environ.get('LLAMA_MODEL_PATH')
# Retrieve the LLaMA model path from environment variables.
model_path = os.environ.get("LLAMA_MODEL_PATH")
if model_path is None:
    raise EnvironmentError(
        "The environment variable 'LLAMA_MODEL_PATH' is not set or is invalid."
    )

# Initialize the tokenizer and model from the specified model path.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path, torchscript=True)
prompt = "Hey, how are you?"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.input_ids

# Initialize Dynamo Compiler with specific configurations as an importer.
dynamo_compiler = DynamoCompiler(
    primary_registry=tosa.ops_registry,
    aot_autograd_decomposition=aot_autograd_decompositions,
    is_inference=True,
)

# Import the model into MLIR module and parameters.
gm, params = dynamo_compiler.importer(
    model, torch.tensor([[1 for i in range(80)]], dtype=torch.int64)
)

# Write the MLIR module to the file.
with open(
    os.path.dirname(os.path.abspath(__file__)) + "/llama.mlir", "w"
) as module_file:
    print(gm, file=module_file)

# Concatenate all parameters into a single numpy array and write to a file.
all_param = numpy.concatenate(
    [param.detach().numpy().reshape([-1]) for param in params]
)
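
The write itself falls below this hunk, so it is not shown here. As a rough sketch only (not the repository's exact code), a flat float32 buffer like `all_param` is typically persisted with NumPy's raw-binary helpers, which matches the `sizeof(float) * size` read performed by `llama-main.cpp`; the file name `arg0.data` is taken from that reader.

```python
# Illustrative sketch only -- the actual write call lives outside this hunk.
# Persist the flattened parameters as raw float32 so the C++ loader can
# read them back with a plain binary read.
import numpy

flat = all_param.astype(numpy.float32)   # `all_param` is defined just above
flat.tofile("arg0.data")                 # raw float32 values, no header

# Optional round-trip check:
restored = numpy.fromfile("arg0.data", dtype=numpy.float32)
assert restored.size == flat.size
```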
226 changes: 147 additions & 79 deletions examples/BuddyLlama/llama-main.cpp
@@ -24,93 +24,161 @@
#include <type_traits>

using namespace buddy;
using namespace std;
using namespace chrono;

constexpr size_t ParamsSize = 6755192832;
constexpr size_t MaxVocabSize = 32000;
constexpr size_t MaxTokenLength = 80;

/// Declare LLaMA forward function.
extern "C" void _mlir_ciface_forward(MemRef<float, 3> *, MemRef<float, 1> *,
MemRef<size_t, 2> *);
Text<size_t, 2> *);

int main() {
// Guide the user to enter the vocab path
string vocabDir = "../../tests/Interface/core/vocab_llama.txt";
// cout<<"please input vocab file path"<<endl;
// getline(cin, vocabDir);

// Initialize the container
string pureStr;
cout << "Please enter what you want to say to me" << endl;
getline(cin, pureStr);
auto buddyTokenizeStart = system_clock::now();
Text<size_t, 2> pureStrContainer(pureStr);
pureStrContainer.tokenizeLlama(vocabDir, 80);
auto buddyTokenizeEnd = system_clock::now();
auto buddyTokenizeTime =
duration_cast<milliseconds>(buddyTokenizeEnd - buddyTokenizeStart);
// Print the tokenized result
cout << "Get User input:" << pureStrContainer.revertLlama(pureStrContainer)
<< endl;
cout << "[Buddy] Tokenize input time: " << buddyTokenizeTime.count() << "ms"
<< endl;
// Read the params
auto buddyReadStart = system_clock::now();
MemRef<float, 1> arg0({intptr_t(6755192832)});
ifstream in0("../../examples/BuddyLlama/arg0.data", ios::in | ios::binary);
std::cout << "use params file: "
<< std::filesystem::absolute("../../examples/BuddyLlama/arg0.data")
// -----------------------------------------------------------------------------
// Helper Functions
// -----------------------------------------------------------------------------

/// Capture input message.
void getUserInput(std::string &inputStr) {
std::cout << "\nPlease send a message:" << std::endl;
std::cout << ">>> ";
getline(std::cin, inputStr);
std::cout << std::endl;
}

/// Print [Log] label in bold blue format.
void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }

/// Print information for each iteration.
void printIterInfo(size_t iterIdx, std::string str, double time) {
std::cout << "\033[32;1m[Iteration " << iterIdx << "] \033[0m";
std::cout << "Token: " << str << " | "
<< "Time: " << time << "s" << std::endl;
}

/// Tokenize input data in the container.
void tokenizeInput(const std::string &vocabFile,
Text<size_t, 2> &inputContainer) {
printLogLabel();
std::cout << "Vocab file: " << std::filesystem::canonical(vocabFile)
<< std::endl;
const auto buddyTokenizeStart = std::chrono::high_resolution_clock::now();
inputContainer.tokenizeLlama(vocabFile, MaxTokenLength);
const auto buddyTokenizeEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> buddyTokenizeTime =
buddyTokenizeEnd - buddyTokenizeStart;
printLogLabel();
std::cout << "Tokenize time: " << buddyTokenizeTime.count() << "ms"
<< std::endl;
if (!in0.is_open()) {
throw std::runtime_error("Failed to open param file!");
}

/// Load parameters into data container.
void loadParameters(const std::string &paramFilePath,
MemRef<float, 1> &params) {
const auto loadStart = std::chrono::high_resolution_clock::now();
std::ifstream paramFile(paramFilePath, std::ios::in | std::ios::binary);
if (!paramFile.is_open()) {
throw std::runtime_error("[Error] Failed to open params file!");
}
printLogLabel();
std::cout << "Loading params..." << std::endl;
printLogLabel();
std::cout << "Params file: " << std::filesystem::canonical(paramFilePath)
<< std::endl;
paramFile.read(reinterpret_cast<char *>(params.getData()),
sizeof(float) * (params.getSize()));
if (paramFile.fail()) {
throw std::runtime_error("Error occurred while reading params file!");
}
in0.read((char *)(arg0.getData()), sizeof(float) * (arg0.getSize()));
in0.close();
auto buddyReadEnd = system_clock::now();
auto buddyReadTime =
duration_cast<milliseconds>(buddyReadEnd - buddyReadStart);
cout << "Read params finish" << endl;
cout << "[Buddy] Read params time: " << (double)(buddyReadTime.count()) / 1000
<< "s" << endl;
// Run the model
MemRef<float, 3> result({1, 80, 32000});
int generateLen = 80 - pureStrContainer.getTokenCnt();
cout << "-----------------------start generate-----------------------"
<< endl;
auto buddyStart = system_clock::now();
paramFile.close();
const auto loadEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> loadTime =
loadEnd - loadStart;
printLogLabel();
std::cout << "Params load time: " << (double)(loadTime.count()) / 1000
<< "s\n"
<< std::endl;
}

/// Find the index of the max value.
int findMaxIndex(const float *start, const float *end) {
return std::distance(start, std::max_element(start, end));
}

// -----------------------------------------------------------------------------
// LLaMA Inference Main Entry
// -----------------------------------------------------------------------------

int main() {
/// Print the title of this example.
const std::string title = "LLaMA 2 Inference Powered by Buddy Compiler";
std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;

/// Define the paths of the vocabulary and parameter files.
const std::string vocabDir = "../../examples/BuddyLlama/vocab.txt";
const std::string paramsDir = "../../examples/BuddyLlama/arg0.data";

/// Get user message.
std::string inputStr;
getUserInput(inputStr);

/// Initialize data containers
// - Input container.
// - Result container.
// - Output container.
// - Parameters container.
Text<size_t, 2> outputContainer;
MemRef<float, 3> resultContainer({1, MaxTokenLength, MaxVocabSize});
Text<size_t, 2> inputContainer(inputStr);
MemRef<float, 1> paramsContainer({ParamsSize});

/// Fill data into containers
// - Input: register vocabulary and tokenize the input string.
// - Output: register vocabulary.
// - Parameters: load parameters from the `arg0` file into the container.
tokenizeInput(vocabDir, inputContainer);
outputContainer.loadVocab(vocabDir);
loadParameters(paramsDir, paramsContainer);

/// Run LLaMA Inference
// - Perform the forward function.
// - Find and append the generated token.
// - Continue iterating until the terminal condition is met.
int generateLen = MaxTokenLength - inputContainer.getTokenCnt();
for (int i = 0; i < generateLen; i++) {
cout << "Iteration" << i << ": ";
buddyReadStart = system_clock::now();
// Perform calculations in memref generated by user input.
_mlir_ciface_forward(&result, &arg0, &pureStrContainer);
int tokenIndex = pureStrContainer.getTokenCnt() - 1;
int index = 0;
float maxEle = result.getData()[tokenIndex * 32000];
// Calculate the probability of occurrence of each token.
for (int j = index + 1; j < 32000; j++) {
if (result.getData()[tokenIndex * 32000 + j] > maxEle) {
maxEle = result.getData()[tokenIndex * 32000 + j];
index = j;
}
}
pureStrContainer.getData()[pureStrContainer.getTokenCnt()] = index;
// If the model generates 2 (the sep marker), interrupt generation immediately.
if (index == 2) {
const auto inferenceStart = std::chrono::high_resolution_clock::now();

// Execute the forward pass of the model.
_mlir_ciface_forward(&resultContainer, &paramsContainer, &inputContainer);

const auto inferenceEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> inferenceTime =
inferenceEnd - inferenceStart;

// Determine the generated token.
int tokenIndex = inputContainer.getTokenCnt() - 1;
const float *startPtr =
resultContainer.getData() + tokenIndex * MaxVocabSize;
const float *endPtr = startPtr + MaxVocabSize;
int maxIndex = findMaxIndex(startPtr, endPtr);
std::string tok = inputContainer.getStr(maxIndex);
// Print the generated token and inference time.
printIterInfo(i, tok, inferenceTime.count() / 1000);

// Stop if a separator token (2, </s>) or line break token (13, <0x0A>) is
// generated.
if (maxIndex == 2 || (maxIndex == 13 && i != 0)) {
break;
}
buddyReadEnd = system_clock::now();
buddyReadTime = duration_cast<milliseconds>(buddyReadEnd - buddyReadStart);
cout << pureStrContainer.getStr(index) << endl;
cout << "[Buddy] Llama iteration " << i
<< " time: " << (double)(buddyReadTime.count()) / 1000 << "s" << endl;
pureStrContainer.setTokenCnt(pureStrContainer.getTokenCnt() + 1);

// Append the generated token into the input and output container.
inputContainer.appendTokenIdx(maxIndex);
outputContainer.appendTokenIdx(maxIndex);
}
cout << "------------------------------------------------------------"
<< endl;
// Statistics running time
auto buddyEnd = system_clock::now();
buddyReadTime = duration_cast<milliseconds>(buddyEnd - buddyStart);
// Print the result
cout << "[Buddy] Result: " << pureStrContainer.revertLlama(pureStrContainer)
<< endl;
cout << "[Buddy] Llama exection time: "
<< (double)(buddyReadTime.count()) / 1000 << "s" << endl;

/// Print the final result
std::cout << "\n\033[33;1m[Input]\033[0m " << inputStr << std::endl;
std::cout << "\033[33;1m[Output]\033[0m " << outputContainer.revertLlama()
<< std::endl;

return 0;
}
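
The loop above is plain greedy decoding: each iteration runs the forward pass, takes the argmax over the `MaxVocabSize` logits at the last filled token position (the role of `findMaxIndex`), appends that token, and stops on the separator or line-break token. The sketch below restates the same idea in NumPy; the `forward` callable, shapes, and names are illustrative assumptions, not the repository's API.

```python
# Greedy decoding sketch in NumPy -- illustrative only, mirroring the C++ loop above.
import numpy as np

MAX_TOKEN_LENGTH = 80
MAX_VOCAB_SIZE = 32000
STOP_TOKENS = {2}  # </s>; the C++ code also stops on 13 (<0x0A>) after the first step


def greedy_decode(forward, tokens, token_cnt):
    """forward(tokens) -> logits of shape (1, MAX_TOKEN_LENGTH, MAX_VOCAB_SIZE)."""
    for _ in range(MAX_TOKEN_LENGTH - token_cnt):
        logits = forward(tokens)
        # Argmax over the logits of the last filled position (same role as findMaxIndex).
        next_id = int(np.argmax(logits[0, token_cnt - 1]))
        if next_id in STOP_TOKENS:
            break
        tokens[0, token_cnt] = next_id
        token_cnt += 1
    return tokens[0, :token_cnt]
```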
