Commit

[examples] Update LLaMA inference example.
zhanghb97 committed Nov 10, 2023
1 parent 1a3b616 commit 7d6074d
Showing 6 changed files with 32,215 additions and 100 deletions.
13 changes: 11 additions & 2 deletions examples/BuddyLlama/README.md
@@ -63,9 +63,18 @@ $ cmake -G Ninja .. \
$ ninja
$ ninja check-buddy
```
Set environment variable.

Set the `PYTHONPATH` environment variable. Make sure that `PYTHONPATH` includes both the directory of the LLVM/MLIR Python bindings and the directory of the Buddy MLIR Python packages.

```
export PYTHONPATH=/path-to-buddy-mlir/llvm/build/tools/mlir/python_packages/mlir_core:/path-to-buddy-mlir/build/python_packages:${PYTHONPATH}
$ export PYTHONPATH=/path-to-buddy-mlir/llvm/build/tools/mlir/python_packages/mlir_core:/path-to-buddy-mlir/build/python_packages:${PYTHONPATH}
# For example:
# Navigate to your buddy-mlir/build directory
$ cd buddy-mlir/build
$ export BUDDY_MLIR_BUILD_DIR=$PWD
$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build
$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH}
```
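
To confirm the variable is set correctly, a quick import check can help. The snippet below is a minimal sketch and not part of the repository: it assumes the standard `mlir.ir` module of the LLVM/MLIR Python bindings and reuses the `buddy.compiler.ops.tosa` import from `import-llama2.py`; the file name `check_env.py` is hypothetical.

```python
# check_env.py -- hypothetical sanity check, not part of the repository.
# It should only import cleanly when PYTHONPATH contains both package directories.
import mlir.ir  # LLVM/MLIR Python bindings (mlir_core package)
from buddy.compiler.ops import tosa  # Buddy MLIR Python package, as used by import-llama2.py

print("MLIR and Buddy Python packages are importable.")
```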

6. Build and run LLaMA example
18 changes: 14 additions & 4 deletions examples/BuddyLlama/import-llama2.py
@@ -31,26 +31,36 @@
from buddy.compiler.ops import tosa


model_path = os.environ.get('LLAMA_MODEL_PATH')
# Retrieve the LLaMA model path from environment variables.
model_path = os.environ.get("LLAMA_MODEL_PATH")
if model_path is None:
    raise EnvironmentError(
        "The environment variable 'LLAMA_MODEL_PATH' is not set or is invalid."
    )

# Initialize the tokenizer and model from the specified model path.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path, torchscript=True)
prompt = "Hey, how are you?"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.input_ids

# Initialize Dynamo Compiler with specific configurations as an importer.
dynamo_compiler = DynamoCompiler(
    primary_registry=tosa.ops_registry,
    aot_autograd_decomposition=aot_autograd_decompositions,
    is_inference=True,
)

# Import the model into MLIR module and parameters.
gm, params = dynamo_compiler.importer(
    model, torch.tensor([[1 for i in range(80)]], dtype=torch.int64)
)

# Write the MLIR module to the file.
with open(
    os.path.dirname(os.path.abspath(__file__)) + "/llama.mlir", "w"
) as module_file:
    print(gm, file=module_file)

# Concatenate all parameters into a single numpy array and write to a file.
all_param = numpy.concatenate(
    [param.detach().numpy().reshape([-1]) for param in params]
)
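
The write itself falls below this hunk, so it is not shown here. As a rough sketch only (not the repository's exact code), a flat float32 buffer like `all_param` is typically persisted with NumPy's raw-binary helpers, which matches the `sizeof(float) * size` read performed by `llama-main.cpp`; the file name `arg0.data` is taken from that reader.

```python
# Illustrative sketch only -- the actual write call lives outside this hunk.
# Persist the flattened parameters as raw float32 so the C++ loader can
# read them back with a plain binary read.
import numpy

flat = all_param.astype(numpy.float32)   # `all_param` is defined just above
flat.tofile("arg0.data")                 # raw float32 values, no header

# Optional round-trip check:
restored = numpy.fromfile("arg0.data", dtype=numpy.float32)
assert restored.size == flat.size
```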
226 changes: 147 additions & 79 deletions examples/BuddyLlama/llama-main.cpp
@@ -24,93 +24,161 @@
#include <type_traits>

using namespace buddy;
using namespace std;
using namespace chrono;

constexpr size_t ParamsSize = 6755192832;
constexpr size_t MaxVocabSize = 32000;
constexpr size_t MaxTokenLength = 80;

/// Declare LLaMA forward function.
extern "C" void _mlir_ciface_forward(MemRef<float, 3> *, MemRef<float, 1> *,
MemRef<size_t, 2> *);
Text<size_t, 2> *);

int main() {
// Guide the user to enter the vocab path
string vocabDir = "../../tests/Interface/core/vocab_llama.txt";
// cout<<"please input vocab file path"<<endl;
// getline(cin, vocabDir);

// Initialize the container
string pureStr;
cout << "Please enter what you want to say to me" << endl;
getline(cin, pureStr);
auto buddyTokenizeStart = system_clock::now();
Text<size_t, 2> pureStrContainer(pureStr);
pureStrContainer.tokenizeLlama(vocabDir, 80);
auto buddyTokenizeEnd = system_clock::now();
auto buddyTokenizeTime =
duration_cast<milliseconds>(buddyTokenizeEnd - buddyTokenizeStart);
// Print the tokenized result
cout << "Get User input:" << pureStrContainer.revertLlama(pureStrContainer)
<< endl;
cout << "[Buddy] Tokenize input time: " << buddyTokenizeTime.count() << "ms"
<< endl;
// Read the params
auto buddyReadStart = system_clock::now();
MemRef<float, 1> arg0({intptr_t(6755192832)});
ifstream in0("../../examples/BuddyLlama/arg0.data", ios::in | ios::binary);
std::cout << "use params file: "
<< std::filesystem::absolute("../../examples/BuddyLlama/arg0.data")
// -----------------------------------------------------------------------------
// Helper Functions
// -----------------------------------------------------------------------------

/// Capture input message.
void getUserInput(std::string &inputStr) {
std::cout << "\nPlease send a message:" << std::endl;
std::cout << ">>> ";
getline(std::cin, inputStr);
std::cout << std::endl;
}

/// Print [Log] label in bold blue format.
void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }

/// Print information for each iteration.
void printIterInfo(size_t iterIdx, std::string str, double time) {
std::cout << "\033[32;1m[Iteration " << iterIdx << "] \033[0m";
std::cout << "Token: " << str << " | "
<< "Time: " << time << "s" << std::endl;
}

/// Tokenize input data in the container.
void tokenizeInput(const std::string &vocabFile,
Text<size_t, 2> &inputContainer) {
printLogLabel();
std::cout << "Vocab file: " << std::filesystem::canonical(vocabFile)
<< std::endl;
const auto buddyTokenizeStart = std::chrono::high_resolution_clock::now();
inputContainer.tokenizeLlama(vocabFile, MaxTokenLength);
const auto buddyTokenizeEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> buddyTokenizeTime =
buddyTokenizeEnd - buddyTokenizeStart;
printLogLabel();
std::cout << "Tokenize time: " << buddyTokenizeTime.count() << "ms"
<< std::endl;
if (!in0.is_open()) {
throw std::runtime_error("Failed to open param file!");
}

/// Load parameters into data container.
void loadParameters(const std::string &paramFilePath,
MemRef<float, 1> &params) {
const auto loadStart = std::chrono::high_resolution_clock::now();
std::ifstream paramFile(paramFilePath, std::ios::in | std::ios::binary);
if (!paramFile.is_open()) {
throw std::runtime_error("[Error] Failed to open params file!");
}
printLogLabel();
std::cout << "Loading params..." << std::endl;
printLogLabel();
std::cout << "Params file: " << std::filesystem::canonical(paramFilePath)
<< std::endl;
paramFile.read(reinterpret_cast<char *>(params.getData()),
sizeof(float) * (params.getSize()));
if (paramFile.fail()) {
throw std::runtime_error("Error occurred while reading params file!");
}
in0.read((char *)(arg0.getData()), sizeof(float) * (arg0.getSize()));
in0.close();
auto buddyReadEnd = system_clock::now();
auto buddyReadTime =
duration_cast<milliseconds>(buddyReadEnd - buddyReadStart);
cout << "Read params finish" << endl;
cout << "[Buddy] Read params time: " << (double)(buddyReadTime.count()) / 1000
<< "s" << endl;
// Run the model
MemRef<float, 3> result({1, 80, 32000});
int generateLen = 80 - pureStrContainer.getTokenCnt();
cout << "-----------------------start generate-----------------------"
<< endl;
auto buddyStart = system_clock::now();
paramFile.close();
const auto loadEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> loadTime =
loadEnd - loadStart;
printLogLabel();
std::cout << "Params load time: " << (double)(loadTime.count()) / 1000
<< "s\n"
<< std::endl;
}

/// Find the index of the max value.
int findMaxIndex(const float *start, const float *end) {
return std::distance(start, std::max_element(start, end));
}

// -----------------------------------------------------------------------------
// LLaMA Inference Main Entry
// -----------------------------------------------------------------------------

int main() {
/// Print the title of this example.
const std::string title = "LLaMA 2 Inference Powered by Buddy Compiler";
std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;

/// Define the paths of the vocabulary and parameter files.
const std::string vocabDir = "../../examples/BuddyLlama/vocab.txt";
const std::string paramsDir = "../../examples/BuddyLlama/arg0.data";

/// Get user message.
std::string inputStr;
getUserInput(inputStr);

/// Initialize data containers
// - Input container.
// - Result container.
// - Output container.
// - Parameters container.
Text<size_t, 2> outputContainer;
MemRef<float, 3> resultContainer({1, MaxTokenLength, MaxVocabSize});
Text<size_t, 2> inputContainer(inputStr);
MemRef<float, 1> paramsContainer({ParamsSize});

/// Fill data into containers
// - Input: register vocabulary and tokenize the input string.
// - Output: register vocabulary.
// - Parameters: load parameters from the `arg0` file into the container.
tokenizeInput(vocabDir, inputContainer);
outputContainer.loadVocab(vocabDir);
loadParameters(paramsDir, paramsContainer);

/// Run LLaMA Inference
// - Perform the forward function.
// - Find and append the generated token.
// - Continue iterating until the terminal condition is met.
int generateLen = MaxTokenLength - inputContainer.getTokenCnt();
for (int i = 0; i < generateLen; i++) {
cout << "Iteration" << i << ": ";
buddyReadStart = system_clock::now();
// Perform calculations in memref generated by user input.
_mlir_ciface_forward(&result, &arg0, &pureStrContainer);
int tokenIndex = pureStrContainer.getTokenCnt() - 1;
int index = 0;
float maxEle = result.getData()[tokenIndex * 32000];
// Calculate the probability of occurrence of each token.
for (int j = index + 1; j < 32000; j++) {
if (result.getData()[tokenIndex * 32000 + j] > maxEle) {
maxEle = result.getData()[tokenIndex * 32000 + j];
index = j;
}
}
pureStrContainer.getData()[pureStrContainer.getTokenCnt()] = index;
// If the model generates 2 (the sep marker), interrupt generation immediately.
if (index == 2) {
const auto inferenceStart = std::chrono::high_resolution_clock::now();

// Execute the forward pass of the model.
_mlir_ciface_forward(&resultContainer, &paramsContainer, &inputContainer);

const auto inferenceEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration<double, std::milli> inferenceTime =
inferenceEnd - inferenceStart;

// Determine the generated token.
int tokenIndex = inputContainer.getTokenCnt() - 1;
const float *startPtr =
resultContainer.getData() + tokenIndex * MaxVocabSize;
const float *endPtr = startPtr + MaxVocabSize;
int maxIndex = findMaxIndex(startPtr, endPtr);
std::string tok = inputContainer.getStr(maxIndex);
// Print the generated token and inference time.
printIterInfo(i, tok, inferenceTime.count() / 1000);

// Stop if a separator token (2, </s>) or line break token (13, <0x0A>) is
// generated.
if (maxIndex == 2 || (maxIndex == 13 && i != 0)) {
break;
}
buddyReadEnd = system_clock::now();
buddyReadTime = duration_cast<milliseconds>(buddyReadEnd - buddyReadStart);
cout << pureStrContainer.getStr(index) << endl;
cout << "[Buddy] Llama iteration " << i
<< " time: " << (double)(buddyReadTime.count()) / 1000 << "s" << endl;
pureStrContainer.setTokenCnt(pureStrContainer.getTokenCnt() + 1);

// Append the generated token into the input and output container.
inputContainer.appendTokenIdx(maxIndex);
outputContainer.appendTokenIdx(maxIndex);
}
cout << "------------------------------------------------------------"
<< endl;
// Statistics running time
auto buddyEnd = system_clock::now();
buddyReadTime = duration_cast<milliseconds>(buddyEnd - buddyStart);
// Print the result
cout << "[Buddy] Result: " << pureStrContainer.revertLlama(pureStrContainer)
<< endl;
cout << "[Buddy] Llama exection time: "
<< (double)(buddyReadTime.count()) / 1000 << "s" << endl;

/// Print the final result
std::cout << "\n\033[33;1m[Input]\033[0m " << inputStr << std::endl;
std::cout << "\033[33;1m[Output]\033[0m " << outputContainer.revertLlama()
<< std::endl;

return 0;
}
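
The loop above is plain greedy decoding: each iteration runs the forward pass, takes the argmax over the `MaxVocabSize` logits at the last filled token position (the role of `findMaxIndex`), appends that token, and stops on the separator or line-break token. The sketch below restates the same idea in NumPy; the `forward` callable, shapes, and names are illustrative assumptions, not the repository's API.

```python
# Greedy decoding sketch in NumPy -- illustrative only, mirroring the C++ loop above.
import numpy as np

MAX_TOKEN_LENGTH = 80
MAX_VOCAB_SIZE = 32000
STOP_TOKENS = {2}  # </s>; the C++ code also stops on 13 (<0x0A>) after the first step


def greedy_decode(forward, tokens, token_cnt):
    """forward(tokens) -> logits of shape (1, MAX_TOKEN_LENGTH, MAX_VOCAB_SIZE)."""
    for _ in range(MAX_TOKEN_LENGTH - token_cnt):
        logits = forward(tokens)
        # Argmax over the logits of the last filled position (same role as findMaxIndex).
        next_id = int(np.argmax(logits[0, token_cnt - 1]))
        if next_id in STOP_TOKENS:
            break
        tokens[0, token_cnt] = next_id
        token_cnt += 1
    return tokens[0, :token_cnt]
```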
