Commit 3e13405

Merge branch 'releases/2024/3' into jane-intel-24/3-optimum

jane-intel authored Aug 1, 2024
2 parents 5e62aff + a295fe1, commit 3e13405

Showing 45 changed files with 2,023 additions and 265 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/causal_lm_cpp.yml
@@ -13,9 +13,9 @@ concurrency:
cancel-in-progress: true

env:
-l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz
-m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz
-w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip
+l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240719_x86_64.tgz
+m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz
+w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip
jobs:
cpp-multinomial-greedy_causal_lm-ubuntu:
runs-on: ubuntu-20.04-8-cores
7 changes: 4 additions & 3 deletions .github/workflows/genai_package.yml
@@ -5,9 +5,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
cancel-in-progress: true
env:
-l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz
-m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz
-w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip
+l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240719_x86_64.tgz
+m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz
+w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip
jobs:
ubuntu_genai_package:
strategy:
@@ -113,5 +113,6 @@ jobs:
&& cmake --install "samples build" --config ${{ matrix.build-type }} --component samples_bin --prefix samples_install
if: ${{ 'Release' != matrix.build-type }}
- run: call ov\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ ""
if: ${{ 'Release' == matrix.build-type }} # Tokenizers don't work in debug
- run: call ov\setupvars.bat && python .\ov\samples\python\multinomial_causal_lm\multinomial_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 0
if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only
6 changes: 3 additions & 3 deletions .github/workflows/genai_python_lib.yml
@@ -5,9 +5,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
cancel-in-progress: true
env:
-l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240711_x86_64.tgz
-m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz
-w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip
+l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240719_x86_64.tgz
+m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz
+w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip
jobs:
ubuntu_genai_python_lib:
# A tokenizers' dependency fails to compile on ubuntu-20, so the CentOS7 env is used.
38 changes: 0 additions & 38 deletions Dockerfile

This file was deleted.

15 changes: 8 additions & 7 deletions llm_bench/python/requirements.txt
@@ -1,17 +1,18 @@
--extra-index-url https://download.pytorch.org/whl/cpu
numpy
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-openvino
-openvino-tokenizers
-openvino_genai
+openvino~=2024.3.0
+openvino-tokenizers~=2024.3.0
+openvino_genai~=2024.3.0
auto-gptq>=0.5.1 # for gptq
pillow
-torch
-transformers>=4.40.0
+torch<2.5.0
+torchvision<0.20.0
+transformers>=4.40.0,<4.43.0
diffusers>=0.22.0
-#optimum is in dependency list of optimum-intel
+#optimum is in dependency list of optimum-intel
git+https://github.com/jane-intel/optimum-intel.git@releases/2024/3#egg=optimum-intel
-git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
+nncf~=2.12.0
packaging
psutil
timm
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
"Programming Language :: Python :: 3.12",
]
dependencies = [
"openvino_tokenizers~=2024.3.0.0"
"openvino_tokenizers~=2024.3.0.0.dev"
]

[tool.py-build-cmake.module]
1 change: 1 addition & 0 deletions samples/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm)
add_subdirectory(cpp/multinomial_causal_lm)
add_subdirectory(cpp/prompt_lookup_decoding_lm)
add_subdirectory(cpp/speculative_decoding_lm)
+add_subdirectory(cpp/benchmark_genai)

install(FILES requirements.txt DESTINATION samples
COMPONENT cpp_samples_genai)
24 changes: 24 additions & 0 deletions samples/cpp/benchmark_genai/CMakeLists.txt
@@ -0,0 +1,24 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


find_package(OpenVINOGenAI REQUIRED PATHS
"${CMAKE_BINARY_DIR}" # Reuse the package from the build.
${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO.
)

FetchContent_Declare(cxxopts
URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
FetchContent_MakeAvailable(cxxopts)

add_executable(benchmark_genai benchmark_genai.cpp)
target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts)
set_target_properties(benchmark_genai PROPERTIES
COMPILE_PDB_NAME benchmark_genai
# Ensure out of box LC_RPATH on macOS with SIP
INSTALL_RPATH_USE_LINK_PATH ON)
install(TARGETS benchmark_genai
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
47 changes: 47 additions & 0 deletions samples/cpp/benchmark_genai/README.md
@@ -0,0 +1,47 @@
# LLM benchmarking sample

This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.

## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
```
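
The sample is built together with the rest of the C++ GenAI samples (this commit adds it to `samples/CMakeLists.txt`). A build from the repository root might look like the following sketch; the build directory name is an assumption:

```sh
cmake -B build -S .
cmake --build build --target benchmark_genai --config Release
```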

## Usage

```sh
benchmark_genai [OPTIONS]
```

### Options

- `-m, --model`: Path to the model and tokenizers base directory.
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-n, --num_iter` (default: `3`): Number of iterations.
- `-d, --device` (default: `"CPU"`): Device to run the model on.

### Output

```
benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10
```

```
Load time: 3405.69 ms
Generate time: 1430.77 ± 3.04 ms
Tokenization time: 0.51 ± 0.02 ms
Detokenization time: 0.37 ± 0.01 ms
TTFT: 81.60 ± 0.54 ms
TPOT: 71.52 ± 2.72 ms
Throughput: 13.98 ± 0.53 tokens/s
```
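
Under the hood the sample relies on `PerfMetrics` objects being addable: each `generate()` call returns fresh timings, and `operator+` merges the raw samples so that `mean`/`std` cover all measured iterations. A minimal sketch of that pattern (model path and iteration counts are illustrative):

```cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    ov::genai::GenerationConfig config;
    config.max_new_tokens = 20;
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    pipe.generate("The Sky is blue because", config);  // warm-up, not measured

    auto metrics = pipe.generate("The Sky is blue because", config).perf_metrics;
    for (size_t i = 0; i < 2; ++i)  // two more measured iterations
        metrics = metrics + pipe.generate("The Sky is blue because", config).perf_metrics;

    std::cout << "TTFT: " << metrics.get_ttft().mean << " ms\n";
}
```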

For more information on how performance metrics are calculated, see the [performance-metrics tutorial](../../../src/README.md#performance-metrics).
70 changes: 70 additions & 0 deletions samples/cpp/benchmark_genai/benchmark_genai.cpp
@@ -0,0 +1,70 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include <cxxopts.hpp>

int main(int argc, char* argv[]) try {
cxxopts::Options options("benchmark_genai", "Help command");

options.add_options()
("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(3)))
("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20)))
("d,device", "device", cxxopts::value<std::string>()->default_value("CPU"))
("h,help", "Print usage");

cxxopts::ParseResult result;
try {
result = options.parse(argc, argv);
} catch (const cxxopts::exceptions::exception& e) {
std::cout << e.what() << "\n\n";
std::cout << options.help() << std::endl;
return EXIT_FAILURE;
}

if (result.count("help")) {
std::cout << options.help() << std::endl;
return EXIT_SUCCESS;
}

std::string prompt = result["prompt"].as<std::string>();
const std::string model_path = result["model"].as<std::string>();
std::string device = result["device"].as<std::string>();
size_t num_warmup = result["num_warmup"].as<size_t>();
size_t num_iter = result["num_iter"].as<size_t>();

ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();

ov::genai::LLMPipeline pipe(model_path, device);

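// Warm-up runs let the pipeline reach a steady state; they are not included in the reported metrics.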
for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, config);

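// PerfMetrics supports operator+, which merges the raw timings so mean/std cover all measured iterations.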
ov::genai::DecodedResults res = pipe.generate(prompt, config);
ov::genai::PerfMetrics metrics = res.perf_metrics;
for (size_t i = 0; i < num_iter - 1; i++) {
res = pipe.generate(prompt, config);
metrics = metrics + res.perf_metrics;
}

std::cout << std::fixed << std::setprecision(2);
std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl;
std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl;
std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl;
std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl;
std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl;
std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl;
std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl;

return 0;
} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
return EXIT_FAILURE;
} catch (...) {
std::cerr << "Non-exception object thrown\n";
return EXIT_FAILURE;
}
47 changes: 47 additions & 0 deletions samples/python/benchmark_genai/README.md
@@ -0,0 +1,47 @@
# LLM benchmarking sample

This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.

## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
```

## Usage

```sh
python benchmark_genai.py [OPTIONS]
```

### Options

- `-m, --model`: Path to the model and tokenizers base directory.
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
- `-n, --num_iter` (default: `3`): Number of iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-d, --device` (default: `"CPU"`): Device to run the model on.

### Output

```
python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10
```

```
Load time: 3405.69 ms
Generate time: 1430.77 ± 3.04 ms
Tokenization time: 0.51 ± 0.02 ms
Detokenization time: 0.37 ± 0.01 ms
TTFT: 81.60 ± 0.54 ms
TPOT: 71.52 ± 2.72 ms
Throughput: 13.98 ± 0.53 tokens/s
```
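
As in the C++ sample, `perf_metrics` objects can be accumulated with `+=`, so the mean and standard deviation cover all measured iterations. A minimal sketch of the pattern (model path and iteration counts are illustrative):

```python
import openvino_genai as ov_genai

config = ov_genai.GenerationConfig()
config.max_new_tokens = 20
pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")

prompt = ["The Sky is blue because"]  # a list input yields DecodedResults with perf_metrics
pipe.generate(prompt, config)  # warm-up, not measured

metrics = pipe.generate(prompt, config).perf_metrics
for _ in range(2):  # two more measured iterations
    metrics += pipe.generate(prompt, config).perf_metrics

print(f"TTFT: {metrics.get_ttft().mean:.2f} ms")
print(f"Throughput: {metrics.get_throughput().mean:.2f} tokens/s")
```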

For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics).
49 changes: 49 additions & 0 deletions samples/python/benchmark_genai/benchmark_genai.py
@@ -0,0 +1,49 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import openvino_genai as ov_genai

def main():
parser = argparse.ArgumentParser(description="Help command")
parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

# Perf metrics are stored in DecodedResults.
# In order to get DecodedResults instead of a string, the input should be a list.
prompt = [args.prompt]
model_path = args.model
device = args.device
num_warmup = args.num_warmup
num_iter = args.num_iter

config = ov_genai.GenerationConfig()
config.max_new_tokens = args.max_new_tokens

pipe = ov_genai.LLMPipeline(model_path, device)

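# Warm-up runs let the pipeline reach a steady state; they are not included in the reported metrics.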
for _ in range(num_warmup):
pipe.generate(prompt, config)

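# perf_metrics supports +=, which merges the raw timings so mean/std cover all measured iterations.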
res = pipe.generate(prompt, config)
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
res = pipe.generate(prompt, config)
perf_metrics += res.perf_metrics

print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms")
print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms")
print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")

if __name__ == "__main__":
main()