Commit 3e13405

Merge branch 'releases/2024/3' into jane-intel-24/3-optimum

jane-intel authored Aug 1, 2024
2 parents 5e62aff + a295fe1, commit 3e13405

Showing 45 changed files with 2,023 additions and 265 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/causal_lm_cpp.yml
@@ -13,9 +13,9 @@ concurrency:
cancel-in-progress: true

env:
-l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz
-m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz
-w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip
+l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240719_x86_64.tgz
+m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz
+w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip
jobs:
cpp-multinomial-greedy_causal_lm-ubuntu:
runs-on: ubuntu-20.04-8-cores
7 changes: 4 additions & 3 deletions .github/workflows/genai_package.yml
@@ -5,9 +5,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
cancel-in-progress: true
env:
-l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz
-m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz
-w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip
+l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240719_x86_64.tgz
+m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz
+w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip
jobs:
ubuntu_genai_package:
strategy:
@@ -113,5 +113,6 @@ jobs:
&& cmake --install "samples build" --config ${{ matrix.build-type }} --component samples_bin --prefix samples_install
if: ${{ 'Release' != matrix.build-type }}
- run: call ov\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ ""
if: ${{ 'Release' == matrix.build-type }} # Tokenizers don't work in debug
- run: call ov\setupvars.bat && python .\ov\samples\python\multinomial_causal_lm\multinomial_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 0
if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only
6 changes: 3 additions & 3 deletions .github/workflows/genai_python_lib.yml
@@ -5,9 +5,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
cancel-in-progress: true
env:
-l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240711_x86_64.tgz
-m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz
-w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip
+l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240719_x86_64.tgz
+m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240719_x86_64.tgz
+w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc2/windows/w_openvino_toolkit_windows_2024.3.0.dev20240719_x86_64.zip
jobs:
ubuntu_genai_python_lib:
# A tokenizers' dependency fails to compile on ubuntu-20, so the CentOS7 env is used.
38 changes: 0 additions & 38 deletions Dockerfile

This file was deleted.

15 changes: 8 additions & 7 deletions llm_bench/python/requirements.txt
@@ -1,17 +1,18 @@
--extra-index-url https://download.pytorch.org/whl/cpu
numpy
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-openvino
-openvino-tokenizers
-openvino_genai
+openvino~=2024.3.0
+openvino-tokenizers~=2024.3.0
+openvino_genai~=2024.3.0
auto-gptq>=0.5.1 # for gptq
pillow
-torch
-transformers>=4.40.0
+torch<2.5.0
+torchvision<0.20.0
+transformers>=4.40.0,<4.43.0
diffusers>=0.22.0
-#optimum is in dependency list of optimum-intel
+#optimum is in dependency list of optimum-intel
git+https://github.com/jane-intel/optimum-intel.git@releases/2024/3#egg=optimum-intel
-git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
+nncf~=2.12.0
packaging
psutil
timm
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
"Programming Language :: Python :: 3.12",
]
dependencies = [
"openvino_tokenizers~=2024.3.0.0"
"openvino_tokenizers~=2024.3.0.0.dev"
]

[tool.py-build-cmake.module]
1 change: 1 addition & 0 deletions samples/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm)
add_subdirectory(cpp/multinomial_causal_lm)
add_subdirectory(cpp/prompt_lookup_decoding_lm)
add_subdirectory(cpp/speculative_decoding_lm)
+add_subdirectory(cpp/benchmark_genai)

install(FILES requirements.txt DESTINATION samples
COMPONENT cpp_samples_genai)
24 changes: 24 additions & 0 deletions samples/cpp/benchmark_genai/CMakeLists.txt
@@ -0,0 +1,24 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


find_package(OpenVINOGenAI REQUIRED PATHS
"${CMAKE_BINARY_DIR}" # Reuse the package from the build.
${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO.
)

FetchContent_Declare(cxxopts
URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
FetchContent_MakeAvailable(cxxopts)

add_executable(benchmark_genai benchmark_genai.cpp)
target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts)
set_target_properties(benchmark_genai PROPERTIES
COMPILE_PDB_NAME benchmark_genai
# Ensure out of box LC_RPATH on macOS with SIP
INSTALL_RPATH_USE_LINK_PATH ON)
install(TARGETS benchmark_genai
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
47 changes: 47 additions & 0 deletions samples/cpp/benchmark_genai/README.md
@@ -0,0 +1,47 @@
# LLM benchmarking sample

This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.

## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
```
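
The sample is built together with the rest of the C++ GenAI samples (this commit adds it to `samples/CMakeLists.txt`). A build from the repository root might look like the following sketch; the build directory name is an assumption:

```sh
cmake -B build -S .
cmake --build build --target benchmark_genai --config Release
```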

## Usage

```sh
benchmark_genai [OPTIONS]
```

### Options

- `-m, --model`: Path to the model and tokenizers base directory.
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-n, --num_iter` (default: `3`): Number of iterations.
- `-d, --device` (default: `"CPU"`): Device to run the model on.

### Output

```
benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10
```

```
Load time: 3405.69 ms
Generate time: 1430.77 ± 3.04 ms
Tokenization time: 0.51 ± 0.02 ms
Detokenization time: 0.37 ± 0.01 ms
TTFT: 81.60 ± 0.54 ms
TPOT: 71.52 ± 2.72 ms
Throughput: 13.98 ± 0.53 tokens/s
```
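
Under the hood the sample relies on `PerfMetrics` objects being addable: each `generate()` call returns fresh timings, and `operator+` merges the raw samples so that `mean`/`std` cover all measured iterations. A minimal sketch of that pattern (model path and iteration counts are illustrative):

```cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    ov::genai::GenerationConfig config;
    config.max_new_tokens = 20;
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    pipe.generate("The Sky is blue because", config);  // warm-up, not measured

    auto metrics = pipe.generate("The Sky is blue because", config).perf_metrics;
    for (size_t i = 0; i < 2; ++i)  // two more measured iterations
        metrics = metrics + pipe.generate("The Sky is blue because", config).perf_metrics;

    std::cout << "TTFT: " << metrics.get_ttft().mean << " ms\n";
}
```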

For more information on how performance metrics are calculated, see the [performance-metrics tutorial](../../../src/README.md#performance-metrics).
70 changes: 70 additions & 0 deletions samples/cpp/benchmark_genai/benchmark_genai.cpp
@@ -0,0 +1,70 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include <cxxopts.hpp>

int main(int argc, char* argv[]) try {
cxxopts::Options options("benchmark_genai", "Help command");

options.add_options()
("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(3)))
("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20)))
("d,device", "device", cxxopts::value<std::string>()->default_value("CPU"))
("h,help", "Print usage");

cxxopts::ParseResult result;
try {
result = options.parse(argc, argv);
} catch (const cxxopts::exceptions::exception& e) {
std::cout << e.what() << "\n\n";
std::cout << options.help() << std::endl;
return EXIT_FAILURE;
}

if (result.count("help")) {
std::cout << options.help() << std::endl;
return EXIT_SUCCESS;
}

std::string prompt = result["prompt"].as<std::string>();
const std::string model_path = result["model"].as<std::string>();
std::string device = result["device"].as<std::string>();
size_t num_warmup = result["num_warmup"].as<size_t>();
size_t num_iter = result["num_iter"].as<size_t>();

ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();

ov::genai::LLMPipeline pipe(model_path, device);

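// Warm-up runs let the pipeline reach a steady state; they are not included in the reported metrics.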
for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, config);

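// PerfMetrics supports operator+, which merges the raw timings so mean/std cover all measured iterations.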
ov::genai::DecodedResults res = pipe.generate(prompt, config);
ov::genai::PerfMetrics metrics = res.perf_metrics;
for (size_t i = 0; i < num_iter - 1; i++) {
res = pipe.generate(prompt, config);
metrics = metrics + res.perf_metrics;
}

std::cout << std::fixed << std::setprecision(2);
std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl;
std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl;
std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl;
std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl;
std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl;
std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl;
std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl;

return 0;
} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
return EXIT_FAILURE;
} catch (...) {
std::cerr << "Non-exception object thrown\n";
return EXIT_FAILURE;
}
47 changes: 47 additions & 0 deletions samples/python/benchmark_genai/README.md
@@ -0,0 +1,47 @@
# LLM benchmarking sample

This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.

## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
```

## Usage

```sh
python benchmark_genai.py [OPTIONS]
```

### Options

- `-m, --model`: Path to the model and tokenizers base directory.
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
- `-n, --num_iter` (default: `3`): Number of iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-d, --device` (default: `"CPU"`): Device to run the model on.

### Output

```
python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10
```

```
Load time: 3405.69 ms
Generate time: 1430.77 ± 3.04 ms
Tokenization time: 0.51 ± 0.02 ms
Detokenization time: 0.37 ± 0.01 ms
TTFT: 81.60 ± 0.54 ms
TPOT: 71.52 ± 2.72 ms
Throughput: 13.98 ± 0.53 tokens/s
```
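
As in the C++ sample, `perf_metrics` objects can be accumulated with `+=`, so the mean and standard deviation cover all measured iterations. A minimal sketch of the pattern (model path and iteration counts are illustrative):

```python
import openvino_genai as ov_genai

config = ov_genai.GenerationConfig()
config.max_new_tokens = 20
pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")

prompt = ["The Sky is blue because"]  # a list input yields DecodedResults with perf_metrics
pipe.generate(prompt, config)  # warm-up, not measured

metrics = pipe.generate(prompt, config).perf_metrics
for _ in range(2):  # two more measured iterations
    metrics += pipe.generate(prompt, config).perf_metrics

print(f"TTFT: {metrics.get_ttft().mean:.2f} ms")
print(f"Throughput: {metrics.get_throughput().mean:.2f} tokens/s")
```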

For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics).
49 changes: 49 additions & 0 deletions samples/python/benchmark_genai/benchmark_genai.py
@@ -0,0 +1,49 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import openvino_genai as ov_genai

def main():
parser = argparse.ArgumentParser(description="Help command")
parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

# Perf metrics are stored in DecodedResults.
# In order to get DecodedResults instead of a string, the input should be a list.
prompt = [args.prompt]
model_path = args.model
device = args.device
num_warmup = args.num_warmup
num_iter = args.num_iter

config = ov_genai.GenerationConfig()
config.max_new_tokens = args.max_new_tokens

pipe = ov_genai.LLMPipeline(model_path, device)

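# Warm-up runs let the pipeline reach a steady state; they are not included in the reported metrics.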
for _ in range(num_warmup):
pipe.generate(prompt, config)

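# perf_metrics supports +=, which merges the raw timings so mean/std cover all measured iterations.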
res = pipe.generate(prompt, config)
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
res = pipe.generate(prompt, config)
perf_metrics += res.perf_metrics

print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms")
print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms")
print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms")
print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s")

if __name__ == "__main__":
main()