.github/workflows/causal_lm_cpp.yml

name: causal_lm_cpp
on:
  pull_request:
    paths:
      - text_generation/causal_lm/cpp/**
      - '!text_generation/causal_lm/cpp/README.md'
      - thirdparty/openvino_contrib
      - .github/workflows/causal_lm_cpp.yml
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  causal_lm_cpp:
    runs-on: ubuntu-20.04-8-cores
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - run: ./text_generation/causal_lm/cpp/set_up_and_run.sh
  cpp-beam_search_causal_lm-ubuntu:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2023.3.0-13649-bbddb891712/l_openvino_toolkit_ubuntu20_2023.3.0.dev20231214_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Download, convert and build
        run: |
          source ./ov/setupvars.sh
          python -m pip install --upgrade-strategy eager transformers==4.35.2 "optimum[openvino]>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v0.6 --output_dir ./TinyLlama-1.1B-Chat-v0.6/ --precision FP16 --stateful &
          cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
          cmake --build ./build/ --config Release -j
          wait
      - name: Compare
        run: |
          source ./ov/setupvars.sh
          python ./text_generation/causal_lm/cpp/convert_tokenizers.py ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/

          timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ 69 > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6')
          tokenized = tokenizer('69', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo 69 passed

          timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ Hi > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6')
          tokenized = tokenizer('Hi', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo Hi passed

          timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ "return 0" > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6')
          tokenized = tokenizer('return 0', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo return 0 passed

          ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ "" > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6')
          tokenized = tokenizer('', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo '""' passed

          ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ "你好！ 你好嗎？" > ./pred.txt
          python -c "
          import transformers
          with open('pred.txt', 'r') as file:
              predictions = file.read()
          tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6')
          tokenized = tokenizer('你好！ 你好嗎？', return_tensors='pt')
          for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
              ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
              idx = predictions.find(ref)
              if -1 == idx:
                  raise RuntimeError(f'Missing "{ref=}" from predictions')
              predictions = predictions[:idx] + predictions[idx + len(ref):]
          "
          echo 你好！ 你好嗎？ passed
  cpp-beam_search_causal_lm-windows:
    if: false  # TODO: enable after openvino package with fix is published
    runs-on: windows-latest
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - uses: actions/checkout@v4
        with:
          repository: TinyLlama/TinyLlama-1.1B-Chat-v0.6
          ref: bf9ae1c8bf026667e6f810768de259bb4a7f4777
          path: TinyLlama-1.1B-Chat-v0.6
          lfs: true
          github-server-url: https://huggingface.co
      - name: Install OpenVINO
        shell: bash
        run: |
          curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2023.3.0-13649-bbddb891712/w_openvino_toolkit_windows_2023.3.0.dev20231214_x86_64.zip
          unzip ov.zip
      - name: Download, convert and build
        shell: bash
        run: |
          export OpenVINO_DIR=$GITHUB_WORKSPACE/w_openvino_toolkit_windows_2023.3.0.dev20231214_x86_64/runtime/cmake
          export OPENVINO_LIB_PATHS=$GITHUB_WORKSPACE/w_openvino_toolkit_windows_2023.3.0.dev20231214_x86_64/runtime/3rdparty/tbb/bin\;$GITHUB_WORKSPACE/w_openvino_toolkit_windows_2023.3.0.dev20231214_x86_64/runtime/bin/intel64/Release
          export PATH=$OPENVINO_LIB_PATHS:$PATH
          export PYTHONPATH=./w_openvino_toolkit_windows_2023.3.0.dev20231214_x86_64/python:./w_openvino_toolkit_windows_2023.3.0.dev20231214_x86_64/python/python3
          python -m pip install --upgrade-strategy eager transformers==4.35.2 "optimum[openvino]>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v0.6 --output_dir ./TinyLlama-1.1B-Chat-v0.6/ --precision FP16 --stateful &
          cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/
          cmake --build ./build/ --config Release -j
          wait
      - name: Compare
        shell: cmd
        run: |
          call w_openvino_toolkit_windows_2023.3.0.dev20231214_x86_64\setupvars.bat
          python .\text_generation\causal_lm\cpp\convert_tokenizers.py .\TinyLlama-1.1B-Chat-v0.6\pytorch\dldt\FP16\

          .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v0.6\pytorch\dldt\FP16\ "Why is the Sun yellow?" > .\pred.txt
          echo import transformers > ref.py
          echo predictions = open('pred.txt', 'r').read() >> ref.py
          echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6') >> ref.py
          echo tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt') >> ref.py
          echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py
          echo     ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' >> ref.py
          echo     idx = predictions.find(ref) >> ref.py
          echo     if -1 == idx: >> ref.py
          echo         raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
          echo     predictions = predictions[:idx] + predictions[idx + len(ref):] >> ref.py
          python ref.py