diff --git a/.github/license_template.txt b/.github/license_template.txt
index 49875491..b43bb9dc 100644
--- a/.github/license_template.txt
+++ b/.github/license_template.txt
@@ -1,13 +1,2 @@
-Copyright (c) 2024 Intel Corporation
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
\ No newline at end of file
+Copyright (C) 2024 Intel Corporation
+SPDX-License-Identifier: Apache-2.0
\ No newline at end of file
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index e79b9686..7a1fa9e6 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,15 +1,23 @@
-## Type of Change
+## Description
-feature or bug fix or documentation or validation or others
+A summary of the proposed changes, as well as the relevant motivation and context.
-## Description
+## Issues
+
+List the issue or RFC link this PR addresses. If there is no such link, please mark it as `n/a`.
+
+## Type of change
+
+Select the type of change from the list below. Please delete options that are not relevant.
-detail description
+- [ ] Bug fix (non-breaking change which fixes an issue)
+- [ ] New feature (non-breaking change which adds new functionality)
+- [ ] Breaking change (fix or feature that would break existing design and interface)
-## How has this PR been tested?
+## Dependencies
-how to reproduce the test (including hardware information)
+List any newly introduced third-party dependencies.
-## Dependency Change?
+## Tests
-any library dependency introduced or removed
+Describe the tests that you ran to verify your changes.
diff --git a/.github/workflows/docker/hpu.dockerfile b/.github/workflows/docker/hpu.dockerfile
new file mode 100644
index 00000000..e6a35d54
--- /dev/null
+++ b/.github/workflows/docker/hpu.dockerfile
@@ -0,0 +1,25 @@
+FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest as hpu
+
+ENV LANG=en_US.UTF-8
+ENV PYTHONPATH=/root:/usr/lib/habanalabs/
+ARG REPO=https://github.com/intel/genaieval.git
+ARG REPO_PATH=""
+ARG BRANCH=main
+
+RUN apt-get update && \
+    apt-get install git-lfs && \
+    git-lfs install
+
+# Download code
+SHELL ["/bin/bash", "--login", "-c"]
+RUN mkdir -p /genaieval
+COPY ${REPO_PATH} /genaieval
+RUN if [ "$REPO_PATH" == "" ]; then rm -rf /genaieval/* && rm -rf /genaieval/.* ; git clone --single-branch --branch=${BRANCH} ${REPO} /genaieval ; fi
+
+# Build From Source
+RUN cd /genaieval && \
+    python setup.py install && \
+    pip install --upgrade-strategy eager optimum[habana] && \
+    pip list
+
+WORKDIR /genaieval/
\ No newline at end of file
diff --git a/.github/workflows/model_test.yml b/.github/workflows/model_test.yml
deleted file mode 100644
index 4f18c630..00000000
--- a/.github/workflows/model_test.yml
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2024 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: Model Test - -on: - workflow_dispatch: - -# If there is a new commit, the previous jobs will be canceled -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -permissions: write-all -env: - OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/scripts/models - SCRIPT_PATH: /GenAIEval/.github/workflows/scripts - DOCKER_NAME: "genaieval" - DOCKER_TAG: "latest" - CONTAINER_NAME: "modelTest" - - -jobs: - Evaluation-Workflow: - runs-on: aise-cluster - strategy: - matrix: - include: - - modelName: "facebook/opt-125m" - datasets: "piqa" - device: "cpu" - tasks: "text-generation" - fail-fast: true - - steps: - - name: Clean Up Working Directory - run: sudo rm -rf ${{github.workspace}}/* - - - name: Checkout out Repo - uses: actions/checkout@v4 - with: - submodules: "recursive" - fetch-tags: true - # We need this because GitHub needs to clone the branch to pipeline - - name: Docker Build - run: | - docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . - - - name: Docker Run - run: | - if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then - docker stop ${{ env.CONTAINER_NAME }} - docker rm -vf ${{ env.CONTAINER_NAME }} || true - fi - docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} -v /dev/shm:/dev/shm \ - -v ${{ github.workspace }}:/GenAIEval \ - ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} - - - name: Binary build - run: | - docker exec ${{ env.CONTAINER_NAME }} \ - bash -c "cd /GenAIEval && pip install -r requirements.txt && python setup.py install" - - #- name: Download Reference Artifact - # id: download-artifact - # uses: dawidd6/action-download-artifact@v3.1.2 - # with: - # workflow: model_test.yml - # name: ${{ matrix.device }}-${{ matrix.modelName }} - # run_id: ${{ vars.ModelTest_REF_ID }} - # path: ${{ github.workspace }}/${{ matrix.device }}_${{ matrix.modelName }}_refer_log - # name_is_regexp: true - # repo: ${{ github.repository }} - # check_artifacts: false - # search_artifacts: false - # skip_unpack: false - # if_no_artifact_found: warn - - #- name: Display structure of downloaded files - # run: ls -R - - - name: Evaluation - run: | - docker exec ${{ env.CONTAINER_NAME }} \ - bash -c "cd /GenAIEval/.github/workflows/scripts/models \ - && bash model_test.sh --model=${{ matrix.modelName }} --device=${{ matrix.device }} --datasets=${{ matrix.datasets }} --tasks=${{ matrix.tasks }}" - - - name: Collect Log - run: | - docker exec ${{ env.CONTAINER_NAME }} \ - bash -c "cd /GenAIEval/.github/workflows/scripts/models \ - && bash -x collect_log.sh --model=${{ matrix.modelName }} \ - --device=${{ matrix.device }} \ - --datasets=${{ matrix.datasets }} \ - --tasks=${{ matrix.tasks }} - - - name: Publish pipeline artifact - uses: actions/upload-artifact@v4 - if: ${{ !cancelled() }} - with: - name: ${{ matrix.device }}-${{ matrix.modelName }} - path: | - ${{ github.workspace }}/${{ matrix.device }}/${{ matrix.modelName }} - ${{ github.workspace }}/.summary.log - 
if-no-files-found: ignore # 'warn' or 'ignore' are also available, defaults to `warn` - retention-days: 60 # 1 <= retention-days <= 90 diff --git a/.github/workflows/model_test_cpu.yml b/.github/workflows/model_test_cpu.yml new file mode 100644 index 00000000..ed70411d --- /dev/null +++ b/.github/workflows/model_test_cpu.yml @@ -0,0 +1,172 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Model Test on CPU + +on: + pull_request: + branches: [main] + types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped + paths: + - .github/workflows/model_test_cpu.yml + - GenAIEval/** + - setup.py + workflow_dispatch: + +# If there is a new commit, the previous jobs will be canceled +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +permissions: write-all +env: + OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/scripts/models + SCRIPT_PATH: /GenAIEval/.github/workflows/scripts + DOCKER_NAME: "genaieval" + DOCKER_TAG: "latest" + CONTAINER_NAME: "modelTest" + + +jobs: + Evaluation-Workflow: + runs-on: aise-cluster-cpu + strategy: + matrix: + include: + - modelName: "opt-125m" + datasets: "piqa" + device: "cpu" + tasks: "text-generation" + fail-fast: true + + steps: + - name: Clean Up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Load environment variables + run: + cat ~/actions-runner4/.env >> $GITHUB_ENV + + - name: Checkout out Repo + uses: actions/checkout@v4 + with: + submodules: "recursive" + fetch-tags: true + # We need this because GitHub needs to clone the branch to pipeline + - name: Docker Build + run: | + docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
+
+      - name: Docker Run
+        run: |
+          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then
+            docker stop ${{ env.CONTAINER_NAME }}
+            docker rm -vf ${{ env.CONTAINER_NAME }} || true
+          fi
+          docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} -v /dev/shm:/dev/shm \
+          -v ${{ github.workspace }}:/GenAIEval \
+          -e http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" -e https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" \
+          ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
+
+      - name: Binary build
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval && pip install -r requirements.txt && python setup.py install"
+
+      - name: Evaluation
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval/.github/workflows/scripts/models \
+          && bash -x model_test.sh --model=${{ matrix.modelName }} --device=${{ matrix.device }} --datasets=${{ matrix.datasets }} --tasks=${{ matrix.tasks }}"
+
+      - name: Collect Log
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval/.github/workflows/scripts/models \
+          && bash -x collect_log.sh --model=${{ matrix.modelName }} \
+          --device=${{ matrix.device }} \
+          --datasets=${{ matrix.datasets }} \
+          --tasks=${{ matrix.tasks }}"
+
+      - name: Publish pipeline artifact
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: ${{ matrix.device }}-${{ matrix.tasks }}-${{ matrix.modelName }}-${{ matrix.datasets }}
+          path: |
+            ${{ github.workspace }}/${{ matrix.device }}-${{ matrix.tasks }}-${{ matrix.modelName }}-${{ matrix.datasets }}.log
+            ${{ github.workspace }}/summary.log
+          if-no-files-found: ignore # 'warn' or 'ignore' are also available, defaults to `warn`
+          retention-days: 60 # 1 <= retention-days <= 90
+
+  Generate-Report:
+    runs-on: ubuntu-latest
+    needs: [Evaluation-Workflow]
+    steps:
+      - name: Checkout out Repo
+        uses: actions/checkout@v4
+
+      - name: Download Summary Log
+        uses: actions/download-artifact@v4
+        with:
+          path: ${{ env.OUT_SCRIPT_PATH }}/log
+      - name: Display structure of downloaded files
+        run: ls -R
+      - name: Analysis Summary
+        run: |
+          cd ${{ env.OUT_SCRIPT_PATH }}
+          ls -R
+
+      - name: Download Reference Artifact
+        id: download-artifact
+        uses: dawidd6/action-download-artifact@v3.1.2
+        with:
+          workflow: model_test_cpu.yml
+          name: FinalReport
+          run_id: ${{ vars.ModelTest_CPU_REF_ID }}
+          path: ${{ env.OUT_SCRIPT_PATH }}
+          name_is_regexp: true
+          repo: ${{ github.repository }}
+          check_artifacts: false
+          search_artifacts: false
+          skip_unpack: false
+          if_no_artifact_found: warn
+
+      - name: Display structure of downloaded files
+        run: cd ${{ env.OUT_SCRIPT_PATH }}/log && ls -R
+
+      - name: Generate report
+        run: |
+          echo "------ Generating final report.html ------"
+          cd ${{ env.OUT_SCRIPT_PATH }}
+          mkdir -p generated
+          /usr/bin/bash -x generate_report.sh
+        env:
+          RUN_DISPLAY_URL: https://github.com/opea-project/GenAIEval/actions/runs/${{ github.run_id }}
+          BUILD_NUMBER: ${{ github.run_id }}
+          JOB_STATUS: succeed
+
+      - name: Publish Report
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: FinalReport
+          path: ${{ env.OUT_SCRIPT_PATH }}/generated
+
+      - name: Specify performance regression
+        if: ${{ !cancelled() }}
+        run: |
+          if [ ${{ env.is_perf_reg }} == 'true' ]; then
+            echo "[Performance Regression] Some model performance regression occurred, please check artifacts and reports."
+ exit 1 + fi diff --git a/.github/workflows/model_test_hpu.yml b/.github/workflows/model_test_hpu.yml new file mode 100644 index 00000000..1e6f2316 --- /dev/null +++ b/.github/workflows/model_test_hpu.yml @@ -0,0 +1,160 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Model Test on HPU + +on: + pull_request: + branches: [main] + types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped + paths: + - .github/workflows/model_test_hpu.yml + - GenAIEval/** + - setup.py + workflow_dispatch: + +# If there is a new commit, the previous jobs will be canceled +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +permissions: write-all +env: + OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/scripts/models + SCRIPT_PATH: /GenAIEval/.github/workflows/scripts + DOCKER_NAME: "genaieval" + DOCKER_TAG: "latest" + CONTAINER_NAME: "modelTest" + + +jobs: + Evaluation-Workflow: + runs-on: aise-cluster-hpu + strategy: + matrix: + include: + - modelName: "opt-125m" + datasets: "piqa" + device: "hpu" + tasks: "text-generation" + fail-fast: true + + steps: + - name: Clean Up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout out Repo + uses: actions/checkout@v4 + with: + submodules: "recursive" + fetch-tags: true + # We need this because GitHub needs to clone the branch to pipeline + - name: Docker Build + run: | + docker build --target hpu --build-arg REPO_PATH="." -f ${{ github.workspace }}/Docker/hpu.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
+
+      - name: Docker Run
+        run: |
+          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then
+            docker stop ${{ env.CONTAINER_NAME }}
+            docker rm -vf ${{ env.CONTAINER_NAME }} || true
+          fi
+          docker run -tid --runtime=habana --name=${{ env.CONTAINER_NAME }} -v ${{ github.workspace }}:/GenAIEval -v /dev/shm:/dev/shm ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
+
+      - name: Evaluation
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval/.github/workflows/scripts/models \
+          && bash -x model_test.sh --model=${{ matrix.modelName }} --device=${{ matrix.device }} --datasets=${{ matrix.datasets }} --tasks=${{ matrix.tasks }}"
+
+      - name: Collect Log
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval/.github/workflows/scripts/models \
+          && bash -x collect_log.sh --model=${{ matrix.modelName }} \
+          --device=${{ matrix.device }} \
+          --datasets=${{ matrix.datasets }} \
+          --tasks=${{ matrix.tasks }}"
+
+      - name: Publish pipeline artifact
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: ${{ matrix.device }}-${{ matrix.tasks }}-${{ matrix.modelName }}-${{ matrix.datasets }}
+          path: |
+            ${{ github.workspace }}/${{ matrix.device }}-${{ matrix.tasks }}-${{ matrix.modelName }}-${{ matrix.datasets }}.log
+            ${{ github.workspace }}/summary.log
+          if-no-files-found: ignore # 'warn' or 'ignore' are also available, defaults to `warn`
+          retention-days: 60 # 1 <= retention-days <= 90
+
+  Generate-Report:
+    runs-on: ubuntu-latest
+    needs: [Evaluation-Workflow]
+    steps:
+      - name: Checkout out Repo
+        uses: actions/checkout@v4
+
+      - name: Download Summary Log
+        uses: actions/download-artifact@v4
+        with:
+          path: ${{ env.OUT_SCRIPT_PATH }}/log
+      - name: Display structure of downloaded files
+        run: ls -R
+      - name: Analysis Summary
+        run: |
+          cd ${{ env.OUT_SCRIPT_PATH }}
+          ls -R
+
+      - name: Download Reference Artifact
+        id: download-artifact
+        uses: dawidd6/action-download-artifact@v3.1.2
+        with:
+          workflow: model_test_hpu.yml
+          name: FinalReport
+          run_id: ${{ vars.ModelTest_HPU_REF_ID }}
+          path: ${{ env.OUT_SCRIPT_PATH }}
+          name_is_regexp: true
+          repo: ${{ github.repository }}
+          check_artifacts: false
+          search_artifacts: false
+          skip_unpack: false
+          if_no_artifact_found: warn
+
+      - name: Display structure of downloaded files
+        run: cd ${{ env.OUT_SCRIPT_PATH }}/log && ls -R
+
+      - name: Generate report
+        run: |
+          echo "------ Generating final report.html ------"
+          cd ${{ env.OUT_SCRIPT_PATH }}
+          mkdir -p generated
+          /usr/bin/bash -x generate_report.sh
+        env:
+          RUN_DISPLAY_URL: https://github.com/opea-project/GenAIEval/actions/runs/${{ github.run_id }}
+          BUILD_NUMBER: ${{ github.run_id }}
+          JOB_STATUS: succeed
+
+      - name: Publish Report
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: FinalReport
+          path: ${{ env.OUT_SCRIPT_PATH }}/generated
+
+      - name: Specify performance regression
+        if: ${{ !cancelled() }}
+        run: |
+          if [ ${{ env.is_perf_reg }} == 'true' ]; then
+            echo "[Performance Regression] Some model performance regression occurred, please check artifacts and reports."
+            exit 1
+          fi
diff --git a/.github/workflows/scripts/models/collect_log.sh b/.github/workflows/scripts/models/collect_log.sh
index a197393a..9c22d78f 100644
--- a/.github/workflows/scripts/models/collect_log.sh
+++ b/.github/workflows/scripts/models/collect_log.sh
@@ -14,6 +14,7 @@
 # limitations under the License.
 set -eo pipefail
+set -x
 source /GenAIEval/.github/workflows/scripts/change_color
 WORKSPACE="/GenAIEval"
 # get parameters
@@ -34,14 +35,14 @@ for i in "$@"; do
   esac
 done
-log_file="/GenAIEval/${device}/${model}/${device}-${model}-${tasks}-${datasets}.log"
+log_file="/log/${device}/${model}/${device}-${tasks}-${model}-${datasets}.log"
 $BOLD_YELLOW && echo "-------- Collect logs --------" && $RESET
 echo "working in"
 pwd
 if [[ ! -f ${log_file} ]]; then
-    echo "${device};${model};${tasks};${datasets};;${logfile}" >> ${WORKSPACE}/summary.log
+    echo "${device};${model};${tasks};${datasets};;" >> ${WORKSPACE}/summary.log
 else
-    acc=$(grep -Po "Accuracy .* is:\\s+(\\d+(\\.\\d+)?)" ${log_file} | head -n 1 | sed 's/.*://;s/[^0-9.]//g')
-    echo "${device};${model};${tasks};${datasets};${acc};${logfile}" >> ${WORKSPACE}/summary.log
+    acc=$(grep -Po "acc .*(\d+(\.\d+)?)" ${log_file} | awk -F "|" '{print $2}' | head -n 1 | sed 's/.*://;s/[^0-9.]//g')
+    echo "${device};${model};${tasks};${datasets};${acc};" >> ${WORKSPACE}/summary.log
 fi
diff --git a/.github/workflows/scripts/models/generate_report.sh b/.github/workflows/scripts/models/generate_report.sh
new file mode 100644
index 00000000..4db273f5
--- /dev/null
+++ b/.github/workflows/scripts/models/generate_report.sh
@@ -0,0 +1,268 @@
+#!/bin/bash
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+WORKSPACE=generated
+last_log_path=FinalReport
+summaryLog=${WORKSPACE}/summary.log
+summaryLogLast=${last_log_path}/summary.log
+PATTERN='[-a-zA-Z0-9_]*='
+
+function main {
+    echo "summaryLog: ${summaryLog}"
+    echo "summaryLogLast: ${summaryLogLast}"
+    echo "is_perf_reg=false" >> "$GITHUB_ENV"
+    preprocessing
+    generate_html_head
+    generate_html_overview
+    generate_results
+    generate_html_footer
+}
+
+function preprocessing {
+    for file_path in log/*
+    do
+        if [[ -d ${file_path} ]] && [[ -f ${file_path}/summary.log ]]; then
+            cat ${file_path}/summary.log >> ${summaryLog}
+        fi
+    done
+}
+
+function generate_html_overview {
+    Test_Info_Title="<th colspan=4>Test Branch</th> <th colspan=4>Commit ID</th> "
+    Test_Info="<th colspan=4>${MR_source_branch}</th> <th colspan=4>${ghprbActualCommit}</th> "
+
+    cat >>${WORKSPACE}/report.html <<eof
+<body>
+    <div id="main">
+        <h1 align="center">ITREX Tests
+        [ <a href="${RUN_DISPLAY_URL}">Job-${BUILD_NUMBER}</a> ]</h1>
+        <h1 align="center">Test Status: ${JOB_STATUS}</h1>
+        <h2>Summary</h2>
+        <table class="features-table">
+            <tr>
+                <th>Repo</th>
+                ${Test_Info_Title}
+            </tr>
+            <tr>
+                <td>ITREX</td>
+                ${Test_Info}
+            </tr>
+        </table>
+eof
+}
+
+function generate_results {
+    cat >>${WORKSPACE}/report.html <<eof
+        <h2>Performance</h2>
+        <table class="features-table">
+            <tr>
+                <th>Device</th>
+                <th>Model</th>
+                <th>Tasks</th>
+                <th>Datasets</th>
+                <th>VS</th>
+                <th>Accuracy</th>
+            </tr>
+eof
+
+    devices=$(cat ${summaryLog} | cut -d';' -f1 | awk '!a[$0]++')
+    for device in ${devices[@]}; do
+        models=$(cat ${summaryLog} | grep "${device};" | cut -d';' -f2 | awk '!a[$0]++')
+        for model in ${models[@]}; do
+            tasks=$(cat ${summaryLog} | grep "${device};${model};" | cut -d';' -f3 | awk '!a[$0]++')
+            for task in ${tasks[@]}; do
+                datasets=$(cat ${summaryLog} | grep "${device};${model};${task};" | cut -d';' -f4 | awk '!a[$0]++')
+                for dataset in ${datasets[@]}; do
+                    benchmark_pattern="${device};${model};${task};${dataset};"
+                    acc=$(cat ${summaryLog} | grep "${benchmark_pattern}" | cut -d';' -f5 | awk '!a[$0]++')
+                    acc_last=nan
+                    if [ $(cat ${summaryLogLast} | grep -c "${benchmark_pattern}") != 0 ]; then
+                        acc_last=$(cat ${summaryLogLast} | grep "${benchmark_pattern}" | cut -d';' -f5 | awk '!a[$0]++')
+                    fi
+                    generate_core
+                done
+            done
+        done
+    done
+    cat >>${WORKSPACE}/report.html <<eof
+        </table>
+eof
+}
+
+function generate_core {
+    echo "<tr><td rowspan=3>${device}</td><td rowspan=3>${model}</td><td rowspan=3>${task}</td><td rowspan=3>${dataset}</td><td>New</td>" >>${WORKSPACE}/report.html
+    echo | awk -v acc=${acc} -v acc_l=${acc_last} '
+        function show_benchmark(a) {
+            if(a ~/[1-9]/) {
+                printf("<td>%.2f</td>\n",a);
+            }else {
+                printf("<td></td>\n");
+            }
+        }
+        function compare_new_last(a,b){
+            if(a ~/[1-9]/ && b ~/[1-9]/) {
+                target = b / a;
+                if(target >= 0.945) {
+                    status_png = "background-color:#90EE90";
+                }else {
+                    status_png = "background-color:#FFD2D2";
+                    job_status = "fail"
+                }
+                printf("<td style=\"%s\">%.2f</td>", status_png, target);
+            }else{
+                if(a == ""){
+                    job_status = "fail"
+                    status_png = "background-color:#FFD2D2";
+                    printf("<td style=\"%s\"></td>", status_png);
+                }else{
+                    printf("<td></td>");
+                }
+            }
+        }
+        BEGIN {
+            job_status = "pass"
+        }{
+            // current
+            show_benchmark(acc)
+            // Last
+            printf("</tr><tr><td>Last</td>\n")
+            show_benchmark(acc_l)
+            // current vs last
+            printf("</tr><tr><td>New/Last</td>\n");
+            compare_new_last(acc,acc_l)
+            printf("</tr>\n");
+        } END{
+          printf("\n%s", job_status);
+        }
+    ' >>${WORKSPACE}/report.html
+    job_state=$(tail -1 ${WORKSPACE}/report.html)
+    sed -i '$s/.*//' ${WORKSPACE}/report.html
+    if [ ${job_state} == 'fail' ]; then
+        echo "is_perf_reg=true" >> "$GITHUB_ENV"
+    fi
+}
+
+function generate_html_head {
+    cat >${WORKSPACE}/report.html <<eof
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>Daily Tests - TensorFlow - Jenkins</title>
+    <style type="text/css">
+        .features-table { width: 100%; margin: 0 auto; border-collapse: collapse; font-family: Verdana, Arial, Helvetica, sans-serif; }
+        .features-table th, .features-table td { text-align: center; height: 25px; line-height: 25px; border: 1px solid #cdcdcd; }
+    </style>
+</head>
+eof
+}
+
+function generate_html_footer {
+    cat >>${WORKSPACE}/report.html <<eof
+    </div>
+</body>
+</html>
+eof
+}
+
+main
diff --git a/.github/workflows/scripts/models/model_test.sh b/.github/workflows/scripts/models/model_test.sh
index fb420086..7d460ac2 100644
--- a/.github/workflows/scripts/models/model_test.sh
+++ b/.github/workflows/scripts/models/model_test.sh
@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -eo pipefail +set -o pipefail +set -x source /GenAIEval/.github/workflows/scripts/change_color - +git config --global --add safe.directory /GenAIEval # get parameters PATTERN='[-a-zA-Z0-9_]*=' PERF_STABLE_CHECK=true @@ -34,11 +35,7 @@ for i in "$@"; do esac done -log_dir="/GenAIEval/${device}/${model}" -mkdir -p ${log_dir} working_dir="" -$BOLD_YELLOW && echo "-------- evaluation start --------" && $RESET - main() { case ${tasks} in "text-generation") @@ -48,7 +45,21 @@ main() { *) echo "Not suppotted task"; exit 1;; esac + if [[ ${model} == *"opt"* ]]; then + pretrained="facebook/${model}" + else + pretrained="${model}" + fi + if [[ ${device} == "cpu" ]]; then + model_sourze="hf" + elif [[ ${device} == "hpu" ]]; then + model_sourze="gaudi-hf" + fi + log_dir="/log/${device}/${model}" + mkdir -p ${log_dir} + $BOLD_YELLOW && echo "-------- evaluation start --------" && $RESET run_benchmark + cp ${log_dir}/${device}-${tasks}-${model}-${datasets}.log /GenAIEval/ } function prepare() { @@ -62,22 +73,20 @@ function prepare() { else echo "Not found requirements.txt file." fi - if [[ ${device} == "hpu" ]]; then - pip install --upgrade-strategy eager optimum[habana] - fi } function run_benchmark() { cd ${working_dir} - overall_log="${log_dir}/${device}-${model}-${tasks}-${datasets}.log" + overall_log="${log_dir}/${device}-${tasks}-${model}-${datasets}.log" python main.py \ - --model hf \ - --model_args pretrained=${model} \ + --model ${model_sourze} \ + --model_args pretrained=${pretrained} \ --tasks ${datasets} \ --device ${device} \ - --batch_size 112 - 2>&1 | tee ${overall_log} + --batch_size 112 2>&1 | tee ${overall_log} + echo "print log content:" + cat ${overall_log} status=$? if [ ${status} != 0 ]; then echo "Evaluation process returned non-zero exit code." diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 615831c3..4c3807f6 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -49,7 +49,9 @@ jobs: steps: - name: Clean Up Working Directory run: sudo rm -rf ${{github.workspace}}/* - + - name: Load environment variables + run: + cat ~/actions-runner4/.env >> $GITHUB_ENV - name: Checkout out Repo uses: actions/checkout@v4 with: @@ -59,7 +61,7 @@ jobs: - name: Docker Build run: | - docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . + docker build --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
- name: Docker Run run: | @@ -68,6 +70,7 @@ jobs: docker rm -vf ${{ env.CONTAINER_NAME }} || true fi docker run -dit --memory="4g" --memory-reservation="1g" --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} --shm-size="1g" \ + -e http_proxy="${{ env.HTTP_PROXY_CONTAINER_RUN }}" -e https_proxy="${{ env.HTTPS_PROXY_CONTAINER_RUN }}" \ -v ${{ github.workspace }}:/GenAIEval ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} - name: Install Dependencies diff --git a/Docker/hpu.dockerfile b/Docker/hpu.dockerfile new file mode 100644 index 00000000..58c4ce1b --- /dev/null +++ b/Docker/hpu.dockerfile @@ -0,0 +1,25 @@ +FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest as hpu + +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/root:/usr/lib/habanalabs/ +ARG REPO=https://github.com/opea-project/GenAIEval.git +ARG REPO_PATH="" +ARG BRANCH=main + +RUN apt-get update && \ + apt-get install git-lfs && \ + git-lfs install + +# Download code +SHELL ["/bin/bash", "--login", "-c"] +RUN mkdir -p /GenAIEval +COPY ${REPO_PATH} /GenAIEval +RUN if [ "$REPO_PATH" == "" ]; then rm -rf /GenAIEval/* && rm -rf /GenAIEval/.* ; git clone --single-branch --branch=${BRANCH} ${REPO} /GenAIEval ; fi + +# Build From Source +RUN cd /GenAIEval && \ + python setup.py install && \ + pip install --upgrade-strategy eager optimum[habana] && \ + pip list + +WORKDIR /GenAIEval/ \ No newline at end of file diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py index fb77a038..90e2e3a5 100644 --- a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py +++ b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py @@ -16,22 +16,26 @@ # limitations under the License. import copy +import json import os from datetime import timedelta from pathlib import Path from typing import List, Literal, Optional, Tuple, Union +import requests as requests_obj import torch import torch.nn.functional as F import transformers from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs, find_executable_batch_size from lm_eval import utils from lm_eval.api.instance import Instance -from lm_eval.api.model import TemplateLM +from lm_eval.api.model import CacheHook, TemplateLM +from lm_eval.api.registry import register_model from lm_eval.models.utils import Collator, clear_torch_cache, get_dtype, pad_and_concat, stop_sequences_criteria from packaging import version from peft import PeftModel from peft import __version__ as PEFT_VERSION +from requests.exceptions import RequestException from tqdm import tqdm from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -1218,3 +1222,282 @@ def _model_call(self, inps): logits = logits[:, :-padding_length, :] logits = logits.to(torch.float32) return logits + + +@register_model("genai-hf") +class GenAI_HFLM(HFLM): + AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + + def __init__( + self, + base_url=None, + logits_cache: bool = True, + tokenizer: Optional[str] = None, + revision: Optional[str] = "main", + batch_size: int = 1, + max_length: Optional[int] = None, + trust_remote_code: Optional[bool] = False, + use_fast_tokenizer: Optional[bool] = True, + add_bos_token: Optional[bool] = False, + prefix_token_id: Optional[int] = None, + **kwargs, + ): + self.base_url = base_url + assert self.base_url, "must pass `base_url` to use GenaAI service!" 
+ self._rank = 0 + self._world_size = 1 + + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast=use_fast_tokenizer, + ) + + self.logits_cache = logits_cache + # select (or create) a pad token to use + if self.tokenizer.pad_token: + pass + elif self.tokenizer.unk_token: + self.tokenizer.pad_token_id = self.tokenizer.unk_token_id + elif self.tokenizer.eos_token: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + else: + if getattr(self.config, "model_type", None) == "qwen": + # Qwen's trust_remote_code tokenizer does not allow for adding special tokens + self.tokenizer.pad_token = "<|endoftext|>" + elif ( + self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer" + or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer" + ): + # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) + # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer + # --- + # Note that the world tokenizer class name, might change in the future for the final huggingface merge + # https://github.com/huggingface/transformers/pull/26963 + assert self.tokenizer.pad_token_id == 0 + else: + self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + + # TODO: override this for Gemma + self.add_bos_token = add_bos_token + if "GemmaTokenizer" in self.tokenizer.__class__.__name__: + self.add_bos_token = True + eval_logger.info( + f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it." + ) + + self._batch_size = int(batch_size) + self._max_length = max_length + self.custom_prefix_token_id = prefix_token_id + if prefix_token_id is not None: + eval_logger.info(f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}") + self.cache_hook = CacheHook(None) + self.headers = {"Content-Type": "application/json"} + + @property + def max_length(self) -> int: + if self._max_length: + return self._max_length + else: + return self._DEFAULT_MAX_LENGTH + + @property + def batch_size(self) -> int: + return self._batch_size + + def _loglikelihood_tokens( + self, + task_requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + override_bs: int = None, + ) -> List[Tuple[float, bool]]: + # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context + res = [] + + def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key for the sorted method.""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = req[1] + req[2] + return -len(toks), tuple(toks) + + def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key to group and lookup one-token continuations.""" + # Use with group_by="contexts" (optional)" + # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. 
+ # speeds up some multiple-choice tasks proportionally to the number of choices. + # groups requests by context+continuation[:-1] and infer on one request/group. + return req[-2] + req[-1][:-1] + + re_ord = Collator( + task_requests, + sort_fn=_collate, + group_by=None, + group_fn=_lookup_one_token_cont, + ) + + # automatic (variable) batch size detection for vectorization + # pull longest context sample from request + n_reordered_requests = len(re_ord) + batch_size = self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0 + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs + else None + ) + + chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn) + pbar = tqdm( + total=len(task_requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests", + ) + for chunk in chunks: + inps = [] + cont_toks_list = [] + inplens = [] + + conts = [] + encoder_attns = [] + + padding_len_inp = None + padding_len_cont = None + # because vectorizing is annoying, we first convert each (context, continuation) pair to padded + # tensors, then we pack them together into a batch, call the model, and then pick it all apart + # again because vectorizing is annoying + + for _, context_enc, continuation_enc in chunk: + # sanity check + assert len(context_enc) > 0 + assert len(continuation_enc) > 0 + assert len(continuation_enc) <= self.max_length + + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + # when too long to fit in context, truncate from the left + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1) :], + dtype=torch.long, + ) + (inplen,) = inp.shape + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + inp = torch.tensor( + (context_enc)[-self.max_length :], + dtype=torch.long, + ) + (inplen,) = inp.shape + + # build encoder attn masks + encoder_attns.append(torch.ones_like(inp)) + + cont = torch.tensor( + (continuation_enc)[-self.max_length :], + # TODO: left-shift these? + # TODO: our code assumes we never end up truncating conts for either model type + dtype=torch.long, + ) + (contlen,) = cont.shape + + conts.append(cont) + + padding_len_cont = max(padding_len_cont, contlen) if padding_len_cont is not None else contlen + + padding_len_inp = max(padding_len_inp, inplen) if padding_len_inp is not None else inplen + + inps.append(inp) # [1, inp_length] + cont_toks_list.append(continuation_enc) + inplens.append(inplen) + + # create encoder attn mask and batched conts, if seq2seq + call_kwargs = {} + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + batched_inps = pad_and_concat(padding_len_inp, inps, padding_side="right") # [batch, padding_len_inp] + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + # TODO: left-pad encoder inps and mask? 
+ batched_inps = pad_and_concat(padding_len_inp, inps) # [batch, padding_len_inp] + batched_conts = pad_and_concat(padding_len_cont, conts) # [batch, padding_len_cont] + batched_encoder_mask = pad_and_concat(padding_len_inp, encoder_attns) # [batch, padding_len_inp] + call_kwargs = { + "attn_mask": batched_encoder_mask, + "labels": batched_conts, + } + + data = { + "batched_inputs": batched_inps.tolist(), + } + try: + response = requests_obj.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + data=json.dumps(data), + ) + response.raise_for_status() + response = response.json() + except RequestException as e: + eval_logger.error(f"RequestException: {e}") + + for (request_str, ctx_tokens, _), greedy_tokens, logprobs, inplen, cont_toks in zip( + chunk, response["greedy_tokens"], response["logprobs"], inplens, cont_toks_list + ): + # Slice to original seq length + contlen = len(cont_toks) + # take only logits in the continuation + # (discard context toks if decoder-only ; discard right-padding) + # also discards + checks for "virtual tokens" in the causal LM's input window + # from prompt/prefix tuning tokens, if applicable + ctx_len = ( + inplen + (len(logprobs) - padding_len_inp) + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + else None + ) + cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq] + greedy_tokens = torch.tensor( + self._select_cont_toks(greedy_tokens, contlen=contlen, inplen=ctx_len), dtype=torch.long + ).unsqueeze( + 0 + ) # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + cont_logprobs = self._select_cont_toks(logprobs, contlen=contlen, inplen=ctx_len) + + # Answer: (log prob, is-exact-match) + answer = (sum(cont_logprobs), bool(max_equal)) + + res.append(answer) + + self.cache_hook.add_partial("loglikelihood", request_str, answer) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("loglikelihood_rolling not yet supported for GenAI service") + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + raise NotImplementedError("Not supported yet.")
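Note for reviewers: the new `genai-hf` backend above delegates all scoring to an external completion service, so it can be exercised without the harness. The sketch below is a minimal, assumed-only illustration of that request/response contract: the host `http://localhost:8080` is a placeholder for whatever `base_url` points at, and only the field names (`batched_inputs`, `greedy_tokens`, `logprobs`) are taken from the code in this diff.

```bash
# Minimal sketch of the request GenAI_HFLM._loglikelihood_tokens sends.
# http://localhost:8080 stands in for the real base_url of a running GenAI service.
curl -sS -X POST "http://localhost:8080/v1/completions" \
    -H "Content-Type: application/json" \
    -d '{"batched_inputs": [[1, 2, 3, 4, 5]]}'
# The JSON reply is expected to carry "greedy_tokens" and "logprobs" lists,
# one entry per padded input row; the backend slices these against the
# continuation tokens to produce (log prob, is-exact-match) pairs.
```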
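For context, `model_test.sh` ultimately shells out to the lm-evaluation-harness entry point inside the container. A rough manual equivalent of the CPU matrix entry (model `opt-125m`, dataset `piqa`) is sketched below; it assumes you are already in the lm-evaluation-harness working directory that `prepare()` configures, which is not shown in this diff.

```bash
# Approximate manual equivalent of run_benchmark() for the CPU matrix entry.
# Run from the lm-evaluation-harness working directory set up by prepare().
python main.py \
    --model hf \
    --model_args pretrained=facebook/opt-125m \
    --tasks piqa \
    --device cpu \
    --batch_size 112 2>&1 | tee cpu-text-generation-opt-125m-piqa.log
```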