diff --git a/.github/license_template.txt b/.github/license_template.txt
index 49875491..b43bb9dc 100644
--- a/.github/license_template.txt
+++ b/.github/license_template.txt
@@ -1,13 +1,2 @@
-Copyright (c) 2024 Intel Corporation
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
\ No newline at end of file
+Copyright (C) 2024 Intel Corporation
+SPDX-License-Identifier: Apache-2.0
\ No newline at end of file
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index e79b9686..7a1fa9e6 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,15 +1,23 @@
-## Type of Change
+## Description
-feature or bug fix or documentation or validation or others
+A summary of the proposed changes, as well as the relevant motivation and context.
-## Description
+## Issues
+
+List the issue or RFC link this PR addresses. If there is no such link, please mark it as `n/a`.
+
+## Type of change
+
+Select the type of change from the list below. Please delete options that are not relevant.
-detail description
+- [ ] Bug fix (non-breaking change which fixes an issue)
+- [ ] New feature (non-breaking change which adds new functionality)
+- [ ] Breaking change (fix or feature that would break existing design and interface)
-## How has this PR been tested?
+## Dependencies
-how to reproduce the test (including hardware information)
+List any newly introduced third-party dependencies.
-## Dependency Change?
+## Tests
-any library dependency introduced or removed
+Describe the tests that you ran to verify your changes.
diff --git a/.github/workflows/docker/hpu.dockerfile b/.github/workflows/docker/hpu.dockerfile
new file mode 100644
index 00000000..e6a35d54
--- /dev/null
+++ b/.github/workflows/docker/hpu.dockerfile
@@ -0,0 +1,25 @@
+FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest as hpu
+
+ENV LANG=en_US.UTF-8
+ENV PYTHONPATH=/root:/usr/lib/habanalabs/
+ARG REPO=https://github.com/intel/genaieval.git
+ARG REPO_PATH=""
+ARG BRANCH=main
+
+RUN apt-get update && \
+    apt-get install git-lfs && \
+    git-lfs install
+
+# Download code
+SHELL ["/bin/bash", "--login", "-c"]
+RUN mkdir -p /genaieval
+COPY ${REPO_PATH} /genaieval
+RUN if [ "$REPO_PATH" == "" ]; then rm -rf /genaieval/* && rm -rf /genaieval/.* ; git clone --single-branch --branch=${BRANCH} ${REPO} /genaieval ; fi
+
+# Build From Source
+RUN cd /genaieval && \
+    python setup.py install && \
+    pip install --upgrade-strategy eager optimum[habana] && \
+    pip list
+
+WORKDIR /genaieval/
\ No newline at end of file
diff --git a/.github/workflows/model_test.yml b/.github/workflows/model_test.yml
deleted file mode 100644
index 4f18c630..00000000
--- a/.github/workflows/model_test.yml
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2024 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: Model Test - -on: - workflow_dispatch: - -# If there is a new commit, the previous jobs will be canceled -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -permissions: write-all -env: - OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/scripts/models - SCRIPT_PATH: /GenAIEval/.github/workflows/scripts - DOCKER_NAME: "genaieval" - DOCKER_TAG: "latest" - CONTAINER_NAME: "modelTest" - - -jobs: - Evaluation-Workflow: - runs-on: aise-cluster - strategy: - matrix: - include: - - modelName: "facebook/opt-125m" - datasets: "piqa" - device: "cpu" - tasks: "text-generation" - fail-fast: true - - steps: - - name: Clean Up Working Directory - run: sudo rm -rf ${{github.workspace}}/* - - - name: Checkout out Repo - uses: actions/checkout@v4 - with: - submodules: "recursive" - fetch-tags: true - # We need this because GitHub needs to clone the branch to pipeline - - name: Docker Build - run: | - docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . - - - name: Docker Run - run: | - if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then - docker stop ${{ env.CONTAINER_NAME }} - docker rm -vf ${{ env.CONTAINER_NAME }} || true - fi - docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} -v /dev/shm:/dev/shm \ - -v ${{ github.workspace }}:/GenAIEval \ - ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} - - - name: Binary build - run: | - docker exec ${{ env.CONTAINER_NAME }} \ - bash -c "cd /GenAIEval && pip install -r requirements.txt && python setup.py install" - - #- name: Download Reference Artifact - # id: download-artifact - # uses: dawidd6/action-download-artifact@v3.1.2 - # with: - # workflow: model_test.yml - # name: ${{ matrix.device }}-${{ matrix.modelName }} - # run_id: ${{ vars.ModelTest_REF_ID }} - # path: ${{ github.workspace }}/${{ matrix.device }}_${{ matrix.modelName }}_refer_log - # name_is_regexp: true - # repo: ${{ github.repository }} - # check_artifacts: false - # search_artifacts: false - # skip_unpack: false - # if_no_artifact_found: warn - - #- name: Display structure of downloaded files - # run: ls -R - - - name: Evaluation - run: | - docker exec ${{ env.CONTAINER_NAME }} \ - bash -c "cd /GenAIEval/.github/workflows/scripts/models \ - && bash model_test.sh --model=${{ matrix.modelName }} --device=${{ matrix.device }} --datasets=${{ matrix.datasets }} --tasks=${{ matrix.tasks }}" - - - name: Collect Log - run: | - docker exec ${{ env.CONTAINER_NAME }} \ - bash -c "cd /GenAIEval/.github/workflows/scripts/models \ - && bash -x collect_log.sh --model=${{ matrix.modelName }} \ - --device=${{ matrix.device }} \ - --datasets=${{ matrix.datasets }} \ - --tasks=${{ matrix.tasks }} - - - name: Publish pipeline artifact - uses: actions/upload-artifact@v4 - if: ${{ !cancelled() }} - with: - name: ${{ matrix.device }}-${{ matrix.modelName }} - path: | - ${{ github.workspace }}/${{ matrix.device }}/${{ matrix.modelName }} - ${{ github.workspace }}/.summary.log - 
if-no-files-found: ignore # 'warn' or 'ignore' are also available, defaults to `warn` - retention-days: 60 # 1 <= retention-days <= 90 diff --git a/.github/workflows/model_test_cpu.yml b/.github/workflows/model_test_cpu.yml new file mode 100644 index 00000000..ed70411d --- /dev/null +++ b/.github/workflows/model_test_cpu.yml @@ -0,0 +1,172 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Model Test on CPU + +on: + pull_request: + branches: [main] + types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped + paths: + - .github/workflows/model_test_cpu.yml + - GenAIEval/** + - setup.py + workflow_dispatch: + +# If there is a new commit, the previous jobs will be canceled +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +permissions: write-all +env: + OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/scripts/models + SCRIPT_PATH: /GenAIEval/.github/workflows/scripts + DOCKER_NAME: "genaieval" + DOCKER_TAG: "latest" + CONTAINER_NAME: "modelTest" + + +jobs: + Evaluation-Workflow: + runs-on: aise-cluster-cpu + strategy: + matrix: + include: + - modelName: "opt-125m" + datasets: "piqa" + device: "cpu" + tasks: "text-generation" + fail-fast: true + + steps: + - name: Clean Up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Load environment variables + run: + cat ~/actions-runner4/.env >> $GITHUB_ENV + + - name: Checkout out Repo + uses: actions/checkout@v4 + with: + submodules: "recursive" + fetch-tags: true + # We need this because GitHub needs to clone the branch to pipeline + - name: Docker Build + run: | + docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
+
+      - name: Docker Run
+        run: |
+          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then
+            docker stop ${{ env.CONTAINER_NAME }}
+            docker rm -vf ${{ env.CONTAINER_NAME }} || true
+          fi
+          docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} -v /dev/shm:/dev/shm \
+          -v ${{ github.workspace }}:/GenAIEval \
+          -e http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" -e https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" \
+          ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
+
+      - name: Binary build
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval && pip install -r requirements.txt && python setup.py install"
+
+      - name: Evaluation
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval/.github/workflows/scripts/models \
+          && bash -x model_test.sh --model=${{ matrix.modelName }} --device=${{ matrix.device }} --datasets=${{ matrix.datasets }} --tasks=${{ matrix.tasks }}"
+
+      - name: Collect Log
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval/.github/workflows/scripts/models \
+          && bash -x collect_log.sh --model=${{ matrix.modelName }} \
+          --device=${{ matrix.device }} \
+          --datasets=${{ matrix.datasets }} \
+          --tasks=${{ matrix.tasks }}"
+
+      - name: Publish pipeline artifact
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: ${{ matrix.device }}-${{ matrix.tasks }}-${{ matrix.modelName }}-${{ matrix.datasets }}
+          path: |
+            ${{ github.workspace }}/${{ matrix.device }}-${{ matrix.tasks }}-${{ matrix.modelName }}-${{ matrix.datasets }}.log
+            ${{ github.workspace }}/summary.log
+          if-no-files-found: ignore # 'warn' or 'ignore' are also available, defaults to `warn`
+          retention-days: 60 # 1 <= retention-days <= 90
+
+  Generate-Report:
+    runs-on: ubuntu-latest
+    needs: [Evaluation-Workflow]
+    steps:
+      - name: Checkout out Repo
+        uses: actions/checkout@v4
+
+      - name: Download Summary Log
+        uses: actions/download-artifact@v4
+        with:
+          path: ${{ env.OUT_SCRIPT_PATH }}/log
+      - name: Display structure of downloaded files
+        run: ls -R
+      - name: Analysis Summary
+        run: |
+          cd ${{ env.OUT_SCRIPT_PATH }}
+          ls -R
+
+      - name: Download Reference Artifact
+        id: download-artifact
+        uses: dawidd6/action-download-artifact@v3.1.2
+        with:
+          workflow: model_test_cpu.yml
+          name: FinalReport
+          run_id: ${{ vars.ModelTest_CPU_REF_ID }}
+          path: ${{ env.OUT_SCRIPT_PATH }}
+          name_is_regexp: true
+          repo: ${{ github.repository }}
+          check_artifacts: false
+          search_artifacts: false
+          skip_unpack: false
+          if_no_artifact_found: warn
+
+      - name: Display structure of downloaded files
+        run: cd ${{ env.OUT_SCRIPT_PATH }}/log && ls -R
+
+      - name: Generate report
+        run: |
+          echo "------ Generating final report.html ------"
+          cd ${{ env.OUT_SCRIPT_PATH }}
+          mkdir -p generated
+          /usr/bin/bash -x generate_report.sh
+        env:
+          RUN_DISPLAY_URL: https://github.com/opea-project/GenAIEval/actions/runs/${{ github.run_id }}
+          BUILD_NUMBER: ${{ github.run_id }}
+          JOB_STATUS: succeed
+
+      - name: Publish Report
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: FinalReport
+          path: ${{ env.OUT_SCRIPT_PATH }}/generated
+
+      - name: Specify performance regression
+        if: ${{ !cancelled() }}
+        run: |
+          if [ ${{ env.is_perf_reg }} == 'true' ]; then
+            echo "[Performance Regression] Some model performance regression occurred, please check artifacts and reports."
+ exit 1 + fi diff --git a/.github/workflows/model_test_hpu.yml b/.github/workflows/model_test_hpu.yml new file mode 100644 index 00000000..1e6f2316 --- /dev/null +++ b/.github/workflows/model_test_hpu.yml @@ -0,0 +1,160 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Model Test on HPU + +on: + pull_request: + branches: [main] + types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped + paths: + - .github/workflows/model_test_hpu.yml + - GenAIEval/** + - setup.py + workflow_dispatch: + +# If there is a new commit, the previous jobs will be canceled +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +permissions: write-all +env: + OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/scripts/models + SCRIPT_PATH: /GenAIEval/.github/workflows/scripts + DOCKER_NAME: "genaieval" + DOCKER_TAG: "latest" + CONTAINER_NAME: "modelTest" + + +jobs: + Evaluation-Workflow: + runs-on: aise-cluster-hpu + strategy: + matrix: + include: + - modelName: "opt-125m" + datasets: "piqa" + device: "hpu" + tasks: "text-generation" + fail-fast: true + + steps: + - name: Clean Up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout out Repo + uses: actions/checkout@v4 + with: + submodules: "recursive" + fetch-tags: true + # We need this because GitHub needs to clone the branch to pipeline + - name: Docker Build + run: | + docker build --target hpu --build-arg REPO_PATH="." -f ${{ github.workspace }}/Docker/hpu.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
+
+      - name: Docker Run
+        run: |
+          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then
+            docker stop ${{ env.CONTAINER_NAME }}
+            docker rm -vf ${{ env.CONTAINER_NAME }} || true
+          fi
+          docker run -tid --runtime=habana --name=${{ env.CONTAINER_NAME }} -v ${{ github.workspace }}:/GenAIEval -v /dev/shm:/dev/shm ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
+
+      - name: Evaluation
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval/.github/workflows/scripts/models \
+          && bash -x model_test.sh --model=${{ matrix.modelName }} --device=${{ matrix.device }} --datasets=${{ matrix.datasets }} --tasks=${{ matrix.tasks }}"
+
+      - name: Collect Log
+        run: |
+          docker exec ${{ env.CONTAINER_NAME }} \
+          bash -c "cd /GenAIEval/.github/workflows/scripts/models \
+          && bash -x collect_log.sh --model=${{ matrix.modelName }} \
+          --device=${{ matrix.device }} \
+          --datasets=${{ matrix.datasets }} \
+          --tasks=${{ matrix.tasks }}"
+
+      - name: Publish pipeline artifact
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: ${{ matrix.device }}-${{ matrix.tasks }}-${{ matrix.modelName }}-${{ matrix.datasets }}
+          path: |
+            ${{ github.workspace }}/${{ matrix.device }}-${{ matrix.tasks }}-${{ matrix.modelName }}-${{ matrix.datasets }}.log
+            ${{ github.workspace }}/summary.log
+          if-no-files-found: ignore # 'warn' or 'ignore' are also available, defaults to `warn`
+          retention-days: 60 # 1 <= retention-days <= 90
+
+  Generate-Report:
+    runs-on: ubuntu-latest
+    needs: [Evaluation-Workflow]
+    steps:
+      - name: Checkout out Repo
+        uses: actions/checkout@v4
+
+      - name: Download Summary Log
+        uses: actions/download-artifact@v4
+        with:
+          path: ${{ env.OUT_SCRIPT_PATH }}/log
+      - name: Display structure of downloaded files
+        run: ls -R
+      - name: Analysis Summary
+        run: |
+          cd ${{ env.OUT_SCRIPT_PATH }}
+          ls -R
+
+      - name: Download Reference Artifact
+        id: download-artifact
+        uses: dawidd6/action-download-artifact@v3.1.2
+        with:
+          workflow: model_test_hpu.yml
+          name: FinalReport
+          run_id: ${{ vars.ModelTest_HPU_REF_ID }}
+          path: ${{ env.OUT_SCRIPT_PATH }}
+          name_is_regexp: true
+          repo: ${{ github.repository }}
+          check_artifacts: false
+          search_artifacts: false
+          skip_unpack: false
+          if_no_artifact_found: warn
+
+      - name: Display structure of downloaded files
+        run: cd ${{ env.OUT_SCRIPT_PATH }}/log && ls -R
+
+      - name: Generate report
+        run: |
+          echo "------ Generating final report.html ------"
+          cd ${{ env.OUT_SCRIPT_PATH }}
+          mkdir -p generated
+          /usr/bin/bash -x generate_report.sh
+        env:
+          RUN_DISPLAY_URL: https://github.com/opea-project/GenAIEval/actions/runs/${{ github.run_id }}
+          BUILD_NUMBER: ${{ github.run_id }}
+          JOB_STATUS: succeed
+
+      - name: Publish Report
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: FinalReport
+          path: ${{ env.OUT_SCRIPT_PATH }}/generated
+
+      - name: Specify performance regression
+        if: ${{ !cancelled() }}
+        run: |
+          if [ ${{ env.is_perf_reg }} == 'true' ]; then
+            echo "[Performance Regression] Some model performance regression occurred, please check artifacts and reports."
+            exit 1
+          fi
diff --git a/.github/workflows/scripts/models/collect_log.sh b/.github/workflows/scripts/models/collect_log.sh
index a197393a..9c22d78f 100644
--- a/.github/workflows/scripts/models/collect_log.sh
+++ b/.github/workflows/scripts/models/collect_log.sh
@@ -14,6 +14,7 @@
 # limitations under the License.
 set -eo pipefail
+set -x
 source /GenAIEval/.github/workflows/scripts/change_color
 WORKSPACE="/GenAIEval"
 # get parameters
@@ -34,14 +35,14 @@ for i in "$@"; do
   esac
 done
-log_file="/GenAIEval/${device}/${model}/${device}-${model}-${tasks}-${datasets}.log"
+log_file="/log/${device}/${model}/${device}-${tasks}-${model}-${datasets}.log"
 $BOLD_YELLOW && echo "-------- Collect logs --------" && $RESET
 echo "working in"
 pwd
 if [[ ! -f ${log_file} ]]; then
-    echo "${device};${model};${tasks};${datasets};;${logfile}" >> ${WORKSPACE}/summary.log
+    echo "${device};${model};${tasks};${datasets};;" >> ${WORKSPACE}/summary.log
 else
-    acc=$(grep -Po "Accuracy .* is:\\s+(\\d+(\\.\\d+)?)" ${log_file} | head -n 1 | sed 's/.*://;s/[^0-9.]//g')
-    echo "${device};${model};${tasks};${datasets};${acc};${logfile}" >> ${WORKSPACE}/summary.log
+    acc=$(grep -Po "acc .*(\d+(\.\d+)?)" ${log_file} | awk -F "|" '{print $2}' | head -n 1 | sed 's/.*://;s/[^0-9.]//g')
+    echo "${device};${model};${tasks};${datasets};${acc};" >> ${WORKSPACE}/summary.log
 fi
diff --git a/.github/workflows/scripts/models/generate_report.sh b/.github/workflows/scripts/models/generate_report.sh
new file mode 100644
index 00000000..4db273f5
--- /dev/null
+++ b/.github/workflows/scripts/models/generate_report.sh
@@ -0,0 +1,268 @@
+#!/bin/bash
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+WORKSPACE=generated
+last_log_path=FinalReport
+summaryLog=${WORKSPACE}/summary.log
+summaryLogLast=${last_log_path}/summary.log
+PATTERN='[-a-zA-Z0-9_]*='
+
+function main {
+    echo "summaryLog: ${summaryLog}"
+    echo "summaryLogLast: ${summaryLogLast}"
+    echo "is_perf_reg=false" >> "$GITHUB_ENV"
+    preprocessing
+    generate_html_head
+    generate_html_overview
+    generate_results
+    generate_html_footer
+}
+
+function preprocessing {
+    for file_path in log/*
+    do
+        if [[ -d ${file_path} ]] && [[ -f ${file_path}/summary.log ]]; then
+            cat ${file_path}/summary.log >> ${summaryLog}
+        fi
+    done
+}
+
+function generate_html_overview {
+    Test_Info_Title="<th colspan=4>Test Branch</th> <th colspan=4>Commit ID</th> "
+    Test_Info="<th colspan=4>${MR_source_branch}</th> <th colspan=4>${ghprbActualCommit}</th> "
+
+    cat >>${WORKSPACE}/report.html <<eof
+<body>
+    <div id="main">
+        <h1 align="center">ITREX Tests
+        [ <a href="${RUN_DISPLAY_URL}">Job-${BUILD_NUMBER}</a> ]</h1>
+        <h1 align="center">Test Status: ${JOB_STATUS}</h1>
+        <h2>Summary</h2>
+        <table class="features-table">
+            <tr>
+                <th>Repo</th>
+                ${Test_Info_Title}
+            </tr>
+            <tr>
+                <td>ITREX</td>
+                ${Test_Info}
+            </tr>
+        </table>
+eof
+}
+
+function generate_results {
+    cat >>${WORKSPACE}/report.html <<eof
+        <h2>Performance</h2>
+        <table class="features-table">
+            <tr>
+                <th>Device</th>
+                <th>Model</th>
+                <th>Tasks</th>
+                <th>Datasets</th>
+                <th>VS</th>
+                <th>Accuracy</th>
+            </tr>
+eof
+
+    devices=$(cat ${summaryLog} | cut -d';' -f1 | awk '!a[$0]++')
+    for device in ${devices[@]}; do
+        models=$(cat ${summaryLog} | grep "${device};" | cut -d';' -f2 | awk '!a[$0]++')
+        for model in ${models[@]}; do
+            tasks=$(cat ${summaryLog} | grep "${device};${model};" | cut -d';' -f3 | awk '!a[$0]++')
+            for task in ${tasks[@]}; do
+                datasets=$(cat ${summaryLog} | grep "${device};${model};${task};" | cut -d';' -f4 | awk '!a[$0]++')
+                for dataset in ${datasets[@]}; do
+                    benchmark_pattern="${device};${model};${task};${dataset};"
+                    acc=$(cat ${summaryLog} | grep "${benchmark_pattern}" | cut -d';' -f5 | awk '!a[$0]++')
+                    acc_last=nan
+                    if [ $(cat ${summaryLogLast} | grep -c "${benchmark_pattern}") != 0 ]; then
+                        acc_last=$(cat ${summaryLogLast} | grep "${benchmark_pattern}" | cut -d';' -f5 | awk '!a[$0]++')
+                    fi
+                    generate_core
+                done
+            done
+        done
+    done
+    cat >>${WORKSPACE}/report.html <<eof
+        </table>
+eof
+}
+
+function generate_core {
+    echo "<tr><td rowspan=3>${device}</td><td rowspan=3>${model}</td><td rowspan=3>${task}</td><td rowspan=3>${dataset}</td><td>New</td>" >>${WORKSPACE}/report.html
+    echo | awk -v acc=${acc} -v acc_l=${acc_last} '
+        function show_benchmark(a) {
+            if(a ~/[1-9]/) {
+                printf("<td>%.2f</td>\n",a);
+            }else {
+                printf("<td></td>\n");
+            }
+        }
+        function compare_new_last(a,b){
+            if(a ~/[1-9]/ && b ~/[1-9]/) {
+                target = b / a;
+                if(target >= 0.945) {
+                    status_png = "background-color:#90EE90";
+                }else {
+                    status_png = "background-color:#FFD2D2";
+                    job_status = "fail"
+                }
+                printf("<td style=\"%s\">%.2f</td>", status_png, target);
+            }else{
+                if(a == ""){
+                    job_status = "fail"
+                    status_png = "background-color:#FFD2D2";
+                    printf("<td style=\"%s\"></td>", status_png);
+                }else{
+                    printf("<td></td>");
+                }
+            }
+        }
+        BEGIN {
+            job_status = "pass"
+        }{
+            // current
+            show_benchmark(acc)
+            // Last
+            printf("</tr><tr><td>Last</td>\n")
+            show_benchmark(acc_l)
+            // current vs last
+            printf("</tr><tr><td>New/Last</td>\n");
+            compare_new_last(acc,acc_l)
+            printf("</tr>\n");
+        } END{
+          printf("\n%s", job_status);
+        }
+    ' >>${WORKSPACE}/report.html
+    job_state=$(tail -1 ${WORKSPACE}/report.html)
+    sed -i '$s/.*//' ${WORKSPACE}/report.html
+    if [ ${job_state} == 'fail' ]; then
+        echo "is_perf_reg=true" >> "$GITHUB_ENV"
+    fi
+}
+
+function generate_html_head {
+    cat >${WORKSPACE}/report.html <<eof
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>Daily Tests - TensorFlow - Jenkins</title>
+    <style type="text/css">
+        .features-table { width: 100%; margin: 0 auto; border-collapse: collapse; font-family: Verdana, Arial, Helvetica, sans-serif; }
+        .features-table th, .features-table td { text-align: center; height: 25px; line-height: 25px; border: 1px solid #cdcdcd; }
+    </style>
+</head>
+eof
+}
+
+function generate_html_footer {
+    cat >>${WORKSPACE}/report.html <<eof
+    </div>
+</body>
+</html>
+eof
+}
+
+main
diff --git a/.github/workflows/scripts/models/model_test.sh b/.github/workflows/scripts/models/model_test.sh
index fb420086..7d460ac2 100644
--- a/.github/workflows/scripts/models/model_test.sh
+++ b/.github/workflows/scripts/models/model_test.sh
@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -eo pipefail +set -o pipefail +set -x source /GenAIEval/.github/workflows/scripts/change_color - +git config --global --add safe.directory /GenAIEval # get parameters PATTERN='[-a-zA-Z0-9_]*=' PERF_STABLE_CHECK=true @@ -34,11 +35,7 @@ for i in "$@"; do esac done -log_dir="/GenAIEval/${device}/${model}" -mkdir -p ${log_dir} working_dir="" -$BOLD_YELLOW && echo "-------- evaluation start --------" && $RESET - main() { case ${tasks} in "text-generation") @@ -48,7 +45,21 @@ main() { *) echo "Not suppotted task"; exit 1;; esac + if [[ ${model} == *"opt"* ]]; then + pretrained="facebook/${model}" + else + pretrained="${model}" + fi + if [[ ${device} == "cpu" ]]; then + model_sourze="hf" + elif [[ ${device} == "hpu" ]]; then + model_sourze="gaudi-hf" + fi + log_dir="/log/${device}/${model}" + mkdir -p ${log_dir} + $BOLD_YELLOW && echo "-------- evaluation start --------" && $RESET run_benchmark + cp ${log_dir}/${device}-${tasks}-${model}-${datasets}.log /GenAIEval/ } function prepare() { @@ -62,22 +73,20 @@ function prepare() { else echo "Not found requirements.txt file." fi - if [[ ${device} == "hpu" ]]; then - pip install --upgrade-strategy eager optimum[habana] - fi } function run_benchmark() { cd ${working_dir} - overall_log="${log_dir}/${device}-${model}-${tasks}-${datasets}.log" + overall_log="${log_dir}/${device}-${tasks}-${model}-${datasets}.log" python main.py \ - --model hf \ - --model_args pretrained=${model} \ + --model ${model_sourze} \ + --model_args pretrained=${pretrained} \ --tasks ${datasets} \ --device ${device} \ - --batch_size 112 - 2>&1 | tee ${overall_log} + --batch_size 112 2>&1 | tee ${overall_log} + echo "print log content:" + cat ${overall_log} status=$? if [ ${status} != 0 ]; then echo "Evaluation process returned non-zero exit code." diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 615831c3..4c3807f6 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -49,7 +49,9 @@ jobs: steps: - name: Clean Up Working Directory run: sudo rm -rf ${{github.workspace}}/* - + - name: Load environment variables + run: + cat ~/actions-runner4/.env >> $GITHUB_ENV - name: Checkout out Repo uses: actions/checkout@v4 with: @@ -59,7 +61,7 @@ jobs: - name: Docker Build run: | - docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . + docker build --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
- name: Docker Run run: | @@ -68,6 +70,7 @@ jobs: docker rm -vf ${{ env.CONTAINER_NAME }} || true fi docker run -dit --memory="4g" --memory-reservation="1g" --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} --shm-size="1g" \ + -e http_proxy="${{ env.HTTP_PROXY_CONTAINER_RUN }}" -e https_proxy="${{ env.HTTPS_PROXY_CONTAINER_RUN }}" \ -v ${{ github.workspace }}:/GenAIEval ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} - name: Install Dependencies diff --git a/Docker/hpu.dockerfile b/Docker/hpu.dockerfile new file mode 100644 index 00000000..58c4ce1b --- /dev/null +++ b/Docker/hpu.dockerfile @@ -0,0 +1,25 @@ +FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.0:latest as hpu + +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/root:/usr/lib/habanalabs/ +ARG REPO=https://github.com/opea-project/GenAIEval.git +ARG REPO_PATH="" +ARG BRANCH=main + +RUN apt-get update && \ + apt-get install git-lfs && \ + git-lfs install + +# Download code +SHELL ["/bin/bash", "--login", "-c"] +RUN mkdir -p /GenAIEval +COPY ${REPO_PATH} /GenAIEval +RUN if [ "$REPO_PATH" == "" ]; then rm -rf /GenAIEval/* && rm -rf /GenAIEval/.* ; git clone --single-branch --branch=${BRANCH} ${REPO} /GenAIEval ; fi + +# Build From Source +RUN cd /GenAIEval && \ + python setup.py install && \ + pip install --upgrade-strategy eager optimum[habana] && \ + pip list + +WORKDIR /GenAIEval/ \ No newline at end of file diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py index fb77a038..90e2e3a5 100644 --- a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py +++ b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py @@ -16,22 +16,26 @@ # limitations under the License. import copy +import json import os from datetime import timedelta from pathlib import Path from typing import List, Literal, Optional, Tuple, Union +import requests as requests_obj import torch import torch.nn.functional as F import transformers from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs, find_executable_batch_size from lm_eval import utils from lm_eval.api.instance import Instance -from lm_eval.api.model import TemplateLM +from lm_eval.api.model import CacheHook, TemplateLM +from lm_eval.api.registry import register_model from lm_eval.models.utils import Collator, clear_torch_cache, get_dtype, pad_and_concat, stop_sequences_criteria from packaging import version from peft import PeftModel from peft import __version__ as PEFT_VERSION +from requests.exceptions import RequestException from tqdm import tqdm from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -1218,3 +1222,282 @@ def _model_call(self, inps): logits = logits[:, :-padding_length, :] logits = logits.to(torch.float32) return logits + + +@register_model("genai-hf") +class GenAI_HFLM(HFLM): + AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + + def __init__( + self, + base_url=None, + logits_cache: bool = True, + tokenizer: Optional[str] = None, + revision: Optional[str] = "main", + batch_size: int = 1, + max_length: Optional[int] = None, + trust_remote_code: Optional[bool] = False, + use_fast_tokenizer: Optional[bool] = True, + add_bos_token: Optional[bool] = False, + prefix_token_id: Optional[int] = None, + **kwargs, + ): + self.base_url = base_url + assert self.base_url, "must pass `base_url` to use GenaAI service!" 
+ self._rank = 0 + self._world_size = 1 + + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast=use_fast_tokenizer, + ) + + self.logits_cache = logits_cache + # select (or create) a pad token to use + if self.tokenizer.pad_token: + pass + elif self.tokenizer.unk_token: + self.tokenizer.pad_token_id = self.tokenizer.unk_token_id + elif self.tokenizer.eos_token: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + else: + if getattr(self.config, "model_type", None) == "qwen": + # Qwen's trust_remote_code tokenizer does not allow for adding special tokens + self.tokenizer.pad_token = "<|endoftext|>" + elif ( + self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer" + or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer" + ): + # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) + # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer + # --- + # Note that the world tokenizer class name, might change in the future for the final huggingface merge + # https://github.com/huggingface/transformers/pull/26963 + assert self.tokenizer.pad_token_id == 0 + else: + self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + + # TODO: override this for Gemma + self.add_bos_token = add_bos_token + if "GemmaTokenizer" in self.tokenizer.__class__.__name__: + self.add_bos_token = True + eval_logger.info( + f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it." + ) + + self._batch_size = int(batch_size) + self._max_length = max_length + self.custom_prefix_token_id = prefix_token_id + if prefix_token_id is not None: + eval_logger.info(f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}") + self.cache_hook = CacheHook(None) + self.headers = {"Content-Type": "application/json"} + + @property + def max_length(self) -> int: + if self._max_length: + return self._max_length + else: + return self._DEFAULT_MAX_LENGTH + + @property + def batch_size(self) -> int: + return self._batch_size + + def _loglikelihood_tokens( + self, + task_requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + override_bs: int = None, + ) -> List[Tuple[float, bool]]: + # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context + res = [] + + def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key for the sorted method.""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = req[1] + req[2] + return -len(toks), tuple(toks) + + def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key to group and lookup one-token continuations.""" + # Use with group_by="contexts" (optional)" + # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. 
+ # speeds up some multiple-choice tasks proportionally to the number of choices. + # groups requests by context+continuation[:-1] and infer on one request/group. + return req[-2] + req[-1][:-1] + + re_ord = Collator( + task_requests, + sort_fn=_collate, + group_by=None, + group_fn=_lookup_one_token_cont, + ) + + # automatic (variable) batch size detection for vectorization + # pull longest context sample from request + n_reordered_requests = len(re_ord) + batch_size = self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0 + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs + else None + ) + + chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn) + pbar = tqdm( + total=len(task_requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests", + ) + for chunk in chunks: + inps = [] + cont_toks_list = [] + inplens = [] + + conts = [] + encoder_attns = [] + + padding_len_inp = None + padding_len_cont = None + # because vectorizing is annoying, we first convert each (context, continuation) pair to padded + # tensors, then we pack them together into a batch, call the model, and then pick it all apart + # again because vectorizing is annoying + + for _, context_enc, continuation_enc in chunk: + # sanity check + assert len(context_enc) > 0 + assert len(continuation_enc) > 0 + assert len(continuation_enc) <= self.max_length + + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + # when too long to fit in context, truncate from the left + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1) :], + dtype=torch.long, + ) + (inplen,) = inp.shape + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + inp = torch.tensor( + (context_enc)[-self.max_length :], + dtype=torch.long, + ) + (inplen,) = inp.shape + + # build encoder attn masks + encoder_attns.append(torch.ones_like(inp)) + + cont = torch.tensor( + (continuation_enc)[-self.max_length :], + # TODO: left-shift these? + # TODO: our code assumes we never end up truncating conts for either model type + dtype=torch.long, + ) + (contlen,) = cont.shape + + conts.append(cont) + + padding_len_cont = max(padding_len_cont, contlen) if padding_len_cont is not None else contlen + + padding_len_inp = max(padding_len_inp, inplen) if padding_len_inp is not None else inplen + + inps.append(inp) # [1, inp_length] + cont_toks_list.append(continuation_enc) + inplens.append(inplen) + + # create encoder attn mask and batched conts, if seq2seq + call_kwargs = {} + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + batched_inps = pad_and_concat(padding_len_inp, inps, padding_side="right") # [batch, padding_len_inp] + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + # TODO: left-pad encoder inps and mask? 
+ batched_inps = pad_and_concat(padding_len_inp, inps) # [batch, padding_len_inp] + batched_conts = pad_and_concat(padding_len_cont, conts) # [batch, padding_len_cont] + batched_encoder_mask = pad_and_concat(padding_len_inp, encoder_attns) # [batch, padding_len_inp] + call_kwargs = { + "attn_mask": batched_encoder_mask, + "labels": batched_conts, + } + + data = { + "batched_inputs": batched_inps.tolist(), + } + try: + response = requests_obj.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + data=json.dumps(data), + ) + response.raise_for_status() + response = response.json() + except RequestException as e: + eval_logger.error(f"RequestException: {e}") + + for (request_str, ctx_tokens, _), greedy_tokens, logprobs, inplen, cont_toks in zip( + chunk, response["greedy_tokens"], response["logprobs"], inplens, cont_toks_list + ): + # Slice to original seq length + contlen = len(cont_toks) + # take only logits in the continuation + # (discard context toks if decoder-only ; discard right-padding) + # also discards + checks for "virtual tokens" in the causal LM's input window + # from prompt/prefix tuning tokens, if applicable + ctx_len = ( + inplen + (len(logprobs) - padding_len_inp) + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + else None + ) + cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq] + greedy_tokens = torch.tensor( + self._select_cont_toks(greedy_tokens, contlen=contlen, inplen=ctx_len), dtype=torch.long + ).unsqueeze( + 0 + ) # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + cont_logprobs = self._select_cont_toks(logprobs, contlen=contlen, inplen=ctx_len) + + # Answer: (log prob, is-exact-match) + answer = (sum(cont_logprobs), bool(max_equal)) + + res.append(answer) + + self.cache_hook.add_partial("loglikelihood", request_str, answer) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("loglikelihood_rolling not yet supported for GenAI service") + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + raise NotImplementedError("Not supported yet.")
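Note for reviewers: the new `genai-hf` backend above delegates all scoring to an external completion service, so it can be exercised without the harness. The sketch below is a minimal, assumed-only illustration of that request/response contract: the host `http://localhost:8080` is a placeholder for whatever `base_url` points at, and only the field names (`batched_inputs`, `greedy_tokens`, `logprobs`) are taken from the code in this diff.

```bash
# Minimal sketch of the request GenAI_HFLM._loglikelihood_tokens sends.
# http://localhost:8080 stands in for the real base_url of a running GenAI service.
curl -sS -X POST "http://localhost:8080/v1/completions" \
    -H "Content-Type: application/json" \
    -d '{"batched_inputs": [[1, 2, 3, 4, 5]]}'
# The JSON reply is expected to carry "greedy_tokens" and "logprobs" lists,
# one entry per padded input row; the backend slices these against the
# continuation tokens to produce (log prob, is-exact-match) pairs.
```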
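For context, `model_test.sh` ultimately shells out to the lm-evaluation-harness entry point inside the container. A rough manual equivalent of the CPU matrix entry (model `opt-125m`, dataset `piqa`) is sketched below; it assumes you are already in the lm-evaluation-harness working directory that `prepare()` configures, which is not shown in this diff.

```bash
# Approximate manual equivalent of run_benchmark() for the CPU matrix entry.
# Run from the lm-evaluation-harness working directory set up by prepare().
python main.py \
    --model hf \
    --model_args pretrained=facebook/opt-125m \
    --tasks piqa \
    --device cpu \
    --batch_size 112 2>&1 | tee cpu-text-generation-opt-125m-piqa.log
```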