diff --git a/.github/workflows/model_test_cpu.yml b/.github/workflows/model_test_cpu.yml
index ed70411d..498ded08 100644
--- a/.github/workflows/model_test_cpu.yml
+++ b/.github/workflows/model_test_cpu.yml
@@ -53,10 +53,6 @@ jobs:
- name: Clean Up Working Directory
run: sudo rm -rf ${{github.workspace}}/*
- - name: Load environment variables
- run:
- cat ~/actions-runner4/.env >> $GITHUB_ENV
-
- name: Checkout Repo
uses: actions/checkout@v4
with:
@@ -65,7 +61,7 @@ jobs:
# We need this because GitHub needs to clone the branch into the pipeline
- name: Docker Build
run: |
- docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
+ docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
- name: Docker Run
run: |
@@ -74,9 +70,7 @@ jobs:
docker rm -vf ${{ env.CONTAINER_NAME }} || true
fi
docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} -v /dev/shm:/dev/shm \
- -v ${{ github.workspace }}:/GenAIEval \
- -e http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" -e https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" \
- ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
+ -v ${{ github.workspace }}:/GenAIEval ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
- name: Binary build
run: |
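With the proxy build-args and the runner's environment file gone, the CPU workflow's build and run steps reduce to plain docker commands. A minimal local reproduction sketch, with placeholder values standing in for the DOCKER_NAME, DOCKER_TAG, and CONTAINER_NAME variables the workflow reads from its environment:

```shell
# Hedged local sketch of the simplified CI steps; image/container names are assumptions.
docker build -f .github/workflows/docker/common.dockerfile -t genaieval:latest .
docker rm -vf genaieval-ci 2>/dev/null || true
docker run -dit --disable-content-trust --privileged --name=genaieval-ci \
  -v /dev/shm:/dev/shm -v "$PWD":/GenAIEval genaieval:latest
```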
diff --git a/.github/workflows/model_test_hpu.yml b/.github/workflows/model_test_hpu.yml
index 1e6f2316..4a99de9c 100644
--- a/.github/workflows/model_test_hpu.yml
+++ b/.github/workflows/model_test_hpu.yml
@@ -61,7 +61,7 @@ jobs:
# We need this because GitHub needs to clone the branch into the pipeline
- name: Docker Build
run: |
- docker build --target hpu --build-arg REPO_PATH="." -f ${{ github.workspace }}/Docker/hpu.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
+ docker build --target hpu --build-arg REPO_PATH="." -f ${{ github.workspace }}/docker/hpu.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
- name: Docker Run
run: |
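The only change here is directory case: the dockerfile moves from Docker/ to docker/ (see the rename hunk later in this diff), and the build command must match it exactly on case-sensitive filesystems. A quick pre-flight check, assuming the repository root as the working directory:

```shell
# Verify the lowercase path exists before the CI build runs (sketch, not part of the workflow).
test -f docker/hpu.dockerfile || { echo "docker/hpu.dockerfile not found"; exit 1; }
```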
diff --git a/.github/workflows/scripts/models/generate_report.sh b/.github/workflows/scripts/models/generate_report.sh
index 4db273f5..8d5f2c71 100644
--- a/.github/workflows/scripts/models/generate_report.sh
+++ b/.github/workflows/scripts/models/generate_report.sh
@@ -48,7 +48,7 @@ function generate_html_overview {
-    ITREX Tests
+    GenAIEval Tests
Test Status: ${JOB_STATUS}
Summary
@@ -58,7 +58,7 @@ function generate_html_overview {
${Test_Info_Title}
- ITREX |
+ GenAIEval |
${Test_Info}
diff --git a/.github/workflows/scripts/models/model_test.sh b/.github/workflows/scripts/models/model_test.sh
index 7d460ac2..92e55090 100644
--- a/.github/workflows/scripts/models/model_test.sh
+++ b/.github/workflows/scripts/models/model_test.sh
@@ -39,9 +39,9 @@ working_dir=""
main() {
case ${tasks} in
"text-generation")
- working_dir="/GenAIEval/GenAIEval/evaluation/lm_evaluation_harness/examples";;
+ working_dir="/GenAIEval/evals/evaluation/lm_evaluation_harness/examples";;
"code-generation")
- working_dir="/GenAIEval/GenAIEval/evaluation/bigcode_evaluation_harness/examples";;
+ working_dir="/GenAIEval/evals/evaluation/bigcode_evaluation_harness/examples";;
*)
echo "Not suppotted task"; exit 1;;
esac
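Since the harness example directories moved under evals/, a cheap guard against running a stale container image is to confirm both working directories exist before dispatching a task. A sketch using the paths from the case statement above:

```shell
# Sanity-check the renamed example directories inside the container (sketch).
for d in \
  /GenAIEval/evals/evaluation/lm_evaluation_harness/examples \
  /GenAIEval/evals/evaluation/bigcode_evaluation_harness/examples; do
  [ -d "$d" ] || { echo "missing: $d"; exit 1; }
done
```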
diff --git a/.github/workflows/scripts/unittest/compare_coverage.sh b/.github/workflows/scripts/unittest/compare_coverage.sh
index 88a4e1e5..55b75f44 100644
--- a/.github/workflows/scripts/unittest/compare_coverage.sh
+++ b/.github/workflows/scripts/unittest/compare_coverage.sh
@@ -20,7 +20,7 @@ coverage_PR_lines_rate=$5
coverage_base_lines_rate=$6
coverage_PR_branches_rate=$7
coverage_base_branches_rate=$8
-module_name="GenAIEval"
+module_name="evals"
[[ ! -f $coverage_pr_log ]] && exit 1
[[ ! -f $coverage_base_log ]] && exit 1
file_name="./coverage_compare"
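module_name now has to match the renamed import package so the PR and base coverage logs line up. This diff does not show how those logs are produced, so the following is a plausible sketch assuming coverage.py and pytest:

```shell
# Hedged sketch of generating a coverage log for the renamed "evals" package.
coverage run --source=evals -m pytest tests/
coverage report > coverage_pr.log
```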
diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 4c3807f6..cc4a2712 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -49,9 +49,6 @@ jobs:
steps:
- name: Clean Up Working Directory
run: sudo rm -rf ${{github.workspace}}/*
- - name: Load environment variables
- run:
- cat ~/actions-runner4/.env >> $GITHUB_ENV
- name: Checkout Repo
uses: actions/checkout@v4
with:
@@ -61,7 +58,7 @@ jobs:
- name: Docker Build
run: |
- docker build --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
+ docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
- name: Docker Run
run: |
@@ -70,7 +67,6 @@ jobs:
docker rm -vf ${{ env.CONTAINER_NAME }} || true
fi
docker run -dit --memory="4g" --memory-reservation="1g" --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} --shm-size="1g" \
- -e http_proxy="${{ env.HTTP_PROXY_CONTAINER_RUN }}" -e https_proxy="${{ env.HTTPS_PROXY_CONTAINER_RUN }}" \
-v ${{ github.workspace }}:/GenAIEval ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
- name: Install Dependencies
diff --git a/README.md b/README.md
index d4b4e2a3..23838d92 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ For evaluating the models on text-generation tasks, we follow the [lm-evaluation
```shell
# pip install --upgrade-strategy eager optimum[habana]
-cd GenAIEval/evaluation/lm_evaluation_harness/examples
+cd evals/evaluation/lm_evaluation_harness/examples
python main.py \
--model gaudi-hf \
--model_args pretrained=EleutherAI/gpt-j-6B \
@@ -29,7 +29,7 @@ python main.py \
##### CPU
```shell
-cd GenAIEval/evaluation/lm_evaluation_harness/examples
+cd evals/evaluation/lm_evaluation_harness/examples
python main.py \
--model hf \
--model_args pretrained=EleutherAI/gpt-j-6B \
@@ -39,7 +39,7 @@ python main.py \
```
#### function call usage
```python
-from GenAIEval.evaluation.lm_evaluation_harness import LMEvalParser, evaluate
+from evals.evaluation.lm_evaluation_harness import LMEvalParser, evaluate
args = LMEvalParser(
model="hf",
@@ -69,7 +69,7 @@ docker run -p 9006:9006 --ipc=host -e MODEL="hf" -e MODEL_ARGS="pretrained=Inte
- set `base_url`, `tokenizer` and `--model genai-hf`
```
-cd GenAIEval/evaluation/lm_evaluation_harness/examples
+cd evals/evaluation/lm_evaluation_harness/examples
python main.py \
--model genai-hf \
@@ -83,7 +83,7 @@ For evaluating the models on coding tasks or specifically coding LLMs, we follow
#### command line usage
```shell
-cd GenAIEval/evaluation/bigcode_evaluation_harness/examples
+cd evals/evaluation/bigcode_evaluation_harness/examples
python main.py \
--model "codeparrot/codeparrot-small" \
--tasks "humaneval" \
@@ -93,7 +93,7 @@ python main.py \
```
#### function call usage
```python
-from GenAIEval.evaluation.bigcode_evaluation_harness import BigcodeEvalParser, evaluate
+from evals.evaluation.bigcode_evaluation_harness import BigcodeEvalParser, evaluate
args = BigcodeEvalParser(
user_model=user_model,
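After this README update every documented path and import goes through evals. A quick way to catch any doc or code still referencing the old package name, assuming a checkout root:

```shell
# Grep for stale references to the old package path (sketch; prints matches if any remain).
grep -rn "GenAIEval\.evaluation" --include="*.py" --include="*.md" . || echo "no stale references"
```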
diff --git a/Docker/hpu.dockerfile b/docker/hpu.dockerfile
similarity index 100%
rename from Docker/hpu.dockerfile
rename to docker/hpu.dockerfile
diff --git a/GenAIEval/__init__.py b/evals/__init__.py
similarity index 100%
rename from GenAIEval/__init__.py
rename to evals/__init__.py
diff --git a/GenAIEval/benchmark/__init__.py b/evals/benchmark/__init__.py
similarity index 100%
rename from GenAIEval/benchmark/__init__.py
rename to evals/benchmark/__init__.py
diff --git a/GenAIEval/benchmark/chatqna_benchmark.py b/evals/benchmark/chatqna_benchmark.py
similarity index 100%
rename from GenAIEval/benchmark/chatqna_benchmark.py
rename to evals/benchmark/chatqna_benchmark.py
diff --git a/GenAIEval/benchmark/data.json b/evals/benchmark/data.json
similarity index 100%
rename from GenAIEval/benchmark/data.json
rename to evals/benchmark/data.json
diff --git a/GenAIEval/evaluation/__init__.py b/evals/evaluation/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/__init__.py
rename to evals/evaluation/__init__.py
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/__init__.py b/evals/evaluation/bigcode_evaluation_harness/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/__init__.py
rename to evals/evaluation/bigcode_evaluation_harness/__init__.py
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/accuracy.py b/evals/evaluation/bigcode_evaluation_harness/accuracy.py
similarity index 100%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/accuracy.py
rename to evals/evaluation/bigcode_evaluation_harness/accuracy.py
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/arguments.py b/evals/evaluation/bigcode_evaluation_harness/arguments.py
similarity index 100%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/arguments.py
rename to evals/evaluation/bigcode_evaluation_harness/arguments.py
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py b/evals/evaluation/bigcode_evaluation_harness/examples/main.py
similarity index 90%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py
rename to evals/evaluation/bigcode_evaluation_harness/examples/main.py
index 1b998c04..bef7f494 100644
--- a/GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py
+++ b/evals/evaluation/bigcode_evaluation_harness/examples/main.py
@@ -14,7 +14,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from GenAIEval.evaluation.bigcode_evaluation_harness import evaluate, setup_parser
+from evals.evaluation.bigcode_evaluation_harness import evaluate, setup_parser
def main():
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/__init__.py b/evals/evaluation/lm_evaluation_harness/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/__init__.py
rename to evals/evaluation/lm_evaluation_harness/__init__.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/accuracy.py b/evals/evaluation/lm_evaluation_harness/accuracy.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/accuracy.py
rename to evals/evaluation/lm_evaluation_harness/accuracy.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/arguments.py b/evals/evaluation/lm_evaluation_harness/arguments.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/arguments.py
rename to evals/evaluation/lm_evaluation_harness/arguments.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/examples/main.py b/evals/evaluation/lm_evaluation_harness/examples/main.py
similarity index 90%
rename from GenAIEval/evaluation/lm_evaluation_harness/examples/main.py
rename to evals/evaluation/lm_evaluation_harness/examples/main.py
index 15b23d2a..ee61377e 100644
--- a/GenAIEval/evaluation/lm_evaluation_harness/examples/main.py
+++ b/evals/evaluation/lm_evaluation_harness/examples/main.py
@@ -15,7 +15,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from GenAIEval.evaluation.lm_evaluation_harness import evaluate, setup_parser
+from evals.evaluation.lm_evaluation_harness import evaluate, setup_parser
def main():
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/__init__.py b/evals/evaluation/lm_evaluation_harness/lm_eval/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/lm_eval/__init__.py
rename to evals/evaluation/lm_evaluation_harness/lm_eval/__init__.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/evaluator.py b/evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
rename to evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/__init__.py b/evals/evaluation/lm_evaluation_harness/lm_eval/models/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/__init__.py
rename to evals/evaluation/lm_evaluation_harness/lm_eval/models/__init__.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py b/evals/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py
rename to evals/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py
diff --git a/setup.py b/setup.py
index 56838559..8f51bda0 100644
--- a/setup.py
+++ b/setup.py
@@ -26,8 +26,8 @@ def parse_requirements(filename):
setup(
- name="GenAIEval",
- version="0.0.0",
+ name="opea_eval",
+ version="0.6",
author="Intel AISE AIPC Team",
author_email="haihao.shen@intel.com, feng.tian@intel.com, chang1.wang@intel.com, kaokao.lv@intel.com",
description="Evaluation and benchmark for Generative AI",
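Note the split this hunk introduces: the distribution name becomes opea_eval (version 0.6) while the import package is evals. A smoke test from a local checkout, assuming pip and the renamed layout:

```shell
# Install the renamed distribution and confirm the new import path (sketch).
pip install .
python -c "from evals.evaluation.lm_evaluation_harness import LMEvalParser; print('ok')"
```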
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 3439940c..cc3859dd 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1 +1,2 @@
+bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
lm-eval==0.4.2
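The bigcode harness is pinned to a specific upstream commit rather than a release, so test environments should install straight from this file:

```shell
# Install the pinned test dependencies, including the git-pinned bigcode harness.
pip install -r tests/requirements.txt
```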
diff --git a/tests/test_bigcode_eval.py b/tests/test_bigcode_eval.py
index d57e8a51..09e3f139 100644
--- a/tests/test_bigcode_eval.py
+++ b/tests/test_bigcode_eval.py
@@ -19,7 +19,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
-from GenAIEval.evaluation.bigcode_evaluation_harness import BigcodeEvalParser, evaluate
+from evals.evaluation.bigcode_evaluation_harness import BigcodeEvalParser, evaluate
class TestLMEval(unittest.TestCase):
diff --git a/tests/test_lm_eval.py b/tests/test_lm_eval.py
index c5e49e14..1f8f4f63 100644
--- a/tests/test_lm_eval.py
+++ b/tests/test_lm_eval.py
@@ -19,7 +19,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
-from GenAIEval.evaluation.lm_evaluation_harness import LMEvalParser, evaluate
+from evals.evaluation.lm_evaluation_harness import LMEvalParser, evaluate
class TestLMEval(unittest.TestCase):
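With the imports updated, both test modules exercise the renamed package directly. Running them locally, assuming the dependencies above are installed:

```shell
# Run the updated unit tests against the renamed "evals" package (sketch).
python -m pytest tests/test_bigcode_eval.py tests/test_lm_eval.py -v
```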