diff --git a/.github/workflows/model_test_cpu.yml b/.github/workflows/model_test_cpu.yml
index ed70411d..498ded08 100644
--- a/.github/workflows/model_test_cpu.yml
+++ b/.github/workflows/model_test_cpu.yml
@@ -53,10 +53,6 @@ jobs:
       - name: Clean Up Working Directory
         run: sudo rm -rf ${{github.workspace}}/*
 
-      - name: Load environment variables
-        run:
-          cat ~/actions-runner4/.env >> $GITHUB_ENV
-
       - name: Checkout out Repo
         uses: actions/checkout@v4
         with:
@@ -65,7 +61,7 @@
           # We need this because GitHub needs to clone the branch to pipeline
       - name: Docker Build
         run: |
-          docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
+          docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
 
       - name: Docker Run
         run: |
@@ -74,9 +70,7 @@
             docker rm -vf ${{ env.CONTAINER_NAME }} || true
           fi
           docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} -v /dev/shm:/dev/shm \
-          -v ${{ github.workspace }}:/GenAIEval \
-          -e http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" -e https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" \
-          ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
+          -v ${{ github.workspace }}:/GenAIEval ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
 
       - name: Binary build
         run: |
diff --git a/.github/workflows/model_test_hpu.yml b/.github/workflows/model_test_hpu.yml
index 1e6f2316..4a99de9c 100644
--- a/.github/workflows/model_test_hpu.yml
+++ b/.github/workflows/model_test_hpu.yml
@@ -61,7 +61,7 @@ jobs:
           # We need this because GitHub needs to clone the branch to pipeline
       - name: Docker Build
         run: |
-          docker build --target hpu --build-arg REPO_PATH="." -f ${{ github.workspace }}/Docker/hpu.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
+          docker build --target hpu --build-arg REPO_PATH="." -f ${{ github.workspace }}/docker/hpu.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
 
       - name: Docker Run
         run: |
diff --git a/.github/workflows/scripts/models/generate_report.sh b/.github/workflows/scripts/models/generate_report.sh
index 4db273f5..8d5f2c71 100644
--- a/.github/workflows/scripts/models/generate_report.sh
+++ b/.github/workflows/scripts/models/generate_report.sh
@@ -48,7 +48,7 @@ function generate_html_overview {
-            ITREX Tests
+            GenAIEval Tests
             [ Job-${BUILD_NUMBER} ]
             Test Status: ${JOB_STATUS}
             Summary
@@ -58,7 +58,7 @@ function generate_html_overview {
             ${Test_Info_Title}
-            ITREX
+            GenAIEval
             ${Test_Info}
diff --git a/.github/workflows/scripts/models/model_test.sh b/.github/workflows/scripts/models/model_test.sh
index 7d460ac2..92e55090 100644
--- a/.github/workflows/scripts/models/model_test.sh
+++ b/.github/workflows/scripts/models/model_test.sh
@@ -39,9 +39,9 @@ working_dir=""
 
 main() {
     case ${tasks} in
        "text-generation")
-            working_dir="/GenAIEval/GenAIEval/evaluation/lm_evaluation_harness/examples";;
+            working_dir="/GenAIEval/evals/evaluation/lm_evaluation_harness/examples";;
        "code-generation")
-            working_dir="/GenAIEval/GenAIEval/evaluation/bigcode_evaluation_harness/examples";;
+            working_dir="/GenAIEval/evals/evaluation/bigcode_evaluation_harness/examples";;
        *) echo "Not suppotted task"; exit 1;;
    esac
diff --git a/.github/workflows/scripts/unittest/compare_coverage.sh b/.github/workflows/scripts/unittest/compare_coverage.sh
index 88a4e1e5..55b75f44 100644
--- a/.github/workflows/scripts/unittest/compare_coverage.sh
+++ b/.github/workflows/scripts/unittest/compare_coverage.sh
@@ -20,7 +20,7 @@ coverage_PR_lines_rate=$5
 coverage_base_lines_rate=$6
 coverage_PR_branches_rate=$7
 coverage_base_branches_rate=$8
-module_name="GenAIEval"
+module_name="evals"
 [[ ! -f $coverage_pr_log ]] && exit 1
 [[ ! -f $coverage_base_log ]] && exit 1
 file_name="./coverage_compare"
diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 4c3807f6..cc4a2712 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -49,9 +49,6 @@ jobs:
     steps:
       - name: Clean Up Working Directory
         run: sudo rm -rf ${{github.workspace}}/*
-      - name: Load environment variables
-        run:
-          cat ~/actions-runner4/.env >> $GITHUB_ENV
       - name: Checkout out Repo
         uses: actions/checkout@v4
         with:
@@ -61,7 +58,7 @@
 
       - name: Docker Build
         run: |
-          docker build --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
+          docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} .
       - name: Docker Run
         run: |
@@ -70,7 +67,6 @@
             docker rm -vf ${{ env.CONTAINER_NAME }} || true
           fi
           docker run -dit --memory="4g" --memory-reservation="1g" --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} --shm-size="1g" \
-          -e http_proxy="${{ env.HTTP_PROXY_CONTAINER_RUN }}" -e https_proxy="${{ env.HTTPS_PROXY_CONTAINER_RUN }}" \
           -v ${{ github.workspace }}:/GenAIEval ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }}
 
       - name: Install Dependencies
diff --git a/README.md b/README.md
index d4b4e2a3..23838d92 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ For evaluating the models on text-generation tasks, we follow the [lm-evaluation
 ```shell
 # pip install --upgrade-strategy eager optimum[habana]
-cd GenAIEval/evaluation/lm_evaluation_harness/examples
+cd evals/evaluation/lm_evaluation_harness/examples
 python main.py \
     --model gaudi-hf \
     --model_args pretrained=EleutherAI/gpt-j-6B \
@@ -29,7 +29,7 @@ python main.py \
 ##### CPU
 
 ```shell
-cd GenAIEval/evaluation/lm_evaluation_harness/examples
+cd evals/evaluation/lm_evaluation_harness/examples
 python main.py \
     --model hf \
     --model_args pretrained=EleutherAI/gpt-j-6B \
@@ -39,7 +39,7 @@ python main.py \
 ```
 #### function call usage
 ```python
-from GenAIEval.evaluation.lm_evaluation_harness import LMEvalParser, evaluate
+from evals.evaluation.lm_evaluation_harness import LMEvalParser, evaluate
 
 args = LMevalParser(
     model="hf",
@@ -69,7 +69,7 @@ docker run -p 9006:9006 --ipc=host -e MODEL="hf" -e MODEL_ARGS="pretrained=Inte
 - set `base_url`, `tokenizer` and `--model genai-hf`
 
 ```
-cd GenAIEval/evaluation/lm_evaluation_harness/examples
+cd evals/evaluation/lm_evaluation_harness/examples
 
 python main.py \
     --model genai-hf \
@@ -83,7 +83,7 @@ For evaluating the models on coding tasks or specifically coding LLMs, we follow
 #### command line usage
 
 ```shell
-cd GenAIEval/evaluation/bigcode_evaluation_harness/examples
+cd evals/evaluation/bigcode_evaluation_harness/examples
 python main.py \
     --model "codeparrot/codeparrot-small" \
     --tasks "humaneval" \
@@ -93,7 +93,7 @@ python main.py \
 ```
 #### function call usage
 ```python
-from GenAIEval.evaluation.bigcode_evaluation_harness import BigcodeEvalParser, evaluate
+from evals.evaluation.bigcode_evaluation_harness import BigcodeEvalParser, evaluate
 
 args = BigcodeEvalParser(
     user_model=user_model,
diff --git a/Docker/hpu.dockerfile b/docker/hpu.dockerfile
similarity index 100%
rename from Docker/hpu.dockerfile
rename to docker/hpu.dockerfile
diff --git a/GenAIEval/__init__.py b/evals/__init__.py
similarity index 100%
rename from GenAIEval/__init__.py
rename to evals/__init__.py
diff --git a/GenAIEval/benchmark/__init__.py b/evals/benchmark/__init__.py
similarity index 100%
rename from GenAIEval/benchmark/__init__.py
rename to evals/benchmark/__init__.py
diff --git a/GenAIEval/benchmark/chatqna_benchmark.py b/evals/benchmark/chatqna_benchmark.py
similarity index 100%
rename from GenAIEval/benchmark/chatqna_benchmark.py
rename to evals/benchmark/chatqna_benchmark.py
diff --git a/GenAIEval/benchmark/data.json b/evals/benchmark/data.json
similarity index 100%
rename from GenAIEval/benchmark/data.json
rename to evals/benchmark/data.json
diff --git a/GenAIEval/evaluation/__init__.py b/evals/evaluation/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/__init__.py
rename to evals/evaluation/__init__.py
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/__init__.py b/evals/evaluation/bigcode_evaluation_harness/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/__init__.py
rename to evals/evaluation/bigcode_evaluation_harness/__init__.py
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/accuracy.py b/evals/evaluation/bigcode_evaluation_harness/accuracy.py
similarity index 100%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/accuracy.py
rename to evals/evaluation/bigcode_evaluation_harness/accuracy.py
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/arguments.py b/evals/evaluation/bigcode_evaluation_harness/arguments.py
similarity index 100%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/arguments.py
rename to evals/evaluation/bigcode_evaluation_harness/arguments.py
diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py b/evals/evaluation/bigcode_evaluation_harness/examples/main.py
similarity index 90%
rename from GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py
rename to evals/evaluation/bigcode_evaluation_harness/examples/main.py
index 1b998c04..bef7f494 100644
--- a/GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py
+++ b/evals/evaluation/bigcode_evaluation_harness/examples/main.py
@@ -14,7 +14,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from GenAIEval.evaluation.bigcode_evaluation_harness import evaluate, setup_parser
+from evals.evaluation.bigcode_evaluation_harness import evaluate, setup_parser
 
 
 def main():
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/__init__.py b/evals/evaluation/lm_evaluation_harness/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/__init__.py
rename to evals/evaluation/lm_evaluation_harness/__init__.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/accuracy.py b/evals/evaluation/lm_evaluation_harness/accuracy.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/accuracy.py
rename to evals/evaluation/lm_evaluation_harness/accuracy.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/arguments.py b/evals/evaluation/lm_evaluation_harness/arguments.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/arguments.py
rename to evals/evaluation/lm_evaluation_harness/arguments.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/examples/main.py b/evals/evaluation/lm_evaluation_harness/examples/main.py
similarity index 90%
rename from GenAIEval/evaluation/lm_evaluation_harness/examples/main.py
rename to evals/evaluation/lm_evaluation_harness/examples/main.py
index 15b23d2a..ee61377e 100644
--- a/GenAIEval/evaluation/lm_evaluation_harness/examples/main.py
+++ b/evals/evaluation/lm_evaluation_harness/examples/main.py
@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from GenAIEval.evaluation.lm_evaluation_harness import evaluate, setup_parser
+from evals.evaluation.lm_evaluation_harness import evaluate, setup_parser
 
 
 def main():
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/__init__.py b/evals/evaluation/lm_evaluation_harness/lm_eval/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/lm_eval/__init__.py
rename to evals/evaluation/lm_evaluation_harness/lm_eval/__init__.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/evaluator.py b/evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
rename to evals/evaluation/lm_evaluation_harness/lm_eval/evaluator.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/__init__.py b/evals/evaluation/lm_evaluation_harness/lm_eval/models/__init__.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/__init__.py
rename to evals/evaluation/lm_evaluation_harness/lm_eval/models/__init__.py
diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py b/evals/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py
similarity index 100%
rename from GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py
rename to evals/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py
diff --git a/setup.py b/setup.py
index 56838559..8f51bda0 100644
--- a/setup.py
+++ b/setup.py
@@ -26,8 +26,8 @@ def parse_requirements(filename):
 
 
 setup(
-    name="GenAIEval",
-    version="0.0.0",
+    name="opea_eval",
+    version="0.6",
     author="Intel AISE AIPC Team",
     author_email="haihao.shen@intel.com, feng.tian@intel.com, chang1.wang@intel.com, kaokao.lv@intel.com",
     description="Evaluation and benchmark for Generative AI",
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 3439940c..cc3859dd 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1 +1,2 @@
+bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
 lm-eval==0.4.2
diff --git a/tests/test_bigcode_eval.py b/tests/test_bigcode_eval.py
index d57e8a51..09e3f139 100644
--- a/tests/test_bigcode_eval.py
+++ b/tests/test_bigcode_eval.py
@@ -19,7 +19,7 @@
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from GenAIEval.evaluation.bigcode_evaluation_harness import BigcodeEvalParser, evaluate
+from evals.evaluation.bigcode_evaluation_harness import BigcodeEvalParser, evaluate
 
 
 class TestLMEval(unittest.TestCase):
diff --git a/tests/test_lm_eval.py b/tests/test_lm_eval.py
index c5e49e14..1f8f4f63 100644
--- a/tests/test_lm_eval.py
+++ b/tests/test_lm_eval.py
@@ -19,7 +19,7 @@
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from GenAIEval.evaluation.lm_evaluation_harness import LMEvalParser, evaluate
+from evals.evaluation.lm_evaluation_harness import LMEvalParser, evaluate
 
 
 class TestLMEval(unittest.TestCase):