evaluate #59

Workflow file for this run

.github/workflows/evaluate.yml at 19d44fc

	# Manually dispatched evaluation workflow. The inputs declared here are read
	# by the jobs further down via `github.event.inputs.*` / `inputs.*`.
	name: evaluate

	on:
	  workflow_dispatch:
	    inputs:
	      # Repository (org/name) and git ref of the code under test.
	      repo_org:
	        required: false
	        description: 'Tested repository organization name. Default is InternLM/lmdeploy'
	        type: string
	        default: 'InternLM/lmdeploy'
	      repo_ref:
	        required: false
	        description: 'Set branch or tag or commit id. Default is "main"'
	        type: string
	        default: 'main'
	      # Bracketed, comma-separated lists handed as-is to the evaluate script.
	      chat_models:
	        required: true
	        description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]'
	        type: string
	        # Folded scalar: the lines below are joined with single spaces, giving
	        # one "[a, b, ...]" string.
	        default: >-
	          [turbomind_internlm2_chat_7b,
	          pytorch_internlm2_chat_7b,
	          turbomind_internlm2_5_7b_chat,
	          pytorch_internlm2_5_7b_chat,
	          turbomind_internlm2_5_7b_chat_batch1,
	          turbomind_internlm2_5_20b_chat,
	          pytorch_internlm2_5_20b_chat,
	          turbomind_qwen1_5_7b_chat,
	          pytorch_qwen1_5_7b_chat,
	          turbomind_llama2_7b_chat,
	          turbomind_llama3_8b_instruct,
	          pytorch_llama3_8b_instruct,
	          turbomind_llama3_1_8b_instruct,
	          pytorch_llama3_1_8b_instruct,
	          turbomind_qwen2_7b_instruct,
	          pytorch_qwen2_7b_instruct,
	          pytorch_qwen1_5_moe_2_7b_chat,
	          pytorch_gemma_2_9b_it,
	          turbomind_internlm2_chat_7b_4bits,
	          turbomind_internlm2_chat_7b_kvint4,
	          turbomind_internlm2_chat_7b_kvint8,
	          turbomind_internlm2_5_7b_chat_4bits,
	          turbomind_internlm2_5_7b_chat_kvint4,
	          turbomind_internlm2_5_7b_chat_kvint8,
	          turbomind_internlm2_5_20b_chat_4bits,
	          turbomind_internlm2_5_20b_chat_kvint4,
	          turbomind_internlm2_5_20b_chat_kvint8,
	          turbomind_qwen1_5_7b_chat_4bits,
	          turbomind_qwen1_5_7b_chat_kvint4,
	          turbomind_qwen1_5_7b_chat_kvint8,
	          turbomind_llama2_7b_chat_4bits,
	          turbomind_llama2_7b_chat_kvint4,
	          turbomind_llama2_7b_chat_kvint8,
	          turbomind_llama3_8b_instruct_4bits,
	          turbomind_llama3_8b_instruct_kvint4,
	          turbomind_llama3_8b_instruct_kvint8,
	          turbomind_llama3_1_8b_instruct_4bits,
	          turbomind_llama3_1_8b_instruct_kvint4,
	          turbomind_llama3_1_8b_instruct_kvint8,
	          turbomind_qwen2_7b_instruct_4bits,
	          turbomind_qwen2_7b_instruct_kvint8]
	      chat_datasets:
	        required: true
	        description: 'Tested datasets list. eg. [bbh_datasets,ceval_datasets,cmmlu_datasets,GaokaoBench_datasets,gpqa_datasets,gsm8k_datasets,hellaswag_datasets,humaneval_datasets,ifeval_datasets,math_datasets,sanitized_mbpp_datasets,mmlu_datasets,nq_datasets,race_datasets,TheoremQA_datasets,triviaqa_datasets,winogrande_datasets,crowspairs_datasets]'
	        type: string
	        default: '[mmlu_datasets, gsm8k_datasets]'
	      base_models:
	        required: true
	        description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]'
	        type: string
	        default: '[turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]'
	      # NOTE: "baes" is a misspelling of "base". The name is kept because the
	      # steps in this file read `github.event.inputs.baes_datasets` and
	      # existing dispatch callers may pass the input under this name.
	      baes_datasets:
	        required: true
	        description: 'Tested datasets list. eg. [mmlu_datasets, gsm8k_datasets]'
	        type: string
	        default: '[mmlu_datasets, gsm8k_datasets]'
	      # Repository (org/name) and git ref of the opencompass checkout.
	      oc_repo_org:
	        required: false
	        description: 'Tested repository organization name. Default is open-compass/opencompass'
	        type: string
	        default: 'open-compass/opencompass'
	      oc_repo_ref:
	        required: false
	        description: 'Set branch or tag or commit id. Default is "main"'
	        type: string
	        default: 'main'
	      # When true, steps guarded by `!inputs.offline_mode` are skipped and the
	      # "- offline" steps run instead.
	      offline_mode:
	        required: true
	        description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
	        type: boolean
	        default: false

	# Workflow-wide environment, visible to every job and step.
	env:
	  # Opt-in flag read by the Actions runner; presumably what lets the actions
	  # that target older Node versions keep running here — confirm against the
	  # runner documentation. Quoted so the value is unambiguously a string.
	  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: 'true'

	jobs:
	  # Builds the wheel inside the manywheel docker builder and uploads it as a
	  # short-lived artifact. Skipped when offline_mode is true.
	  linux-build:
	    if: ${{ github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode) }}
	    strategy:
	      matrix:
	        pyver: [py310]
	    runs-on: ubuntu-latest
	    env:
	      PYTHON_VERSION: ${{ matrix.pyver }}
	      PLAT_NAME: manylinux2014_x86_64
	      DOCKER_TAG: cuda11.8
	      OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v3
	        with:
	          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
	          ref: ${{ github.event.inputs.repo_ref || 'main' }}
	      - name: Build
	        run: |
	          echo "${PYTHON_VERSION}"
	          echo "${PLAT_NAME}"
	          echo "${DOCKER_TAG}"
	          echo "${OUTPUT_FOLDER}"
	          echo "${GITHUB_RUN_ID}"
	          # Strip the interactive/TTY flags (-it) from the docker run command
	          # in the build script before invoking it.
	          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
	          bash builder/manywheel/build_wheel.sh "${PYTHON_VERSION}" "${PLAT_NAME}" "${DOCKER_TAG}" "${OUTPUT_FOLDER}"
	      - name: Upload Artifacts
	        uses: actions/upload-artifact@v4
	        with:
	          if-no-files-found: error
	          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
	          retention-days: 1
	          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}

	  # Runs the evaluation on a self-hosted GPU runner, once per matrix entry
	  # ('chat' and 'base'). `!cancelled()` lets the job run even when
	  # linux-build was skipped (offline mode).
	  evaluate:
	    needs: linux-build
	    if: ${{ github.event_name == 'schedule' || !cancelled() }}
	    runs-on: [self-hosted, linux-a100]
	    timeout-minutes: 4320  # 72 hours
	    strategy:
	      fail-fast: false
	      matrix:
	        evaluate_type: ['chat', 'base']
	    container:
	      image: openmmlab/lmdeploy:latest-cu11
	      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
	      volumes:
	        - /nvme/github-actions/pip-cache:/root/.cache/pip
	        - /nvme/github-actions/packages:/root/packages
	        - /nvme/github-actions/resources:/root/resources
	        - /nvme/github-actions/opencompass-data:/root/opencompass-data
	        - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports
	        - /nvme/qa_test_models:/root/models
	        - /mnt/shared:/mnt/shared
	        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
	    steps:
	      - name: Setup systems
	        run: |
	          export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')"
	          echo "TIME_STAMP=$TIME_STAMP" >> "$GITHUB_ENV"
	      - name: Clone repository
	        uses: actions/checkout@v2
	        if: ${{ github.event_name == 'schedule' || !inputs.offline_mode }}
	        with:
	          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
	          ref: ${{ github.event.inputs.repo_ref || 'main' }}
	      - name: Copy repository - offline
	        if: ${{ inputs.offline_mode }}
	        run: cp -r /root/models/offline_pkg/lmdeploy/. .
	      - name: Download Artifacts
	        if: ${{ github.event_name == 'schedule' || !inputs.offline_mode }}
	        uses: actions/download-artifact@v4
	        with:
	          # Must match the artifact name uploaded by linux-build for pyver py310.
	          name: my-artifact-${{ github.run_id }}-py310
	      - name: Install lmdeploy - dependency
	        run: |
	          # Manually install flash-attn from a pre-downloaded wheel.
	          # Wheels come from https://github.com/Dao-AILab/flash-attention/releases
	          python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
	          python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps
	          python3 -m pip install -r /root/models/offline_pkg/requirements.txt
	      - name: Install lmdeploy
	        if: ${{ github.event_name == 'schedule' || !inputs.offline_mode }}
	        run: |
	          python3 -m pip install lmdeploy-*.whl --no-deps
	          python3 -m pip install -r requirements/test.txt
	      - name: Install lmdeploy - offline
	        if: ${{ inputs.offline_mode }}
	        run: |
	          python3 -m pip install /root/models/offline_pkg/py310/lmdeploy-*.whl --no-deps
	          python3 -m pip install -r requirements/test.txt
	      - name: Install opencompass
	        # Inputs go through env instead of being interpolated into the script
	        # text, so their content is never parsed as shell code. The fallbacks
	        # mirror the ones used by the checkout steps.
	        env:
	          OC_REPO: ${{ github.event.inputs.oc_repo_org || 'open-compass/opencompass' }}
	          OC_REF: ${{ github.event.inputs.oc_repo_ref || 'main' }}
	        run: |
	          # Explicit target directory: the following `cd` must not depend on
	          # the name of the repository being cloned.
	          git clone "https://github.com/${OC_REPO}.git" opencompass
	          cd opencompass
	          git checkout "${OC_REF}"
	          python3 -m pip install -e .
	          echo "OPENCOMPASS_DIR=$(pwd)" >> "$GITHUB_ENV"
	      - name: Check env
	        run: |
	          pip uninstall -y nvidia-nccl-cu11
	          python3 -m pip list
	          lmdeploy check_env
	      - name: Setup paths for evaluation
	        run: |
	          ln -s /root/opencompass-data ./data
	          python3 .github/scripts/action_tools.py create_model_links /root/models .
	      - name: Evaluate chat models
	        if: matrix.evaluate_type == 'chat'
	        # Lists are passed via env and quoted: an unquoted "[a, b]" would be
	        # subject to word splitting and glob expansion by the shell.
	        env:
	          EVAL_MODELS: ${{ github.event.inputs.chat_models }}
	          EVAL_DATASETS: ${{ github.event.inputs.chat_datasets }}
	        run: |
	          echo "${EVAL_MODELS}"
	          echo "${EVAL_DATASETS}"
	          export LMDEPLOY_DIR=$(pwd)

	          python3 .github/scripts/action_tools.py evaluate "${EVAL_MODELS}" "${EVAL_DATASETS}" /root/evaluation-reports/${{ github.run_id }} chat
	      - name: Evaluate base models
	        if: matrix.evaluate_type == 'base'
	        # `baes_datasets` is the (misspelled) input name declared under
	        # on.workflow_dispatch.inputs; it must match that declaration.
	        env:
	          EVAL_MODELS: ${{ github.event.inputs.base_models }}
	          EVAL_DATASETS: ${{ github.event.inputs.baes_datasets }}
	        run: |
	          echo "${EVAL_MODELS}"
	          echo "${EVAL_DATASETS}"
	          export LMDEPLOY_DIR=$(pwd)

	          python3 .github/scripts/action_tools.py evaluate "${EVAL_MODELS}" "${EVAL_DATASETS}" /root/evaluation-reports/${{ github.run_id }} base
	      - name: Clear workspace
	        # Runs even after failures: recreates the workspace directory empty.
	        if: always()
	        run: |
	          export workdir=$(pwd)
	          cd ..
	          rm -rf "$workdir"
	          mkdir "$workdir"
	          chmod -R 777 "$workdir"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

evaluate #59

Workflow file

evaluate #59

Jobs

Run details

Workflow file for this run