From ed311f541a69257e8e73f71286fc56253d182104 Mon Sep 17 00:00:00 2001
From: lkk <33276950+lkk12014402@users.noreply.github.com>
Date: Fri, 31 May 2024 00:46:10 +0800
Subject: [PATCH] llm server for lm-eval (#82)

---
 comps/llms/lm-eval/Dockerfile.cpu    | 26 +++++++++
 comps/llms/lm-eval/README.md         | 39 ++++++++++++++
 comps/llms/lm-eval/requirements.txt  |  4 ++
 comps/llms/lm-eval/self_hosted_hf.py | 79 ++++++++++++++++++++++++++++
 4 files changed, 148 insertions(+)
 create mode 100644 comps/llms/lm-eval/Dockerfile.cpu
 create mode 100644 comps/llms/lm-eval/README.md
 create mode 100644 comps/llms/lm-eval/requirements.txt
 create mode 100644 comps/llms/lm-eval/self_hosted_hf.py

diff --git a/comps/llms/lm-eval/Dockerfile.cpu b/comps/llms/lm-eval/Dockerfile.cpu
new file mode 100644
index 000000000..72d6a555e
--- /dev/null
+++ b/comps/llms/lm-eval/Dockerfile.cpu
@@ -0,0 +1,26 @@
+ARG UBUNTU_VER=22.04
+FROM ubuntu:${UBUNTU_VER} as devel
+
+ARG REPO_COMPS=https://github.com/opea-project/GenAIComps.git
+ARG BRANCH=main
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+    aspell \
+    aspell-en \
+    build-essential \
+    python3 \
+    python3-pip \
+    python3-dev \
+    python3-distutils \
+    git \
+    vim \
+    wget
+
+RUN git clone --single-branch --branch=${BRANCH} ${REPO_COMPS} /home/user/GenAIComps/ && \
+    cd /home/user/GenAIComps/ && python3 setup.py install && \
+    pip install --no-cache-dir -r /home/user/GenAIComps/comps/llms/lm-eval/requirements.txt
+
+WORKDIR /home/user/GenAIComps/comps/llms/lm-eval/
+
+ENTRYPOINT ["python3", "self_hosted_hf.py"]
diff --git a/comps/llms/lm-eval/README.md b/comps/llms/lm-eval/README.md
new file mode 100644
index 000000000..f0097f3b6
--- /dev/null
+++ b/comps/llms/lm-eval/README.md
@@ -0,0 +1,39 @@
+# LM-Eval Microservice
+
+This microservice, designed for [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness), hosts a separate LLM server for evaluating `lm-eval` tasks.
+
+## CPU service
+
+### Build the CPU Docker image
+
+```
+docker build -f Dockerfile.cpu -t opea/lm-eval:latest .
+
+```
+
+### Start the server
+
+- Set the environment variables `MODEL`, `MODEL_ARGS`, and `DEVICE`, then start the server:
+
+```
+docker run -p 9006:9006 --ipc=host -e MODEL="hf" -e MODEL_ARGS="pretrained=Intel/neural-chat-7b-v3-3" -e DEVICE="cpu" opea/lm-eval:latest
+```
+
+### Evaluate the model
+
+- Set `base_url` and `tokenizer`, then run the evaluation:
+
+```
+git clone https://github.com/opea-project/GenAIEval
+cd GenAIEval
+pip install -e .
+
+cd GenAIEval/evaluation/lm_evaluation_harness/examples
+
+python main.py \
+    --model genai-hf \
+    --model_args "base_url=http://{your_ip}:9006,tokenizer=Intel/neural-chat-7b-v3-3" \
+    --tasks "lambada_openai" \
+    --batch_size 2
+
+```
diff --git a/comps/llms/lm-eval/requirements.txt b/comps/llms/lm-eval/requirements.txt
new file mode 100644
index 000000000..43d6f7b79
--- /dev/null
+++ b/comps/llms/lm-eval/requirements.txt
@@ -0,0 +1,4 @@
+git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
+git+https://github.com/opea-project/GenAIEval.git
+lm-eval==0.4.2
+pydantic==2.7.2
diff --git a/comps/llms/lm-eval/self_hosted_hf.py b/comps/llms/lm-eval/self_hosted_hf.py
new file mode 100644
index 000000000..d5ba45775
--- /dev/null
+++ b/comps/llms/lm-eval/self_hosted_hf.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+from typing import List
+
+import lm_eval.api.registry
+import torch
+from docarray import BaseDoc
+from GenAIEval.evaluation.lm_evaluation_harness.lm_eval.models.huggingface import HFLM, GaudiHFModelAdapter
+
+from comps import ServiceType, opea_microservices, opea_telemetry, register_microservice
+
+lm_eval.api.registry.MODEL_REGISTRY["hf"] = HFLM
+lm_eval.api.registry.MODEL_REGISTRY["gaudi-hf"] = GaudiHFModelAdapter
+
+
+class LLMCompletionDoc(BaseDoc):
+    batched_inputs: List
+    logprobs: int = 10
+    max_tokens: int = 0
+    temperature: float = 0.0
+
+
+model = os.getenv("MODEL", "")
+model_args = os.getenv("MODEL_ARGS", "")
+device = os.getenv("DEVICE", "")
+
+llm = lm_eval.api.registry.get_model(model).create_from_arg_string(
+    model_args,
+    {
+        "batch_size": 1,  # dummy
+        "max_batch_size": None,
+        "device": device,
+    },
+)
+
+
+@register_microservice(
+    name="opea_service@self_hosted_hf",
+    service_type=ServiceType.LLM,
+    endpoint="/v1/completions",
+    host="0.0.0.0",
+    port=9006,
+)
+@opea_telemetry
+def llm_generate(input: LLMCompletionDoc):
+    global llm
+    batched_inputs = torch.tensor(input.batched_inputs, dtype=torch.long, device=llm.device)
+    with torch.no_grad():
+        # TODO, use model.generate.
+        logits = llm._model_call(batched_inputs)
+
+    logits = torch.nn.functional.log_softmax(logits, dim=-1)
+    # Check if per-token argmax is exactly equal to continuation
+    greedy_tokens = logits.argmax(dim=-1)
+    logprobs = torch.gather(logits, 2, batched_inputs[:, 1:].unsqueeze(-1)).squeeze(-1)
+
+    return {
+        "greedy_tokens": greedy_tokens.detach().cpu().tolist(),
+        "logprobs": logprobs.detach().cpu().tolist(),
+        "batched_inputs": input.batched_inputs,
+    }
+
+
+if __name__ == "__main__":
+    opea_microservices["opea_service@self_hosted_hf"].start()
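
Note: the `/v1/completions` endpoint added in this patch expects pre-tokenized inputs (see `LLMCompletionDoc.batched_inputs`); the `genai-hf` model in GenAIEval performs that tokenization before calling the service. For a quick smoke test of the server outside the harness, a minimal sketch follows. It assumes the container from the README is running on `localhost:9006` and that `requests` and `transformers` are installed on the client side; neither package, nor this script, is part of the patch itself.

```
import requests
from transformers import AutoTokenizer

# Tokenize a prompt with the served model's tokenizer; the service only
# consumes token ids (LLMCompletionDoc.batched_inputs), not raw text.
tokenizer = AutoTokenizer.from_pretrained("Intel/neural-chat-7b-v3-3")
input_ids = tokenizer("The capital of France is")["input_ids"]

# Fields mirror LLMCompletionDoc; a single-sequence batch avoids padding.
payload = {
    "batched_inputs": [input_ids],
    "logprobs": 10,
    "max_tokens": 0,
    "temperature": 0.0,
}

resp = requests.post("http://localhost:9006/v1/completions", json=payload, timeout=300)
resp.raise_for_status()
out = resp.json()

# llm_generate() returns per-position greedy token ids plus the
# log-probabilities of the actual next tokens, which lm-eval consumes
# for loglikelihood-style tasks such as lambada_openai.
print(out["greedy_tokens"])
print(out["logprobs"])
```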