llm server for lm-eval (#82)
lkk12014402 authored May 30, 2024
1 parent ed0a35c commit ed311f5
Showing 4 changed files with 148 additions and 0 deletions.
26 changes: 26 additions & 0 deletions comps/llms/lm-eval/Dockerfile.cpu
@@ -0,0 +1,26 @@
ARG UBUNTU_VER=22.04
FROM ubuntu:${UBUNTU_VER} as devel

ARG REPO_COMPS=https://github.com/opea-project/GenAIComps.git
ARG BRANCH=main
ENV LANG C.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
    aspell \
    aspell-en \
    build-essential \
    python3 \
    python3-pip \
    python3-dev \
    python3-distutils \
    git \
    vim \
    wget

RUN git clone --single-branch --branch=${BRANCH} ${REPO_COMPS} /home/user/GenAIComps/ && \
    cd /home/user/GenAIComps/ && python3 setup.py install && \
    pip install --no-cache-dir -r /home/user/GenAIComps/comps/llms/lm-eval/requirements.txt

WORKDIR /home/user/GenAIComps/comps/llms/lm-eval/

ENTRYPOINT ["python3", "self_hosted_hf.py"]
39 changes: 39 additions & 0 deletions comps/llms/lm-eval/README.md
@@ -0,0 +1,39 @@
# LM-Eval Microservice

This microservice is designed for [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness): it hosts a separate LLM server against which `lm-eval` tasks can be evaluated.

## CPU service

### build the CPU docker image

```
docker build -f Dockerfile.cpu -t opea/lm-eval:latest .
```

### start the server

- set the environment variables `MODEL`, `MODEL_ARGS`, and `DEVICE`, then start the server (a quick endpoint check is sketched after the command)

```
docker run -p 9006:9006 --ipc=host -e MODEL="hf" -e MODEL_ARGS="pretrained=Intel/neural-chat-7b-v3-3" -e DEVICE="cpu" opea/lm-eval:latest
```
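
Once the container is up, you can sanity-check the endpoint by POSTing pre-tokenized inputs straight to `/v1/completions`; the request and response fields follow `LLMCompletionDoc` and the return value defined in `self_hosted_hf.py`. A minimal sketch, assuming `requests` and `transformers` are installed on the client and the container is reachable at `localhost:9006`:

```
import requests
from transformers import AutoTokenizer

# Tokenize a prompt locally; the service expects token ids, not raw text.
tokenizer = AutoTokenizer.from_pretrained("Intel/neural-chat-7b-v3-3")
token_ids = tokenizer("Deep learning is")["input_ids"]

# batched_inputs is a batch of token-id lists (a single prompt here).
payload = {"batched_inputs": [token_ids], "logprobs": 10, "max_tokens": 0, "temperature": 0.0}
resp = requests.post("http://localhost:9006/v1/completions", json=payload, timeout=120)
resp.raise_for_status()

result = resp.json()
print(result["greedy_tokens"])  # per-position argmax tokens
print(result["logprobs"])       # log-probability of each next input token
```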

### evaluate the model

- set `base_url` and `tokenizer`

```
git clone https://github.com/opea-project/GenAIEval
cd GenAIEval
pip install -e .
cd GenAIEval/evaluation/lm_evaluation_harness/examples
python main.py \
    --model genai-hf \
    --model_args "base_url=http://{your_ip}:9006,tokenizer=Intel/neural-chat-7b-v3-3" \
    --tasks "lambada_openai" \
    --batch_size 2
```
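
Under the hood, the `genai-hf` model adapter talks to the `/v1/completions` endpoint at `base_url` and, judging from the service's response schema, derives its task metrics from the returned `logprobs` and `greedy_tokens`. As a rough illustration of how those fields map to a loglikelihood score (field names follow `self_hosted_hf.py`; the helper itself is hypothetical and not part of GenAIEval):

```
# Hypothetical helper: turn one /v1/completions response for a single prompt into
# the quantities a loglikelihood task needs -- continuation log-likelihood and a
# greedy-match flag. ctx_len is the number of context tokens before the continuation.
def score_continuation(result, ctx_len):
    logprobs = result["logprobs"][0]           # log P(token[i+1] | tokens[:i+1])
    greedy = result["greedy_tokens"][0]        # argmax token at each position
    targets = result["batched_inputs"][0][1:]  # shifted ground-truth tokens

    cont_logprob = sum(logprobs[ctx_len - 1:])
    is_greedy = greedy[ctx_len - 1:len(targets)] == targets[ctx_len - 1:]
    return cont_logprob, is_greedy
```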
4 changes: 4 additions & 0 deletions comps/llms/lm-eval/requirements.txt
@@ -0,0 +1,4 @@
git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
git+https://github.com/opea-project/GenAIEval.git
lm-eval==0.4.2
pydantic==2.7.2
79 changes: 79 additions & 0 deletions comps/llms/lm-eval/self_hosted_hf.py
@@ -0,0 +1,79 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
from typing import List

import lm_eval.api.registry
import torch
from docarray import BaseDoc
from GenAIEval.evaluation.lm_evaluation_harness.lm_eval.models.huggingface import HFLM, GaudiHFModelAdapter

from comps import ServiceType, opea_microservices, opea_telemetry, register_microservice

lm_eval.api.registry.MODEL_REGISTRY["hf"] = HFLM
lm_eval.api.registry.MODEL_REGISTRY["gaudi-hf"] = GaudiHFModelAdapter


# Request schema for /v1/completions: a batch of pre-tokenized prompts plus scoring parameters.
class LLMCompletionDoc(BaseDoc):
    batched_inputs: List
    logprobs: int = 10
    max_tokens: int = 0
    temperature: float = 0.0


model = os.getenv("MODEL", "")
model_args = os.getenv("MODEL_ARGS", "")
device = os.getenv("DEVICE", "")

# Build the lm-eval model backend ("hf" or "gaudi-hf") from the MODEL_ARGS string.
# Batching is driven by the client, so batch_size here is only a placeholder.
llm = lm_eval.api.registry.get_model(model).create_from_arg_string(
    model_args,
    {
        "batch_size": 1,  # dummy
        "max_batch_size": None,
        "device": device,
    },
)


@register_microservice(
    name="opea_service@self_hosted_hf",
    service_type=ServiceType.LLM,
    endpoint="/v1/completions",
    host="0.0.0.0",
    port=9006,
)
@opea_telemetry
def llm_generate(input: LLMCompletionDoc):
    global llm
    batched_inputs = torch.tensor(input.batched_inputs, dtype=torch.long, device=llm.device)
    with torch.no_grad():
        # TODO, use model.generate.
        logits = llm._model_call(batched_inputs)

    logits = torch.nn.functional.log_softmax(logits, dim=-1)
    # Check if per-token argmax is exactly equal to continuation
    greedy_tokens = logits.argmax(dim=-1)
    # Log-probability assigned to each target (next) token in the input
    logprobs = torch.gather(logits, 2, batched_inputs[:, 1:].unsqueeze(-1)).squeeze(-1)

    return {
        "greedy_tokens": greedy_tokens.detach().cpu().tolist(),
        "logprobs": logprobs.detach().cpu().tolist(),
        "batched_inputs": input.batched_inputs,
    }


if __name__ == "__main__":
    opea_microservices["opea_service@self_hosted_hf"].start()
