opea-project · chensuyue · May 30, 2024 · May 30, 2024 · May 30, 2024 · May 30, 2024
@@ -0,0 +1,26 @@
+ARG UBUNTU_VER=22.04
+# The stage name stays "devel" so builds that use `--target devel` keep working.
+FROM ubuntu:${UBUNTU_VER} AS devel
+
+# Build-time knobs: which GenAIComps repository and branch to install from.
+ARG REPO_COMPS=https://github.com/opea-project/GenAIComps.git
+ARG BRANCH=main
+ENV LANG=C.UTF-8
+
+# Install OS packages. DEBIAN_FRONTEND is set inline so it applies to this
+# command only and is not baked into the runtime environment. The apt package
+# lists are removed in the same layer so they do not remain in the image.
+# ca-certificates is listed explicitly because the HTTPS git clone below needs it.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --fix-missing \
+    aspell \
+    aspell-en \
+    build-essential \
+    ca-certificates \
+    git \
+    python3 \
+    python3-dev \
+    python3-distutils \
+    python3-pip \
+    vim \
+    wget && \
+    rm -rf /var/lib/apt/lists/*
+
+# Fetch the requested branch of GenAIComps.
+RUN git clone --single-branch --branch="${BRANCH}" "${REPO_COMPS}" /home/user/GenAIComps/
+
+# Install GenAIComps itself, then the lm-eval microservice requirements.
+WORKDIR /home/user/GenAIComps
+RUN python3 setup.py install && \
+    pip install --no-cache-dir -r comps/llms/lm-eval/requirements.txt
+
+WORKDIR /home/user/GenAIComps/comps/llms/lm-eval/
+
+# Port the service listens on (documentation only; publish it with `docker run -p`).
+EXPOSE 9006
+
+ENTRYPOINT ["python3", "self_hosted_hf.py"]
@@ -0,0 +1,39 @@
+# LM-Eval Microservice
+
+This microservice is designed for [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness). It hosts a separate LLM server against which `lm-eval` tasks can be evaluated.
+
+## CPU service
+
+### build cpu docker
+
+```
+docker build -f Dockerfile.cpu -t comps:lm-eval .
+
+```
+
+### start the server
+
+- Set the environment variables `MODEL`, `MODEL_ARGS`, and `DEVICE`, then start the server:
+
+```
+docker run -p 9006:9006 --ipc=host  -e MODEL="hf" -e MODEL_ARGS="pretrained=Intel/neural-chat-7b-v3-3" -e DEVICE="cpu" comps:lm-eval
+```
+
+### evaluate the model
+
+- Set `base_url` and `tokenizer` in `--model_args`:
+
+```
+git clone https://github.com/opea-project/GenAIEval
+cd GenAIEval
+pip install -e .
+
+cd GenAIEval/evaluation/lm_evaluation_harness/examples
+
+python main.py \
+    --model genai-hf \
+    --model_args "base_url=http://{your_ip}:9006,tokenizer=Intel/neural-chat-7b-v3-3" \
+    --tasks  "lambada_openai" \
+    --batch_size 2
+
+```
@@ -0,0 +1,4 @@
+git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e
+git+https://github.com/opea-project/GenAIEval.git
+lm-eval==0.4.2
+pydantic==2.7.2
@@ -0,0 +1,79 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+from typing import List
+
+import lm_eval.api.registry
+import torch
+from docarray import BaseDoc
+from GenAIEval.evaluation.lm_evaluation_harness.lm_eval.models.huggingface import HFLM, GaudiHFModelAdapter
+
+from comps import ServiceType, opea_microservices, opea_telemetry, register_microservice
+
+lm_eval.api.registry.MODEL_REGISTRY["hf"] = HFLM
+lm_eval.api.registry.MODEL_REGISTRY["gaudi-hf"] = GaudiHFModelAdapter
+
+
+class LLMCompletionDoc(BaseDoc):
+    """Request payload accepted by the ``llm_generate`` endpoint in this file."""
+
+    # Converted to a ``torch.long`` tensor by ``llm_generate``; presumably a
+    # batch of equal-length token-id sequences -- TODO confirm against the client.
+    batched_inputs: List
+    # The three fields below are accepted in the request but are not read by
+    # ``llm_generate`` in this file.
+    logprobs: int = 10
+    max_tokens: int = 0
+    temperature: float = 0.0
+
+
+model = os.getenv("MODEL", "")
+model_args = os.getenv("MODEL_ARGS", "")
+device = os.getenv("DEVICE", "")
+
+llm = lm_eval.api.registry.get_model(model).create_from_arg_string(
+    model_args,
+    {
+        "batch_size": 1,  # dummy
+        "max_batch_size": None,
+        "device": device,
+    },
+)
+
+
+@register_microservice(
+    name="opea_service@self_hosted_hf",
+    service_type=ServiceType.LLM,
+    endpoint="/v1/completions",
+    host="0.0.0.0",
+    port=9006,
+)
+@opea_telemetry
+def llm_generate(input: LLMCompletionDoc):
+    global llm
+    batched_inputs = torch.tensor(input.batched_inputs, dtype=torch.long, device=llm.device)
+    with torch.no_grad():
+        # TODO, use model.generate.
+        logits = llm._model_call(batched_inputs)
+
+    logits = torch.nn.functional.log_softmax(logits, dim=-1)
+    # Check if per-token argmax is exactly equal to continuation
+    greedy_tokens = logits.argmax(dim=-1)
+    logprobs = torch.gather(logits, 2, batched_inputs[:, 1:].unsqueeze(-1)).squeeze(-1)
+
+    return {
+        "greedy_tokens": greedy_tokens.detach().cpu().tolist(),
+        "logprobs": logprobs.detach().cpu().tolist(),
+        "batched_inputs": input.batched_inputs,
+    }
+
+
+# Start the service registered above only when this file is run as a script.
+if __name__ == "__main__":
+    opea_microservices["opea_service@self_hosted_hf"].start()