From 6131a3c316725173ca0d26dc2da80d9557e58173 Mon Sep 17 00:00:00 2001 From: "Lv, Kaokao" Date: Wed, 22 May 2024 17:19:35 +0800 Subject: [PATCH 1/7] add GenAI_HFLM class to support microservice. --- .../lm_eval/models/huggingface.py | 312 ++++++++++++++++++ 1 file changed, 312 insertions(+) diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py index 38f5d095..10554b23 100644 --- a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py +++ b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py @@ -36,6 +36,11 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, ) +from lm_eval.api.registry import register_model +from lm_eval.api.model import CacheHook +import requests as requests_obj +from requests.exceptions import RequestException +import json eval_logger = utils.eval_logger @@ -1217,3 +1222,310 @@ def _model_call(self, inps): logits = logits[:, :-padding_length, :] logits = logits.to(torch.float32) return logits + + +@register_model("genai-hf") +class GenAI_HFLM(HFLM): + AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + + def __init__( + self, + base_url=None, + logits_cache: bool = True, + tokenizer: Optional[str] = None, + revision: Optional[str] = "main", + batch_size: int = 1, + max_length: Optional[int] = None, + trust_remote_code: Optional[bool] = False, + use_fast_tokenizer: Optional[bool] = True, + add_bos_token: Optional[bool] = False, + prefix_token_id: Optional[int] = None, + **kwargs): + self.base_url = base_url + assert self.base_url, "must pass `base_url` to use GenaAI service!" + self._rank = 0 + self._world_size = 1 + + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast=use_fast_tokenizer, + ) + + self.logits_cache = logits_cache + # select (or create) a pad token to use + if self.tokenizer.pad_token: + pass + elif self.tokenizer.unk_token: + self.tokenizer.pad_token_id = self.tokenizer.unk_token_id + elif self.tokenizer.eos_token: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + else: + if getattr(self.config, "model_type", None) == "qwen": + # Qwen's trust_remote_code tokenizer does not allow for adding special tokens + self.tokenizer.pad_token = "<|endoftext|>" + elif ( + self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer" + or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer" + ): + # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) + # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer + # --- + # Note that the world tokenizer class name, might change in the future for the final huggingface merge + # https://github.com/huggingface/transformers/pull/26963 + assert self.tokenizer.pad_token_id == 0 + else: + self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + + # TODO: override this for Gemma + self.add_bos_token = add_bos_token + if "GemmaTokenizer" in self.tokenizer.__class__.__name__: + self.add_bos_token = True + eval_logger.info( + f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it." 
+ ) + + self._batch_size = int(batch_size) + self._max_length = max_length + self.custom_prefix_token_id = prefix_token_id + if prefix_token_id is not None: + eval_logger.info( + f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}" + ) + self.cache_hook = CacheHook(None) + self.headers = {"Content-Type": "application/json"} + + @property + def max_length(self) -> int: + if self._max_length: + return self._max_length + else: + return self._DEFAULT_MAX_LENGTH + + @property + def batch_size(self) -> int: + return self._batch_size + + def _loglikelihood_tokens( + self, + task_requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + override_bs: int = None, + ) -> List[Tuple[float, bool]]: + # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context + res = [] + + def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = req[1] + req[2] + return -len(toks), tuple(toks) + + def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key to group and lookup one-token continuations""" + # Use with group_by="contexts" (optional)" + # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. + # speeds up some multiple-choice tasks proportionally to the number of choices. + # groups requests by context+continuation[:-1] and infer on one request/group. 
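+            # note: req[-2] is context_enc and req[-1] is continuation_enc, so for a
+            # one-token continuation the key collapses to the context tokens alone,
+            # letting every answer choice that shares a context reuse one forward pass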
+ return req[-2] + req[-1][:-1] + + re_ord = Collator( + task_requests, + sort_fn=_collate, + group_by=None, + group_fn=_lookup_one_token_cont, + ) + + # automatic (variable) batch size detection for vectorization + # pull longest context sample from request + n_reordered_requests = len(re_ord) + batch_size = ( + self.batch_size + if self.batch_size != "auto" + else override_bs + if override_bs is not None + else 0 + ) + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" + and n_reordered_requests > 0 + and not override_bs + else None + ) + + chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn) + pbar = tqdm( + total=len(task_requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests", + ) + for chunk in chunks: + inps = [] + cont_toks_list = [] + inplens = [] + + conts = [] + encoder_attns = [] + + padding_len_inp = None + padding_len_cont = None + # because vectorizing is annoying, we first convert each (context, continuation) pair to padded + # tensors, then we pack them together into a batch, call the model, and then pick it all apart + # again because vectorizing is annoying + + for _, context_enc, continuation_enc in chunk: + # sanity check + assert len(context_enc) > 0 + assert len(continuation_enc) > 0 + assert len(continuation_enc) <= self.max_length + + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + # when too long to fit in context, truncate from the left + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1) :], + dtype=torch.long, + ) + (inplen,) = inp.shape + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + inp = torch.tensor( + (context_enc)[-self.max_length :], + dtype=torch.long, + ) + (inplen,) = inp.shape + + # build encoder attn masks + encoder_attns.append(torch.ones_like(inp)) + + cont = torch.tensor( + (continuation_enc)[-self.max_length :], + # TODO: left-shift these? + # TODO: our code assumes we never end up truncating conts for either model type + dtype=torch.long, + ) + (contlen,) = cont.shape + + conts.append(cont) + + padding_len_cont = ( + max(padding_len_cont, contlen) + if padding_len_cont is not None + else contlen + ) + + padding_len_inp = ( + max(padding_len_inp, inplen) + if padding_len_inp is not None + else inplen + ) + + inps.append(inp) # [1, inp_length] + cont_toks_list.append(continuation_enc) + inplens.append(inplen) + + # create encoder attn mask and batched conts, if seq2seq + call_kwargs = {} + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + batched_inps = pad_and_concat( + padding_len_inp, inps, padding_side="right" + ) # [batch, padding_len_inp] + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + # TODO: left-pad encoder inps and mask? 
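+                # note: GenAI_HFLM pins AUTO_MODEL_CLASS to AutoModelForCausalLM, so this
+                # seq2seq branch is kept only for parity with HFLM and is not expected to run here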
+ batched_inps = pad_and_concat( + padding_len_inp, inps + ) # [batch, padding_len_inp] + batched_conts = pad_and_concat( + padding_len_cont, conts + ) # [batch, padding_len_cont] + batched_encoder_mask = pad_and_concat( + padding_len_inp, encoder_attns + ) # [batch, padding_len_inp] + call_kwargs = { + "attn_mask": batched_encoder_mask, + "labels": batched_conts, + } + + data = { + "batched_inputs": batched_inps.tolist(), + } + try: + response = requests_obj.post( + f"{self.base_url}/v1/completions", + headers=self.headers, + data=json.dumps(data), + ) + response.raise_for_status() + response = response.json() + except RequestException as e: + logger.error(f"RequestException: {e}") + + for (request_str, ctx_tokens, _), greedy_tokens, logprobs, inplen, cont_toks in zip( + chunk, response["greedy_tokens"], response["logprobs"],inplens, cont_toks_list + ): + # Slice to original seq length + contlen = len(cont_toks) + # take only logits in the continuation + # (discard context toks if decoder-only ; discard right-padding) + # also discards + checks for "virtual tokens" in the causal LM's input window + # from prompt/prefix tuning tokens, if applicable + ctx_len = ( + inplen + (len(logprobs) - padding_len_inp) + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + else None + ) + cont_toks = torch.tensor( + cont_toks, dtype=torch.long + ).unsqueeze(0) # [1, seq] + greedy_tokens = torch.tensor( + self._select_cont_toks(greedy_tokens, contlen=contlen, inplen=ctx_len), + dtype=torch.long + ).unsqueeze(0) # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + cont_logprobs = self._select_cont_toks(logprobs, contlen=contlen, inplen=ctx_len) + + # Answer: (log prob, is-exact-match) + answer = (sum(cont_logprobs), bool(max_equal)) + + res.append(answer) + + self.cache_hook.add_partial("loglikelihood", request_str, answer) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + raise NotImplementedError( + "loglikelihood_rolling not yet supported for GenAI service" + ) + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + raise NotImplementedError("Not supported yet.") From d86fa599adf8741373a7a3ba52f46d4e44be13db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 09:21:29 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../lm_eval/models/huggingface.py | 103 +++++++----------- 1 file changed, 37 insertions(+), 66 deletions(-) diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py index 10554b23..30b7aebe 100644 --- a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py +++ b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py @@ -15,32 +15,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import copy +import json import os from datetime import timedelta from pathlib import Path from typing import List, Literal, Optional, Tuple, Union +import requests as requests_obj import torch import torch.nn.functional as F import transformers from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs, find_executable_batch_size from lm_eval import utils from lm_eval.api.instance import Instance -from lm_eval.api.model import TemplateLM +from lm_eval.api.model import CacheHook, TemplateLM +from lm_eval.api.registry import register_model from lm_eval.models.utils import Collator, clear_torch_cache, get_dtype, pad_and_concat, stop_sequences_criteria from packaging import version from peft import PeftModel from peft import __version__ as PEFT_VERSION +from requests.exceptions import RequestException from tqdm import tqdm from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, ) -from lm_eval.api.registry import register_model -from lm_eval.api.model import CacheHook -import requests as requests_obj -from requests.exceptions import RequestException -import json eval_logger = utils.eval_logger @@ -1240,18 +1239,19 @@ def __init__( use_fast_tokenizer: Optional[bool] = True, add_bos_token: Optional[bool] = False, prefix_token_id: Optional[int] = None, - **kwargs): + **kwargs, + ): self.base_url = base_url assert self.base_url, "must pass `base_url` to use GenaAI service!" self._rank = 0 self._world_size = 1 self.tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer, - revision=revision, - trust_remote_code=trust_remote_code, - use_fast=use_fast_tokenizer, - ) + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast=use_fast_tokenizer, + ) self.logits_cache = logits_cache # select (or create) a pad token to use @@ -1280,7 +1280,7 @@ def __init__( # TODO: override this for Gemma self.add_bos_token = add_bos_token - if "GemmaTokenizer" in self.tokenizer.__class__.__name__: + if "GemmaTokenizer" in self.tokenizer.__class__.__name__: self.add_bos_token = True eval_logger.info( f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it." 
@@ -1290,9 +1290,7 @@ def __init__( self._max_length = max_length self.custom_prefix_token_id = prefix_token_id if prefix_token_id is not None: - eval_logger.info( - f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}" - ) + eval_logger.info(f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}") self.cache_hook = CacheHook(None) self.headers = {"Content-Type": "application/json"} @@ -1317,7 +1315,7 @@ def _loglikelihood_tokens( res = [] def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): - """Defines the key for the sorted method""" + """Defines the key for the sorted method.""" # the negative sign on len(toks) sorts descending - this has a few advantages: # - time estimates will always be over not underestimates, which is more useful for planning # - to know the size of a batch when going through the list, you know the first one is always the batch @@ -1329,7 +1327,7 @@ def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): return -len(toks), tuple(toks) def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): - """Defines the key to group and lookup one-token continuations""" + """Defines the key to group and lookup one-token continuations.""" # Use with group_by="contexts" (optional)" # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. # speeds up some multiple-choice tasks proportionally to the number of choices. @@ -1346,18 +1344,10 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): # automatic (variable) batch size detection for vectorization # pull longest context sample from request n_reordered_requests = len(re_ord) - batch_size = ( - self.batch_size - if self.batch_size != "auto" - else override_bs - if override_bs is not None - else 0 - ) + batch_size = self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0 batch_fn = ( self._batch_scheduler - if self.batch_size == "auto" - and n_reordered_requests > 0 - and not override_bs + if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs else None ) @@ -1421,17 +1411,9 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): conts.append(cont) - padding_len_cont = ( - max(padding_len_cont, contlen) - if padding_len_cont is not None - else contlen - ) + padding_len_cont = max(padding_len_cont, contlen) if padding_len_cont is not None else contlen - padding_len_inp = ( - max(padding_len_inp, inplen) - if padding_len_inp is not None - else inplen - ) + padding_len_inp = max(padding_len_inp, inplen) if padding_len_inp is not None else inplen inps.append(inp) # [1, inp_length] cont_toks_list.append(continuation_enc) @@ -1440,41 +1422,33 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): # create encoder attn mask and batched conts, if seq2seq call_kwargs = {} if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - batched_inps = pad_and_concat( - padding_len_inp, inps, padding_side="right" - ) # [batch, padding_len_inp] + batched_inps = pad_and_concat(padding_len_inp, inps, padding_side="right") # [batch, padding_len_inp] elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: # TODO: left-pad encoder inps and mask? 
- batched_inps = pad_and_concat( - padding_len_inp, inps - ) # [batch, padding_len_inp] - batched_conts = pad_and_concat( - padding_len_cont, conts - ) # [batch, padding_len_cont] - batched_encoder_mask = pad_and_concat( - padding_len_inp, encoder_attns - ) # [batch, padding_len_inp] + batched_inps = pad_and_concat(padding_len_inp, inps) # [batch, padding_len_inp] + batched_conts = pad_and_concat(padding_len_cont, conts) # [batch, padding_len_cont] + batched_encoder_mask = pad_and_concat(padding_len_inp, encoder_attns) # [batch, padding_len_inp] call_kwargs = { "attn_mask": batched_encoder_mask, "labels": batched_conts, } data = { - "batched_inputs": batched_inps.tolist(), - } + "batched_inputs": batched_inps.tolist(), + } try: response = requests_obj.post( - f"{self.base_url}/v1/completions", - headers=self.headers, - data=json.dumps(data), - ) + f"{self.base_url}/v1/completions", + headers=self.headers, + data=json.dumps(data), + ) response.raise_for_status() response = response.json() except RequestException as e: logger.error(f"RequestException: {e}") for (request_str, ctx_tokens, _), greedy_tokens, logprobs, inplen, cont_toks in zip( - chunk, response["greedy_tokens"], response["logprobs"],inplens, cont_toks_list + chunk, response["greedy_tokens"], response["logprobs"], inplens, cont_toks_list ): # Slice to original seq length contlen = len(cont_toks) @@ -1487,13 +1461,12 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM else None ) - cont_toks = torch.tensor( - cont_toks, dtype=torch.long - ).unsqueeze(0) # [1, seq] + cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq] greedy_tokens = torch.tensor( - self._select_cont_toks(greedy_tokens, contlen=contlen, inplen=ctx_len), - dtype=torch.long - ).unsqueeze(0) # [1, seq] + self._select_cont_toks(greedy_tokens, contlen=contlen, inplen=ctx_len), dtype=torch.long + ).unsqueeze( + 0 + ) # [1, seq] max_equal = (greedy_tokens == cont_toks).all() cont_logprobs = self._select_cont_toks(logprobs, contlen=contlen, inplen=ctx_len) @@ -1523,9 +1496,7 @@ def device(self): raise NotImplementedError() def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): - raise NotImplementedError( - "loglikelihood_rolling not yet supported for GenAI service" - ) + raise NotImplementedError("loglikelihood_rolling not yet supported for GenAI service") def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: raise NotImplementedError("Not supported yet.") From 2278bdd28f06477784e01c9bcc35a8580cffb1a1 Mon Sep 17 00:00:00 2001 From: "Lv, Kaokao" Date: Wed, 22 May 2024 18:39:14 +0800 Subject: [PATCH 3/7] fix typo. 
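
The "typo" is an undefined name: the exception handler added in PATCH 1 calls
`logger.error(f"RequestException: {e}")`, but this module only defines
`eval_logger` (utils.eval_logger), so a failed request to the GenAI service
would raise a NameError instead of logging the RequestException.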
--- .../lm_evaluation_harness/lm_eval/models/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py index 10554b23..2da11d89 100644 --- a/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py +++ b/GenAIEval/evaluation/lm_evaluation_harness/lm_eval/models/huggingface.py @@ -1471,7 +1471,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): response.raise_for_status() response = response.json() except RequestException as e: - logger.error(f"RequestException: {e}") + eval_logger.error(f"RequestException: {e}") for (request_str, ctx_tokens, _), greedy_tokens, logprobs, inplen, cont_toks in zip( chunk, response["greedy_tokens"], response["logprobs"],inplens, cont_toks_list From d997e45dd64842cbc777732287942c8f22bc18ec Mon Sep 17 00:00:00 2001 From: VincyZhang Date: Wed, 22 May 2024 19:53:16 +0800 Subject: [PATCH 4/7] Update model_test_cpu.yml --- .github/workflows/model_test_cpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model_test_cpu.yml b/.github/workflows/model_test_cpu.yml index a687d7ff..ed70411d 100644 --- a/.github/workflows/model_test_cpu.yml +++ b/.github/workflows/model_test_cpu.yml @@ -131,7 +131,7 @@ jobs: id: download-artifact uses: dawidd6/action-download-artifact@v3.1.2 with: - workflow: model-test.yml + workflow: model_test_cpu.yml name: FinalReport run_id: ${{ vars.ModelTest_CPU_REF_ID }} path: ${{ env.OUT_SCRIPT_PATH }} From a7aa198997b91f2cc779517b07d2c618a97edb4f Mon Sep 17 00:00:00 2001 From: VincyZhang Date: Wed, 22 May 2024 19:54:04 +0800 Subject: [PATCH 5/7] Update model_test_hpu.yml --- .github/workflows/model_test_hpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model_test_hpu.yml b/.github/workflows/model_test_hpu.yml index 82a10f79..1e6f2316 100644 --- a/.github/workflows/model_test_hpu.yml +++ b/.github/workflows/model_test_hpu.yml @@ -119,7 +119,7 @@ jobs: id: download-artifact uses: dawidd6/action-download-artifact@v3.1.2 with: - workflow: model-test.yml + workflow: model_test_hpu.yml name: FinalReport run_id: ${{ vars.ModelTest_HPU_REF_ID }} path: ${{ env.OUT_SCRIPT_PATH }} From fcb224a76130d9ffc0c6767c0c60499ebaaf2372 Mon Sep 17 00:00:00 2001 From: VincyZhang Date: Wed, 22 May 2024 20:16:41 +0800 Subject: [PATCH 6/7] Update unittest.yml --- .github/workflows/unittest.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 261b775f..a8403784 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -62,7 +62,7 @@ jobs: - name: Docker Build run: | - docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . + docker build --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
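+          # assumption: env.HTTP_PROXY_IMAGE_BUILD / env.HTTPS_PROXY_IMAGE_BUILD are supplied by
+          # the self-hosted runner environment (e.g. the ~/actions-runner4/.env loaded in PATCH 7/7);
+          # passing them as build args lets the image build reach the network behind a corporate proxy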
- name: Docker Run run: | @@ -71,6 +71,7 @@ jobs: docker rm -vf ${{ env.CONTAINER_NAME }} || true fi docker run -dit --memory="4g" --memory-reservation="1g" --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} --shm-size="1g" \ + -e http_proxy="${{ env.HTTP_PROXY_CONTAINER_RUN }}" -e https_proxy="${{ env.HTTPS_PROXY_CONTAINER_RUN }}" \ -v ${{ github.workspace }}:/GenAIEval ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} - name: Install Dependencies From df124ccf2e26a77d4b1b725ec1de079f4ed114c9 Mon Sep 17 00:00:00 2001 From: VincyZhang Date: Wed, 22 May 2024 20:22:59 +0800 Subject: [PATCH 7/7] Update unittest.yml --- .github/workflows/unittest.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index a8403784..af5770c9 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -52,7 +52,9 @@ jobs: steps: - name: Clean Up Working Directory run: sudo rm -rf ${{github.workspace}}/* - + - name: Load environment variables + run: + cat ~/actions-runner4/.env >> $GITHUB_ENV - name: Checkout out Repo uses: actions/checkout@v4 with: