diff --git a/README.md b/README.md index 76cc5d9..63149f4 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,10 @@ Install Package (python>=3.9) ```bash pip install -r requirements.txt ``` +or for ToolEval only +```bash +pip install -r toolbench/tooleval/requirements.txt +``` Prepare the data and tool environment: ```bash diff --git a/README_ZH.md b/README_ZH.md index ddee9db..f8bf5a3 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -123,6 +123,10 @@ cd ToolBench ```bash pip install -r requirements.txt ``` +或者仅安装ToolEval需要的包 +```bash +pip install -r toolbench/tooleval/requirements.txt +``` 准备数据和工具环境: ```bash diff --git a/toolbench/tooleval/README.md b/toolbench/tooleval/README.md index c2ba874..b852ef6 100644 --- a/toolbench/tooleval/README.md +++ b/toolbench/tooleval/README.md @@ -11,6 +11,11 @@ Our automatic evaluator, developed using ChatGPT, demonstrates a significant cor We also obtain the agreement among different human annotators **83.54%**, and the agreement between humans and our evaluator **80.21%**. ## 🚀Usage +### Install +Install Package (python>=3.9) +```bash +pip install -r requirements.txt +``` ### Reproduce our Results diff --git a/toolbench/tooleval/README_ZH.md b/toolbench/tooleval/README_ZH.md index 47918f0..283e192 100644 --- a/toolbench/tooleval/README_ZH.md +++ b/toolbench/tooleval/README_ZH.md @@ -14,6 +14,12 @@ ## 🚀用法 +### 安装 +安装包,要求(python>=3.9) +```bash +pip install -r requirements.txt +``` + ### 复现结果 要在测试集(如G1-Inst.)上评估模型,可以执行以下命令: diff --git a/toolbench/tooleval/evaluators/evaluator.py b/toolbench/tooleval/evaluators/evaluator.py index f842a6b..9a2de23 100644 --- a/toolbench/tooleval/evaluators/evaluator.py +++ b/toolbench/tooleval/evaluators/evaluator.py @@ -4,23 +4,8 @@ import math from typing import List,Union,Dict,Any from copy import deepcopy -from tenacity import retry, wait_random_exponential, stop_after_attempt -import numpy as np -import os +from .utils import OpenaiPoolRequest -class OpenaiPoolRequest: - def __init__(self, pool_json_file): - self.api_key = os.environ.get('OPENAI_KEY') - - @retry(wait=wait_random_exponential(multiplier=1, max=30), stop=stop_after_attempt(10),reraise=True) - def request(self,messages,**kwargs): - import openai - kwargs['api_key'] = self.api_key - return openai.ChatCompletion.create(messages=messages,**kwargs) - - def __call__(self,messages,**kwargs): - return self.request(messages,**kwargs) - def process_answer(answer:Dict): answer['final_answer'] = answer['final_answer'][:1000] answer['answer_details'] = answer['answer_details'][:3000] @@ -31,6 +16,7 @@ def process_tools(tools:List[Dict]): tool.pop('description',None) tool.pop('parameters',None) return tools + class AutomaticEvaluator: def __init__(self,eval_config,template): self.eval_config = eval_config diff --git a/toolbench/tooleval/evaluators/utils.py b/toolbench/tooleval/evaluators/utils.py new file mode 100644 index 0000000..f566bc6 --- /dev/null +++ b/toolbench/tooleval/evaluators/utils.py @@ -0,0 +1,17 @@ +from tenacity import retry, wait_random_exponential, stop_after_attempt +import os + + +class OpenaiPoolRequest: + def __init__(self, pool_json_file): + self.api_key = os.environ.get('OPENAI_KEY') + + @retry(wait=wait_random_exponential(multiplier=1, max=30), stop=stop_after_attempt(10),reraise=True) + def request(self,messages,**kwargs): + import openai + kwargs['api_key'] = self.api_key + return openai.ChatCompletion.create(messages=messages,**kwargs) + + def __call__(self,messages,**kwargs): + return self.request(messages,**kwargs) + diff --git a/toolbench/tooleval/requirements.txt b/toolbench/tooleval/requirements.txt new file mode 100644 index 0000000..e4aa9c5 --- /dev/null +++ b/toolbench/tooleval/requirements.txt @@ -0,0 +1,6 @@ +tqdm +numpy +pandas +pydantic +tenacity +openai \ No newline at end of file