From c7e5c1c3be4aafc6111a4db50da6d76f814c4f97 Mon Sep 17 00:00:00 2001
From: Weize Chen <32613237+chenweize1998@users.noreply.github.com>
Date: Sun, 24 Mar 2024 16:42:51 +0800
Subject: [PATCH] OpenAI 1.0 and vLLMs support (#127)

* migrated to new openai library and added VLLM support

* updated README to add vLLM support

---------

Co-authored-by: taisazero
---
 .gitignore                                |   3 +-
 README.md                                 |  17 +
 .../rules/executor/tool_using.py          |  16 +-
 agentverse/llms/__init__.py               |  31 +-
 agentverse/llms/base.py                   |   5 +-
 agentverse/llms/openai.py                 | 364 ++++++++++++------
 agentverse/llms/utils/llm_server_utils.py |  42 ++
 agentverse/llms/utils/token_counter.py    |   4 +-
 agentverse/memory/chat_history.py         |   8 +-
 agentverse_command/main_simulation_gui.py |  22 +-
 requirements.txt                          |   2 +-
 scripts/evaluate_responsegen.py           |   8 +-
 12 files changed, 376 insertions(+), 146 deletions(-)
 create mode 100644 agentverse/llms/utils/llm_server_utils.py

diff --git a/.gitignore b/.gitignore
index bbfb5aec0..fced8a228 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,4 +173,5 @@ results
 tmp/
 data/toolbench
 logs/
-ci_smoke_test_output/
\ No newline at end of file
+ci_smoke_test_output/
+.env
\ No newline at end of file
diff --git a/README.md b/README.md
index f93466317..dd03c1019 100644
--- a/README.md
+++ b/README.md
@@ -218,6 +218,8 @@ https://github.com/OpenBMB/AgentVerse/assets/11704492/4d07da68-f942-4205-b558-f1
   - [Framework Required Modules](#framework-required-modules-1)
   - [CLI Example](#cli-example-1)
 - [Local Model Support](#local-model-support)
+  - [vLLM Support](#vllm-support)
+  - [FSChat Support](#fschat-support)
   - [1. Install the Additional Dependencies](#1-install-the-additional-dependencies)
   - [2. Launch the Local Server](#2-launch-the-local-server)
   - [3. Modify the Config File](#3-modify-the-config-file)
@@ -351,6 +353,21 @@ We have provided more tasks in `agentverse/tasks/tasksolving/tool_using/` that s
 Also, you can take a look at `agentverse/tasks/tasksolving` for more experiments we have done in our paper.
 
 ## Local Model Support
+## vLLM Support
+If you want to use vLLM, follow the guide [here](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) to install and set up a vLLM server, which can handle larger inference workloads. Set the following environment variables so that AgentVerse can connect to the vLLM server:
+```bash
+export VLLM_API_KEY="your_api_key_here"
+export VLLM_BASE_URL="http://your_vllm_url_here"
+```
+
+Then modify the `model` field in the task config file so that it matches a model name served by the vLLM server. For example:
+```yaml
+model_type: vllm
+model: llama-2-7b-chat-hf
+```
+
+## FSChat Support
+This section provides a step-by-step guide to integrating FSChat into AgentVerse. FSChat is a framework that runs local models such as LLaMA and Vicuna on your local machine.
 ### 1.
Install the Additional Dependencies If you want to use local models such as LLaMA, you need to additionally install some other dependencies: ```bash diff --git a/agentverse/environments/tasksolving_env/rules/executor/tool_using.py b/agentverse/environments/tasksolving_env/rules/executor/tool_using.py index 9ae70a78d..13b4d02d5 100644 --- a/agentverse/environments/tasksolving_env/rules/executor/tool_using.py +++ b/agentverse/environments/tasksolving_env/rules/executor/tool_using.py @@ -1,12 +1,12 @@ import json import ast -import openai +from openai import OpenAI from string import Template from colorama import Fore from aiohttp import ClientSession from copy import deepcopy from typing import TYPE_CHECKING, Any, List, Tuple - +import httpx from agentverse.agents import ExecutorAgent from agentverse.message import Message, ExecutorMessage, SolverMessage from agentverse.logging import logger @@ -14,9 +14,9 @@ from . import BaseExecutor, executor_registry import asyncio from agentverse.llms.utils.jsonrepair import JsonRepair +from agentverse.llms.openai import DEFAULT_CLIENT_ASYNC as client_async url = "http://127.0.0.1:8080" - SUMMARIZE_PROMPT = """Here is the text gathered from a webpage, and a question you need to answer from the webpage. -- Webpage -- ${webpage} @@ -219,7 +219,7 @@ async def _summarize_webpage(webpage, question): ) for _ in range(3): try: - response = await openai.ChatCompletion.acreate( + response = await client_async.chat.completions.create( messages=[{"role": "user", "content": summarize_prompt}], model="gpt-3.5-turbo-16k", functions=[ @@ -261,7 +261,7 @@ async def _summarize_webpage(webpage, question): continue arguments = ast.literal_eval( JsonRepair( - response["choices"][0]["message"]["function_call"]["arguments"] + response.choices[0].message.function_call.arguments ).repair() ) ret = ( @@ -300,7 +300,7 @@ async def _summarize_webpage(webpage, question): } for i in range(3): try: - async with ClientSession(cookies=cookies, trust_env=True) as session: + async with httpx.AsyncClient(cookies=cookies, trust_env=True) as session: if cookies is None: async with session.post( f"{url}/get_cookie", timeout=30 @@ -327,12 +327,12 @@ async def _summarize_webpage(webpage, question): ) as response: content = await response.text() if command == "WebEnv_browse_website": - openai.aiosession.set(session) + client_async.http_client = session result = await _summarize_webpage( content, arguments["goals_to_browse"] ) elif command == "WebEnv_search_and_browse": - openai.aiosession.set(session) + client_async.http_client = session content = json.loads(content) # for i in range(len(content)): diff --git a/agentverse/llms/__init__.py b/agentverse/llms/__init__.py index b623e6234..f5bcda3fe 100644 --- a/agentverse/llms/__init__.py +++ b/agentverse/llms/__init__.py @@ -9,12 +9,33 @@ "vicuna-13b-v1.5", ] LOCAL_LLMS_MAPPING = { - "llama-2-7b-chat-hf": "meta-llama/Llama-2-7b-chat-hf", - "llama-2-13b-chat-hf": "meta-llama/Llama-2-13b-chat-hf", - "llama-2-70b-chat-hf": "meta-llama/Llama-2-70b-chat-hf", - "vicuna-7b-v1.5": "lmsys/vicuna-7b-v1.5", - "vicuna-13b-v1.5": "lmsys/vicuna-13b-v1.5", + "llama-2-7b-chat-hf": { + "hf_model_name": "meta-llama/Llama-2-7b-chat-hf", + "base_url": "http://localhost:5000/v1", + "api_key": "EMPTY", + }, + "llama-2-13b-chat-hf": { + "hf_model_name": "meta-llama/Llama-2-13b-chat-hf", + "base_url": "http://localhost:5000/v1", + "api_key": "EMPTY", + }, + "llama-2-70b-chat-hf": { + "hf_model_name": "meta-llama/Llama-2-70b-chat-hf", + "base_url": 
"http://localhost:5000/v1", + "api_key": "EMPTY", + }, + "vicuna-7b-v1.5": { + "hf_model_name": "lmsys/vicuna-7b-v1.5", + "base_url": "http://localhost:5000/v1", + "api_key": "EMPTY", + }, + "vicuna-13b-v1.5": { + "hf_model_name": "lmsys/vicuna-13b-v1.5", + "base_url": "http://localhost:5000/v1", + "api_key": "EMPTY", + }, } + from .base import BaseLLM, BaseChatModel, BaseCompletionModel, LLMResult from .openai import OpenAIChat diff --git a/agentverse/llms/base.py b/agentverse/llms/base.py index b759fb281..cc38f6d39 100644 --- a/agentverse/llms/base.py +++ b/agentverse/llms/base.py @@ -1,6 +1,5 @@ from abc import abstractmethod -from typing import Dict, Any - +from typing import Any, Dict, Optional from pydantic import BaseModel, Field @@ -20,6 +19,8 @@ class BaseModelArgs(BaseModel): class BaseLLM(BaseModel): args: BaseModelArgs = Field(default_factory=BaseModelArgs) max_retry: int = Field(default=3) + client_args: Optional[Dict] = Field(default={}) + is_azure: bool = Field(default=False) @abstractmethod def get_spend(self) -> float: diff --git a/agentverse/llms/openai.py b/agentverse/llms/openai.py index 7897ef2fc..144e19d3d 100644 --- a/agentverse/llms/openai.py +++ b/agentverse/llms/openai.py @@ -5,51 +5,95 @@ import numpy as np from aiohttp import ClientSession from typing import Dict, List, Optional, Union -from tenacity import retry, stop_after_attempt, wait_exponential - -from pydantic import BaseModel, Field +from tenacity import ( + retry, + stop_after_attempt, + wait_exponential, + retry_if_exception_type, +) +from pydantic import Field from agentverse.llms.base import LLMResult from agentverse.logging import logger from agentverse.message import Message -from . import llm_registry, LOCAL_LLMS -from .base import BaseChatModel, BaseCompletionModel, BaseModelArgs +from . import llm_registry, LOCAL_LLMS, LOCAL_LLMS_MAPPING +from .base import BaseChatModel, BaseModelArgs from .utils.jsonrepair import JsonRepair +from .utils.llm_server_utils import get_llm_server_modelname try: - import openai - from openai.error import OpenAIError + from openai import OpenAI, AsyncOpenAI + from openai import OpenAIError + from openai import AzureOpenAI, AsyncAzureOpenAI except ImportError: is_openai_available = False - logger.warn("openai package is not installed") + logger.warn( + "openai package is not installed. 
Please install it via `pip install openai`" + ) else: - # openai.proxy = os.environ.get("http_proxy") - # if openai.proxy is None: - # openai.proxy = os.environ.get("HTTP_PROXY") - if os.environ.get("OPENAI_API_KEY") != None: - openai.api_key = os.environ.get("OPENAI_API_KEY") - is_openai_available = True - # set openai api base url if it is set - if os.environ.get("OPENAI_BASE_URL") != None: - openai.base_url = os.environ.get("OPENAI_BASE_URL") - print("use new openai base url", openai.base_url) - elif os.environ.get("AZURE_OPENAI_API_KEY") != None: - openai.api_type = "azure" - openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY") - openai.api_base = os.environ.get("AZURE_OPENAI_API_BASE") - openai.api_version = "2023-05-15" - is_openai_available = True - else: + api_key = None + base_url = None + model_name = None + OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") + OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL") + AZURE_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY") + AZURE_API_BASE = os.environ.get("AZURE_OPENAI_API_BASE") + VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL") + VLLM_API_KEY = os.environ.get("VLLM_API_KEY", "EMPTY") + + if not OPENAI_API_KEY and not AZURE_API_KEY: logger.warn( - "OpenAI API key is not set. Please set the environment variable OPENAI_API_KEY" + "OpenAI API key is not set. Please set an environment variable OPENAI_API_KEY or " + "AZURE_OPENAI_API_KEY." ) - is_openai_available = False + elif OPENAI_API_KEY: + DEFAULT_CLIENT = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL) + DEFAULT_CLIENT_ASYNC = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL) + api_key = OPENAI_API_KEY + base_url = OPENAI_BASE_URL + elif AZURE_API_KEY: + DEFAULT_CLIENT = AzureOpenAI( + api_key=AZURE_API_KEY, + azure_endpoint=AZURE_API_BASE, + api_version="2024-02-15-preview", + ) + DEFAULT_CLIENT_ASYNC = AsyncAzureOpenAI( + api_key=AZURE_API_KEY, + azure_endpoint=AZURE_API_BASE, + ) + api_key = AZURE_API_KEY + base_url = AZURE_API_BASE + if VLLM_BASE_URL: + if model_name := get_llm_server_modelname(VLLM_BASE_URL, VLLM_API_KEY, logger): + # model_name = /mnt/llama/hf_models/TheBloke_Llama-2-70B-Chat-GPTQ + # transform to TheBloke/Llama-2-70B-Chat-GPTQ + hf_model_name = model_name.split("/")[-1].replace("_", "/") + LOCAL_LLMS.append(model_name) + LOCAL_LLMS_MAPPING[model_name] = { + "hf_model_name": hf_model_name, + "base_url": VLLM_BASE_URL, + "api_key": VLLM_API_KEY if VLLM_API_KEY else "EMPTY", + } + logger.info(f"Using vLLM model: {hf_model_name}") + if hf_model_name := get_llm_server_modelname( + "http://localhost:5000", logger=logger + ): + # meta-llama/Llama-2-7b-chat-hf + # transform to llama-2-7b-chat-hf + short_model_name = model_name.split("/")[-1].lower() + LOCAL_LLMS.append(short_model_name) + LOCAL_LLMS_MAPPING[short_model_name] = { + "hf_model_name": hf_model_name, + "base_url": "http://localhost:5000/v1", + "api_key": "EMPTY", + } + + logger.info(f"Using FSChat model: {model_name}") class OpenAIChatArgs(BaseModelArgs): model: str = Field(default="gpt-3.5-turbo") - deployment_id: str = Field(default=None) max_tokens: int = Field(default=2048) temperature: float = Field(default=1.0) top_p: int = Field(default=1) @@ -101,9 +145,14 @@ class OpenAIChatArgs(BaseModelArgs): @llm_registry.register("gpt-35-turbo") @llm_registry.register("gpt-3.5-turbo") @llm_registry.register("gpt-4") +@llm_registry.register("vllm") @llm_registry.register("local") class OpenAIChat(BaseChatModel): args: OpenAIChatArgs = Field(default_factory=OpenAIChatArgs) + client_args: 
Optional[Dict] = Field( + default={"api_key": api_key, "base_url": base_url} + ) + is_azure: bool = Field(default=False) total_prompt_tokens: int = 0 total_completion_tokens: int = 0 @@ -111,14 +160,27 @@ class OpenAIChat(BaseChatModel): def __init__(self, max_retry: int = 3, **kwargs): args = OpenAIChatArgs() args = args.dict() + client_args = {"api_key": api_key, "base_url": base_url} + # check if api_key is an azure key + is_azure = False + if AZURE_API_KEY and not OPENAI_API_KEY: + is_azure = True for k, v in args.items(): args[k] = kwargs.pop(k, v) if len(kwargs) > 0: logger.warn(f"Unused arguments: {kwargs}") if args["model"] in LOCAL_LLMS: - openai.api_base = "http://localhost:5000/v1" - openai.api_key = "EMPTY" - super().__init__(args=args, max_retry=max_retry) + if args["model"] in LOCAL_LLMS_MAPPING: + client_args["api_key"] = LOCAL_LLMS_MAPPING[args["model"]]["api_key"] + client_args["base_url"] = LOCAL_LLMS_MAPPING[args["model"]]["base_url"] + is_azure = False + else: + raise ValueError( + f"Model {args['model']} not found in LOCAL_LLMS_MAPPING" + ) + super().__init__( + args=args, max_retry=max_retry, client_args=client_args, is_azure=is_azure + ) @classmethod def send_token_limit(self, model: str) -> int: @@ -126,20 +188,27 @@ def send_token_limit(self, model: str) -> int: "gpt-3.5-turbo": 4096, "gpt-35-turbo": 4096, "gpt-3.5-turbo-16k": 16384, + "gpt-3.5-turbo-0613": 16384, + "gpt-3.5-turbo-1106": 16384, + "gpt-3.5-turbo-0125": 16384, "gpt-4": 8192, "gpt-4-32k": 32768, + "gpt-4-0613": 32768, + "gpt-4-1106-preview": 131072, + "gpt-4-0125-preview": 131072, "llama-2-7b-chat-hf": 4096, } - - return send_token_limit_dict[model] - - # def _construct_messages(self, history: List[Message]): - # return history + [{"role": "user", "content": query}] - @retry( - stop=stop_after_attempt(20), - wait=wait_exponential(multiplier=1, min=4, max=10), - reraise=True, - ) + # Default to 4096 tokens if model is not in the dictionary + return send_token_limit_dict[model] if model in send_token_limit_dict else 4096 + + # @retry( + # stop=stop_after_attempt(20), + # wait=wait_exponential(multiplier=1, min=4, max=10), + # reraise=True, + # retry=retry_if_exception_type( + # exception_types=(OpenAIError, json.decoder.JSONDecodeError, Exception) + # ), + # ) def generate_response( self, prepend_prompt: str = "", @@ -149,59 +218,93 @@ def generate_response( ) -> LLMResult: messages = self.construct_messages(prepend_prompt, history, append_prompt) logger.log_prompt(messages) + if self.is_azure: + openai_client = AzureOpenAI( + api_key=self.client_args["api_key"], + azure_endpoint=self.client_args["base_url"], + api_version="2024-02-15-preview", + ) + else: + openai_client = OpenAI( + api_key=self.client_args["api_key"], + base_url=self.client_args["base_url"], + ) try: # Execute function call if functions != []: - response = openai.ChatCompletion.create( + response = openai_client.chat.completions.create( messages=messages, functions=functions, **self.args.dict(), ) - if response["choices"][0]["message"].get("function_call") is not None: + + logger.log_prompt( + [ + { + "role": "assistant", + "content": response.choices[0].message.content, + } + ] + ) + if response.choices[0].message.function_call is not None: self.collect_metrics(response) + return LLMResult( - content=response["choices"][0]["message"].get("content", ""), - function_name=response["choices"][0]["message"][ - "function_call" - ]["name"], + content=response.choices[0].message.get("content", ""), + 
function_name=response.choices[0].message.function_call.name, function_arguments=ast.literal_eval( - response["choices"][0]["message"]["function_call"][ - "arguments" - ] + response.choices[0].message.function_call.arguments ), - send_tokens=response["usage"]["prompt_tokens"], - recv_tokens=response["usage"]["completion_tokens"], - total_tokens=response["usage"]["total_tokens"], + send_tokens=response.usage.prompt_tokens, + recv_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, ) else: self.collect_metrics(response) + logger.log_prompt( + { + "role": "assistant", + "content": response.choices[0].message.content, + } + ) return LLMResult( - content=response["choices"][0]["message"]["content"], - send_tokens=response["usage"]["prompt_tokens"], - recv_tokens=response["usage"]["completion_tokens"], - total_tokens=response["usage"]["total_tokens"], + content=response.choices[0].message.content, + send_tokens=response.usage.prompt_tokens, + recv_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, ) else: - response = openai.ChatCompletion.create( + response = openai_client.chat.completions.create( messages=messages, **self.args.dict(), ) + logger.log_prompt( + [ + { + "role": "assistant", + "content": response.choices[0].message.content, + } + ] + ) self.collect_metrics(response) return LLMResult( - content=response["choices"][0]["message"]["content"], - send_tokens=response["usage"]["prompt_tokens"], - recv_tokens=response["usage"]["completion_tokens"], - total_tokens=response["usage"]["total_tokens"], + content=response.choices[0].message.content, + send_tokens=response.usage.prompt_tokens, + recv_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, ) except (OpenAIError, KeyboardInterrupt, json.decoder.JSONDecodeError) as error: raise - @retry( - stop=stop_after_attempt(20), - wait=wait_exponential(multiplier=1, min=4, max=10), - reraise=True, - ) + # @retry( + # stop=stop_after_attempt(20), + # wait=wait_exponential(multiplier=1, min=4, max=10), + # reraise=True, + # retry=retry_if_exception_type( + # exception_types=(OpenAIError, json.decoder.JSONDecodeError, Exception) + # ), + # ) async def agenerate_response( self, prepend_prompt: str = "", @@ -212,19 +315,34 @@ async def agenerate_response( messages = self.construct_messages(prepend_prompt, history, append_prompt) logger.log_prompt(messages) + if self.is_azure: + async_openai_client = AsyncAzureOpenAI( + api_key=self.client_args["api_key"], + azure_endpoint=self.client_args["base_url"], + api_version="2024-02-15-preview", + ) + else: + async_openai_client = AsyncOpenAI( + api_key=self.client_args["api_key"], + base_url=self.client_args["base_url"], + ) try: if functions != []: - async with ClientSession(trust_env=True) as session: - openai.aiosession.set(session) - response = await openai.ChatCompletion.acreate( - messages=messages, - functions=functions, - **self.args.dict(), - ) - if response["choices"][0]["message"].get("function_call") is not None: - function_name = response["choices"][0]["message"]["function_call"][ - "name" + response = await async_openai_client.chat.completions.create( + messages=messages, + functions=functions, + **self.args.dict(), + ) + logger.log_prompt( + [ + { + "role": "assistant", + "content": response.choices[0].message.content, + } ] + ) + if response.choices[0].message.function_call is not None: + function_name = response.choices[0].message.function_call.name valid_function = False if 
function_name.startswith("function."): function_name = function_name.replace("function.", "") @@ -243,17 +361,13 @@ async def agenerate_response( ) try: arguments = ast.literal_eval( - response["choices"][0]["message"]["function_call"][ - "arguments" - ] + response.choices[0].message.function_call.arguments ) except: try: arguments = ast.literal_eval( JsonRepair( - response["choices"][0]["message"]["function_call"][ - "arguments" - ] + response.choices[0].message.function_call.arguments ).repair() ) except: @@ -264,36 +378,55 @@ async def agenerate_response( "The returned argument in function call is not valid json." ) self.collect_metrics(response) + logger.log_prompt( + { + "role": "assistant", + "content": response.choices[0].message.content, + } + ) return LLMResult( function_name=function_name, function_arguments=arguments, - send_tokens=response["usage"]["prompt_tokens"], - recv_tokens=response["usage"]["completion_tokens"], - total_tokens=response["usage"]["total_tokens"], + send_tokens=response.usage.prompt_tokens, + recv_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, ) else: self.collect_metrics(response) + logger.log_prompt( + { + "role": "assistant", + "content": response.choices[0].message.content, + } + ) return LLMResult( - content=response["choices"][0]["message"]["content"], - send_tokens=response["usage"]["prompt_tokens"], - recv_tokens=response["usage"]["completion_tokens"], - total_tokens=response["usage"]["total_tokens"], + content=response.choices[0].message.content, + send_tokens=response.usage.prompt_tokens, + recv_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, ) else: - async with ClientSession(trust_env=True) as session: - openai.aiosession.set(session) - response = await openai.ChatCompletion.acreate( - messages=messages, - **self.args.dict(), - ) + + response = await async_openai_client.chat.completions.create( + messages=messages, + **self.args.dict(), + ) self.collect_metrics(response) + logger.log_prompt( + [ + { + "role": "assistant", + "content": response.choices[0].message.content, + } + ] + ) return LLMResult( - content=response["choices"][0]["message"]["content"], - send_tokens=response["usage"]["prompt_tokens"], - recv_tokens=response["usage"]["completion_tokens"], - total_tokens=response["usage"]["total_tokens"], + content=response.choices[0].message.content, + send_tokens=response.usage.prompt_tokens, + recv_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, ) except (OpenAIError, KeyboardInterrupt, json.decoder.JSONDecodeError) as error: raise @@ -311,8 +444,8 @@ def construct_messages( return messages def collect_metrics(self, response): - self.total_prompt_tokens += response["usage"]["prompt_tokens"] - self.total_completion_tokens += response["usage"]["completion_tokens"] + self.total_prompt_tokens += response.usage.prompt_tokens + self.total_completion_tokens += response.usage.completion_tokens def get_spend(self) -> int: input_cost_map = { @@ -320,9 +453,13 @@ def get_spend(self) -> int: "gpt-3.5-turbo-16k": 0.003, "gpt-3.5-turbo-0613": 0.0015, "gpt-3.5-turbo-16k-0613": 0.003, + "gpt-3.5-turbo-1106": 0.0005, + "gpt-3.5-turbo-0125": 0.0005, "gpt-4": 0.03, "gpt-4-0613": 0.03, "gpt-4-32k": 0.06, + "gpt-4-1106-preview": 0.01, + "gpt-4-0125-preview": 0.01, "llama-2-7b-chat-hf": 0.0, } @@ -331,9 +468,13 @@ def get_spend(self) -> int: "gpt-3.5-turbo-16k": 0.004, "gpt-3.5-turbo-0613": 0.002, "gpt-3.5-turbo-16k-0613": 0.004, + "gpt-3.5-turbo-1106": 
0.0015, + "gpt-3.5-turbo-0125": 0.0015, "gpt-4": 0.06, "gpt-4-0613": 0.06, "gpt-4-32k": 0.12, + "gpt-4-1106-preview": 0.03, + "gpt-4-0125-preview": 0.03, "llama-2-7b-chat-hf": 0.0, } @@ -353,16 +494,19 @@ def get_spend(self) -> int: reraise=True, ) def get_embedding(text: str, attempts=3) -> np.array: + if AZURE_API_KEY and AZURE_API_BASE: + client = AzureOpenAI( + api_key=AZURE_API_KEY, + azure_endpoint=AZURE_API_BASE, + api_version="2024-02-15-preview", + ) + elif OPENAI_API_KEY: + client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL) try: text = text.replace("\n", " ") - if openai.api_type == "azure": - embedding = openai.Embedding.create( - input=[text], deployment_id="text-embedding-ada-002" - )["data"][0]["embedding"] - else: - embedding = openai.Embedding.create( - input=[text], model="text-embedding-ada-002" - )["data"][0]["embedding"] + embedding = client.embeddings.create( + input=text, model="text-embedding-ada-002" + ).model_dump_json(indent=2) return tuple(embedding) except Exception as e: attempt += 1 diff --git a/agentverse/llms/utils/llm_server_utils.py b/agentverse/llms/utils/llm_server_utils.py new file mode 100644 index 000000000..1aef91f78 --- /dev/null +++ b/agentverse/llms/utils/llm_server_utils.py @@ -0,0 +1,42 @@ +import requests +from typing import Optional + +def get_llm_server_modelname( + base_url: str = "http://localhost:8000", api_key=None, logger=None +) -> Optional[str]: + # remove /v1 and any trailing slashes from the base_url + base_url = base_url.replace("/v1", "").rstrip("/") + try: + if api_key: + response = requests.get( + f"{base_url}/v1/models", headers={"Authorization": f"Bearer {api_key}"} + ) + else: + response = requests.get(f"{base_url}/v1/models") + if response.status_code == 200: + # get the model name hosted by vLLM + models = [m for m in response.json()["data"] if m["object"] == "model"] + if len(models) == 0: + if logger: + logger.warn( + "The vLLM server is running but not hosting any models." + ) + return None + model_name = models[0]["id"] + if logger: + logger.info(f"vLLM server is running. Selecting: {model_name}.") + return model_name + else: + if logger: + logger.warn( + f"vLLM server is running but could not get the list of models. 
Status code: {response.status_code}" + ) + return None + except requests.exceptions.ConnectionError: + if logger: + logger.warn("No vLLM server running at the specified URL.") + return None + except Exception as e: + if logger: + logger.warn(f"Error while trying to get the vLLM model name: {e}") + return None diff --git a/agentverse/llms/utils/token_counter.py b/agentverse/llms/utils/token_counter.py index 94125ad73..7af8e2ffb 100644 --- a/agentverse/llms/utils/token_counter.py +++ b/agentverse/llms/utils/token_counter.py @@ -12,7 +12,7 @@ def count_string_tokens(prompt: str = "", model: str = "gpt-3.5-turbo") -> int: return len(tiktoken.encoding_for_model(model).encode(prompt)) elif model.lower() in LOCAL_LLMS or model in LOCAL_LLMS: from transformers import AutoTokenizer - encoding = AutoTokenizer.from_pretrained(LOCAL_LLMS_MAPPING[model.lower()]) + encoding = AutoTokenizer.from_pretrained(LOCAL_LLMS_MAPPING[model.lower()]['hf_model_name']) return len(encoding.encode(prompt)) @@ -35,7 +35,7 @@ def count_message_tokens( elif model.lower() in LOCAL_LLMS or model in LOCAL_LLMS: from transformers import AutoTokenizer - encoding = AutoTokenizer.from_pretrained(LOCAL_LLMS_MAPPING[model.lower()]) + encoding = AutoTokenizer.from_pretrained(LOCAL_LLMS_MAPPING[model.lower()]['hf_model_name']) else: raise NotImplementedError( f"count_message_tokens() is not implemented for model {model}.\n" diff --git a/agentverse/memory/chat_history.py b/agentverse/memory/chat_history.py index 1b40f19ec..65d007514 100644 --- a/agentverse/memory/chat_history.py +++ b/agentverse/memory/chat_history.py @@ -1,7 +1,7 @@ import json import logging import os -import openai +from openai import OpenAI import copy from typing import List, Optional, Tuple, Dict @@ -12,7 +12,7 @@ from .base import BaseMemory from agentverse.llms.utils import count_message_tokens, count_string_tokens from agentverse.llms import OpenAIChat - +from agentverse.llms.openai import DEFAULT_CLIENT as openai_client @memory_registry.register("chat_history") class ChatHistoryMemory(BaseMemory): @@ -206,12 +206,12 @@ async def _update_summary_with_batch( summary=self.summary, new_events=new_events_batch ) - self.summary = await openai.ChatCompletion.acreate( + self.summary = await openai_client.chat.completions.acreate( messages=[{"role": "user", "content": prompt}], model=model, max_tokens=max_summary_length, temperature=0.5, - )["choices"][0]["message"]["content"] + ).choices[0].message.content def summary_message(self) -> dict: return { diff --git a/agentverse_command/main_simulation_gui.py b/agentverse_command/main_simulation_gui.py index 75d7fbd3f..e132a1024 100644 --- a/agentverse_command/main_simulation_gui.py +++ b/agentverse_command/main_simulation_gui.py @@ -9,19 +9,23 @@ type=str, default=os.path.join(os.path.dirname(__file__), "..", "agentverse", "tasks"), ) -parser.add_argument("--share", - action='store_true', - default=False, - help="Create a publicly shareable link") -parser.add_argument("--server_name", - type=str, - default="127.0.0.1", - help="Server name") +parser.add_argument( + "--share", + action="store_true", + default=False, + help="Create a publicly shareable link", +) +parser.add_argument("--server_name", type=str, default="127.0.0.1", help="Server name") +parser.add_argument("--debug", action="store_true", default=False, help="Debug mode") args = parser.parse_args() def cli_main(): - ui = GUI(args.task, args.tasks_dir,ui_kwargs={"share":args.share,"server_name":args.server_name}) + ui = GUI( + args.task, + args.tasks_dir, + 
ui_kwargs={"share": args.share, "server_name": args.server_name, "debug": args.debug}, + ) ui.launch() if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 0a93efec9..4c4575e64 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ fastapi==0.95.1 uvicorn py3langid setuptools-scm -openai==0.27.8 +openai==1.5.0 opencv-python==4.8.0.76 gradio httpx[socks]==0.25.0 diff --git a/scripts/evaluate_responsegen.py b/scripts/evaluate_responsegen.py index 07b497ae3..0b7adabf2 100644 --- a/scripts/evaluate_responsegen.py +++ b/scripts/evaluate_responsegen.py @@ -2,8 +2,9 @@ import json from string import Template import time -import openai +from openai import OpenAI from tqdm import tqdm +from agentverse.llms.openai import DEFAULT_CLIENT as client with open("./results.jsonl", "r") as f: lines = list(f.readlines()) @@ -25,7 +26,6 @@ res = [] eval = [] - def write_eval_to_file(file, skip=0): for idx, line in tqdm(enumerate(lines)): if idx < skip: @@ -50,7 +50,7 @@ def write_eval_to_file(file, skip=0): ) for i in range(100): try: - eval_response = openai.ChatCompletion.create( + eval_response = client.chat.completions.create( model="gpt-4", messages=[{"role": "user", "content": prompt}], temperature=0.0, @@ -59,7 +59,7 @@ def write_eval_to_file(file, skip=0): time.sleep(min(i**2, 60)) continue break - text = eval_response["choices"][0]["message"]["content"] + text = eval_response.choices[0].message.content eval.append(text) text = text.replace("\n", "\n\n") f.write(f"{text}\n\n")