ArchFC endpoint integration #94

Merged
merged 14 commits into from
Oct 1, 2024
Changes from 7 commits
10 changes: 3 additions & 7 deletions arch/src/stream_context.rs
@@ -479,7 +479,9 @@ impl StreamContext {

let model_resp = &arch_fc_response.choices[0];

if model_resp.message.tool_calls.is_none() {
if model_resp.message.tool_calls.is_none()
|| model_resp.message.tool_calls.as_ref().unwrap().is_empty()
{
// This means that Arch FC did not have enough information to resolve the function call
// Arch FC probably responded with a message asking for more information.
// Let's send the response back to the user to initialize a lightweight dialog for parameter collection
@@ -494,12 +496,6 @@ impl StreamContext {
}

let tool_calls = model_resp.message.tool_calls.as_ref().unwrap();
if tool_calls.is_empty() {
return self.send_server_error(
"No tool calls found in function resolver response".to_string(),
Some(StatusCode::BAD_REQUEST),
);
}

debug!("tool_call_details: {:?}", tool_calls);
// extract all tool names
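
The new condition folds the previous two-step handling into one guard: a missing tool_calls field and an empty tool_calls list are now treated identically, and both fall through to the parameter-collection path. A minimal sketch of the equivalent check in Python over the parsed Arch-FC JSON (the helper name is illustrative, not from the repo):

# Minimal sketch (not the project's Rust code): the same guard over a parsed
# Arch-FC response dict. Absent, null, or empty tool_calls all mean the model
# is asking for more information instead of resolving a function call.
def needs_parameter_collection(arch_fc_response: dict) -> bool:
    message = arch_fc_response["choices"][0]["message"]
    return not message.get("tool_calls")  # absent, None, or [] -> True

# Example: a clarifying response carries no tool calls.
resp = {"choices": [{"message": {"content": "Which city do you mean?", "tool_calls": []}}]}
assert needs_parameter_collection(resp)
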
9 changes: 8 additions & 1 deletion demos/function_calling/docker-compose.yaml
@@ -1,3 +1,7 @@
x-variables: &common-vars
environment:
- MODE=${MODE:-cloud} # Set the default mode to 'cloud'; other values are local-gpu, local-cpu

services:

config_generator:
@@ -34,6 +38,7 @@ services:
dockerfile: Dockerfile
ports:
- "18081:80"
<<: *common-vars
healthcheck:
test: ["CMD", "curl" ,"http://localhost/healthz"]
interval: 5s
@@ -48,6 +53,7 @@
dockerfile: Dockerfile
ports:
- "18082:80"
<<: *common-vars
healthcheck:
test: ["CMD", "curl" ,"http://localhost:80/healthz"]
interval: 5s
@@ -57,11 +63,12 @@
environment:
# use ollama endpoint that is hosted by host machine (no virtualization)
- OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-host.docker.internal}
- FC_URL=${FC_URL:}
- OLLAMA_MODEL=Arch-Function-Calling-3B-Q4_K_M
- MODE=${MODE:-cloud}
# uncomment following line to use ollama endpoint that is hosted by docker
# - OLLAMA_ENDPOINT=ollama
# - OLLAMA_MODEL=Arch-Function-Calling-1.5B:Q4_K_M

api_server:
build:
context: api_server
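
The x-variables block above defines the shared MODE environment entry once under the &common-vars anchor, and each service pulls it in with the <<: merge key. A minimal sketch of how that merge resolves (the service name is illustrative; PyYAML's safe_load handles merge keys, and docker compose resolves anchors the same way before variable substitution):

# Minimal sketch of YAML anchor + merge-key resolution, mirroring the compose change above.
import yaml

snippet = """
x-variables: &common-vars
  environment:
    - MODE=${MODE:-cloud}

services:
  some_service:        # illustrative name, not from the repo
    ports:
      - "18081:80"
    <<: *common-vars
"""

config = yaml.safe_load(snippet)
print(config["services"]["some_service"]["environment"])
# -> ['MODE=${MODE:-cloud}']  (compose substitutes the default 'cloud' at runtime)
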
4 changes: 3 additions & 1 deletion function_resolver/app/bolt_handler.py
@@ -89,7 +89,9 @@ def extract_tools(self, content, executable=False):
if isinstance(tool_call, dict):
try:
if not executable:
extracted_tools.append({tool_call["name"]: tool_call["arguments"]})
extracted_tools.append(
{tool_call["name"]: tool_call["arguments"]}
)
else:
name, arguments = (
tool_call.get("name", ""),
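
For reference, the non-executable branch reformatted above returns a list of single-key dicts, each mapping a tool name to its argument dict; main.py later flattens that list into OpenAI-style tool_calls. A small sketch of the shape (the tool name and values are illustrative only):

# Illustrative shape of extract_tools(content, executable=False) output;
# only the structure matches the handler, the values are made up.
extracted_tools = [
    {"get_weather": {"city": "Seattle", "unit": "celsius"}},
]

# main.py flattens each entry into a (name, arguments) pair:
for tool in extracted_tools:
    for tool_name, tool_args in tool.items():
        print(tool_name, tool_args)
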
2 changes: 2 additions & 0 deletions function_resolver/app/common.py
@@ -1,10 +1,12 @@
from typing import Any, Dict, List
from pydantic import BaseModel


class Message(BaseModel):
role: str
content: str


class ChatMessage(BaseModel):
messages: list[Message]
tools: List[Dict[str, Any]]
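
These two pydantic models define the resolver's request body. A small construction sketch (the tool definition is illustrative, not from the repo):

# Minimal sketch: building a request that matches the ChatMessage schema above.
from common import ChatMessage, Message  # assumes app/ is on the import path

payload = ChatMessage(
    messages=[Message(role="user", content="What's the weather in Seattle?")],
    tools=[{  # illustrative tool definition
        "name": "get_weather",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    }],
)
print(payload.model_dump())  # pydantic v2; on pydantic v1 use payload.dict()
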
75 changes: 50 additions & 25 deletions function_resolver/app/main.py
@@ -5,36 +5,56 @@
from bolt_handler import BoltHandler
from common import ChatMessage
import logging
import yaml
from openai import OpenAI
import os


with open("openai_params.yaml") as f:
params = yaml.safe_load(f)

ollama_endpoint = os.getenv("OLLAMA_ENDPOINT", "localhost")
ollama_model = os.getenv("OLLAMA_MODEL", "Arch-Function-Calling-1.5B-Q4_K_M")
logger = logging.getLogger('uvicorn.error')
fc_url = os.getenv("FC_URL", ollama_endpoint)
mode = os.getenv("MODE", "cloud")
if mode not in ["cloud", "local-gpu", "local-cpu"]:
raise ValueError(f"Invalid mode: {mode}")
arch_api_key = os.getenv("ARCH_API_KEY", "")
logger = logging.getLogger("uvicorn.error")

handler = None
if ollama_model.startswith("Arch"):
handler = ArchHandler()
handler = ArchHandler()
else:
handler = BoltHandler()

logger.info(f"using model: {ollama_model}")
logger.info(f"using ollama endpoint: {ollama_endpoint}")

app = FastAPI()

client = OpenAI(
base_url='http://{}:11434/v1/'.format(ollama_endpoint),
if mode == "cloud":
client = OpenAI(
Contributor: if it's cloud, ensure that api_key is set
Contributor (Author): for now, api_key is not required
base_url=fc_url,
api_key=arch_api_key,
)
models = client.models.list()
model = models.data[0].id
chosen_model = model
endpoint = fc_url
else:
client = OpenAI(
base_url="http://{}:11434/v1/".format(ollama_endpoint),
api_key="ollama",
)
chosen_model = ollama_model
endpoint = ollama_endpoint
logger.info(f"serving mode: {mode}")
logger.info(f"using model: {chosen_model}")
logger.info(f"using endpoint: {endpoint}")

# required but ignored
api_key='ollama',
)

@app.get("/healthz")
async def healthz():
return {
"status": "ok"
}
return {"status": "ok"}


@app.post("/v1/chat/completions")
@@ -45,23 +65,28 @@ async def chat_completion(req: ChatMessage, res: Response):
messages = [{"role": "system", "content": tools_encoded}]
for message in req.messages:
messages.append({"role": message.role, "content": message.content})
logger.info(f"request model: {ollama_model}, messages: {json.dumps(messages)}")
resp = client.chat.completions.create(messages=messages, model=ollama_model, stream=False)
logger.info(f"request model: {chosen_model}, messages: {json.dumps(messages)}")
completions_params = params["params"]
resp = client.chat.completions.create(
messages=messages,
model=chosen_model,
stream=False,
extra_body=completions_params,
)
tools = handler.extract_tools(resp.choices[0].message.content)
tool_calls = []
for tool in tools:
for tool_name, tool_args in tool.items():
tool_calls.append({
"id": f"call_{random.randint(1000, 10000)}",
"type": "function",
"function": {
"name": tool_name,
"arguments": tool_args
}
})
for tool_name, tool_args in tool.items():
tool_calls.append(
{
"id": f"call_{random.randint(1000, 10000)}",
"type": "function",
"function": {"name": tool_name, "arguments": tool_args},
}
)
if tools:
resp.choices[0].message.tool_calls = tool_calls
resp.choices[0].message.content = None
resp.choices[0].message.tool_calls = tool_calls
resp.choices[0].message.content = None
logger.info(f"response (tools): {json.dumps(tools)}")
logger.info(f"response: {json.dumps(resp.to_dict())}")
return resp
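
A usage sketch for the endpoint defined above; the host, port, tool definition, and prompt are assumptions for illustration (the demo compose file maps the container's port 80 to a host port such as 18081):

# Minimal sketch: calling the resolver's /v1/chat/completions endpoint.
import requests

body = {
    "messages": [{"role": "user", "content": "What's the weather in Seattle in celsius?"}],
    "tools": [
        {
            "name": "get_weather",  # hypothetical tool
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
        }
    ],
}

resp = requests.post("http://localhost:18081/v1/chat/completions", json=body, timeout=60)
message = resp.json()["choices"][0]["message"]
# When a tool is resolved, content is set to None and tool_calls carries the call(s).
print(message.get("tool_calls") or message.get("content"))
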
8 changes: 8 additions & 0 deletions function_resolver/app/openai_params.yaml
Contributor: it should be per model, e.g. openai_params_arch_guard_1.5b or something similar
Contributor (Author): the model name is the file path in the VM docker instance
@@ -0,0 +1,8 @@
params:
temperature: 0.0001
top_p : 0.5
repetition_penalty: 1.0
top_k: 50
max_tokens: 128
stop: ["<|im_start|>", "<|im_end|>"]
stop_token_ids: [151645, 151643]
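
Several of the knobs above (repetition_penalty, top_k, stop_token_ids) are backend-specific sampling parameters that the standard OpenAI chat-completions schema does not define, which is why main.py loads this file once and forwards the nested params mapping through extra_body. A minimal sketch of that flow (endpoint and model name are illustrative):

# Minimal sketch: forwarding openai_params.yaml via extra_body, as main.py does.
import yaml
from openai import OpenAI

with open("openai_params.yaml") as f:
    completions_params = yaml.safe_load(f)["params"]

client = OpenAI(base_url="http://localhost:11434/v1/", api_key="ollama")  # illustrative endpoint
resp = client.chat.completions.create(
    model="Arch-Function-Calling-1.5B-Q4_K_M",
    messages=[{"role": "user", "content": "ping"}],
    stream=False,
    extra_body=completions_params,  # non-standard fields travel in the request body
)
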
2 changes: 0 additions & 2 deletions model_server/app/install.py
@@ -9,7 +9,5 @@
load_transformers()
print("installing ner models")
load_ner_models()
print("installing toxic models")
load_toxic_model()
print("installing jailbreak models")
load_jailbreak_model()
39 changes: 15 additions & 24 deletions model_server/app/main.py
@@ -6,6 +6,7 @@
load_guard_model,
load_zero_shot_models,
)
import os
from utils import GuardHandler, split_text_into_chunks
import torch
import yaml
@@ -26,33 +27,23 @@
config = yaml.safe_load(file)
with open("guard_model_config.yaml") as f:
guard_model_config = yaml.safe_load(f)
mode = os.getenv("MODE", "cloud")
logger.info(f"Serving model mode: {mode}")
if mode not in ['cloud', 'local-gpu', 'local-cpu']:
raise ValueError(f"Invalid mode: {mode}")
if mode == 'local-cpu':
hardware = 'cpu'
else:
hardware = "gpu" if torch.cuda.is_available() else "cpu"

if "prompt_guards" in config.keys():
if len(config["prompt_guards"]["input_guards"]) == 2:
task = "both"
jailbreak_hardware = "gpu" if torch.cuda.is_available() else "cpu"
toxic_hardware = "gpu" if torch.cuda.is_available() else "cpu"
toxic_model = load_guard_model(
guard_model_config["toxic"][jailbreak_hardware], toxic_hardware
)
jailbreak_model = load_guard_model(
guard_model_config["jailbreak"][toxic_hardware], jailbreak_hardware
)
task = list(config["prompt_guards"]["input_guards"].keys())[0]

else:
task = list(config["prompt_guards"]["input_guards"].keys())[0]

hardware = "gpu" if torch.cuda.is_available() else "cpu"
if task == "toxic":
toxic_model = load_guard_model(
guard_model_config["toxic"][hardware], hardware
)
jailbreak_model = None
elif task == "jailbreak":
jailbreak_model = load_guard_model(
guard_model_config["jailbreak"][hardware], hardware
)
toxic_model = None
hardware = "gpu" if torch.cuda.is_available() else "cpu"
jailbreak_model = load_guard_model(
guard_model_config["jailbreak"][hardware], hardware
)
toxic_model = None


guard_handler = GuardHandler(toxic_model, jailbreak_model)
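
The model_server change reduces to a single hardware rule: MODE=local-cpu pins the guard models to CPU, while cloud and local-gpu use the GPU whenever torch reports one. A minimal sketch of that rule as a standalone helper (the function name is illustrative):

# Minimal sketch of the MODE -> hardware rule from model_server/app/main.py.
import os
from typing import Optional

import torch

def resolve_hardware(mode: Optional[str] = None) -> str:
    mode = mode or os.getenv("MODE", "cloud")
    if mode not in ("cloud", "local-gpu", "local-cpu"):
        raise ValueError(f"Invalid mode: {mode}")
    if mode == "local-cpu":
        return "cpu"
    return "gpu" if torch.cuda.is_available() else "cpu"

print(resolve_hardware("local-cpu"))  # -> cpu
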