From 4a0a5dff19b6fb50145997cc4d833e7a4a1a9811 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Tue, 9 Jan 2024 08:59:40 -0800 Subject: [PATCH 01/65] Add `call_agent_1` --- operate/actions.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/operate/actions.py b/operate/actions.py index 45013c1..6bacb76 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -54,13 +54,17 @@ async def get_next_action(model, messages, objective): if model == "gpt-4-with-som": return await call_gpt_4_v_labeled(messages, objective) elif model == "agent-1": - return "coming soon" + return call_agent_1(messages, objective) elif model == "gemini-pro-vision": return call_gemini_pro_vision(messages, objective) raise ModelNotRecognizedException(model) +def call_agent_1(messages, objective): + return "coming soon" + + def call_gpt_4_v(messages, objective): """ Get the next action for Self-Operating Computer From 1edc76cde2399119eca2d3754a528e5af8a87b2f Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Tue, 9 Jan 2024 11:55:56 -0800 Subject: [PATCH 02/65] Add `call_agent_1` --- operate/actions.py | 49 +++++++++++++++++++++++++++++++++++++++++----- operate/dialog.py | 6 +++++- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 6bacb76..98df3a6 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -48,21 +48,38 @@ yolo_model = YOLO("./operate/model/weights/best.pt") # Load your trained model -async def get_next_action(model, messages, objective): +async def get_next_action(model, messages, objective, session_id): if model == "gpt-4": return call_gpt_4_v(messages, objective) if model == "gpt-4-with-som": return await call_gpt_4_v_labeled(messages, objective) elif model == "agent-1": - return call_agent_1(messages, objective) + return call_agent_1(session_id, objective) elif model == "gemini-pro-vision": return call_gemini_pro_vision(messages, objective) raise ModelNotRecognizedException(model) -def call_agent_1(messages, objective): - return "coming soon" +def call_agent_1(session_id, objective): + print("[call_agent_1]") + time.sleep(1) + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + with open(screenshot_filename, "rb") as img_file: + base64_image = base64.b64encode(img_file.read()).decode("utf-8") + + response = fetch_agent_1_response(session_id, objective, base64_image) + print("[call_agent_1] response", response) + + return response + except Exception as e: + print(f"Error parsing JSON: {e}") + return "Failed take action after looking at the screenshot" def call_gpt_4_v(messages, objective): @@ -384,13 +401,35 @@ async def call_gpt_4_v_labeled(messages, objective): return click_action - except Exception as e: + except: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}" ) return call_gpt_4_v(messages, objective) +async def fetch_agent_1_response(session_id, objective, base64_image): + print("[call_agent_1][fetch_agent_1_response]") + url = "http://127.0.0.1:5000/agent/v1/action" + api_token = os.environ.get("AGENT_API_KEY") + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_token}", + } + data = { + "session_id": session_id, + "objective": objective, + "image": f"data:image/jpeg;base64,{base64_image}", + } + + async with aiohttp.ClientSession() as session: + async with session.post( + url, headers=headers, data=json.dumps(data) + ) as response: + print("[call_agent_1][fetch_agent_1_response] response", response.json()) + return await response.json() + + async def fetch_openai_response_async(messages): url = "https://api.openai.com/v1/chat/completions" headers = { diff --git a/operate/dialog.py b/operate/dialog.py index 6c95085..619940f 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -99,11 +99,15 @@ def main(model, terminal_prompt, voice_mode=False): loop_count = 0 + session_id = None + while True: if config.debug: print("[loop] messages before next action:\n\n\n", messages[1:]) try: - response = asyncio.run(get_next_action(model, messages, objective)) + response = asyncio.run( + get_next_action(model, messages, objective, session_id) + ) action = parse_response(response) action_type = action.get("type") From 8f8787aa39f5ccdf29817f6c948ff962e6a2af6a Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Tue, 9 Jan 2024 12:33:46 -0800 Subject: [PATCH 03/65] Remove `async` from `fetch_agent_1_response` --- operate/actions.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 98df3a6..dc8d413 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -6,6 +6,7 @@ import io import asyncio import aiohttp +import requests from PIL import Image from ultralytics import YOLO @@ -408,7 +409,7 @@ async def call_gpt_4_v_labeled(messages, objective): return call_gpt_4_v(messages, objective) -async def fetch_agent_1_response(session_id, objective, base64_image): +def fetch_agent_1_response(session_id, objective, base64_image): print("[call_agent_1][fetch_agent_1_response]") url = "http://127.0.0.1:5000/agent/v1/action" api_token = os.environ.get("AGENT_API_KEY") @@ -422,12 +423,9 @@ async def fetch_agent_1_response(session_id, objective, base64_image): "image": f"data:image/jpeg;base64,{base64_image}", } - async with aiohttp.ClientSession() as session: - async with session.post( - url, headers=headers, data=json.dumps(data) - ) as response: - print("[call_agent_1][fetch_agent_1_response] response", response.json()) - return await response.json() + response = requests.post(url, headers=headers, data=json.dumps(data)) + print("[call_agent_1][fetch_agent_1_response] response", response.json()) + return response.json() async def fetch_openai_response_async(messages): From 3830598e0e7908892a803461983a24d9a4f59c9b Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Tue, 9 Jan 2024 13:53:17 -0800 Subject: [PATCH 04/65] Minor fixes and `capture_screen_with_cursor` --- operate/actions.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index dc8d413..fd6e052 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -71,15 +71,19 @@ def call_agent_1(session_id, objective): os.makedirs(screenshots_dir) screenshot_filename = os.path.join(screenshots_dir, 
"screenshot.png") + + capture_screen_with_cursor(screenshot_filename) + with open(screenshot_filename, "rb") as img_file: base64_image = base64.b64encode(img_file.read()).decode("utf-8") + print("[call_agent_1] about to fetch_agent_1_response") response = fetch_agent_1_response(session_id, objective, base64_image) print("[call_agent_1] response", response) return response except Exception as e: - print(f"Error parsing JSON: {e}") + print(f"Error: {e}") return "Failed take action after looking at the screenshot" @@ -149,7 +153,7 @@ def call_gpt_4_v(messages, objective): return content except Exception as e: - print(f"Error parsing JSON: {e}") + print(f"Error: {e}") return "Failed take action after looking at the screenshot" @@ -201,7 +205,7 @@ def call_gemini_pro_vision(messages, objective): return content except Exception as e: - print(f"Error parsing JSON: {e}") + print(f"Error: {e}") return "Failed take action after looking at the screenshot" @@ -424,8 +428,8 @@ def fetch_agent_1_response(session_id, objective, base64_image): } response = requests.post(url, headers=headers, data=json.dumps(data)) - print("[call_agent_1][fetch_agent_1_response] response", response.json()) - return response.json() + print("[call_agent_1][fetch_agent_1_response] response", response.text) + return response.text async def fetch_openai_response_async(messages): From eecc0f762192dbfdae0ebb3b816d2ae2c9b0dfd7 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Thu, 11 Jan 2024 07:48:17 -0800 Subject: [PATCH 05/65] Create `execute_operations` function --- operate/actions.py | 6 ++--- operate/dialog.py | 55 ++++++++++++++++++++++++------------------- operate/prompts.py | 54 ------------------------------------------ operate/utils/misc.py | 2 +- operate/utils/os.py | 16 ++++++++++++- 5 files changed, 50 insertions(+), 83 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index fd6e052..8c4094b 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -51,13 +51,13 @@ async def get_next_action(model, messages, objective, session_id): if model == "gpt-4": - return call_gpt_4_v(messages, objective) + return [call_gpt_4_v(messages, objective)] if model == "gpt-4-with-som": - return await call_gpt_4_v_labeled(messages, objective) + return await [call_gpt_4_v_labeled(messages, objective)] elif model == "agent-1": return call_agent_1(session_id, objective) elif model == "gemini-pro-vision": - return call_gemini_pro_vision(messages, objective) + return [call_gemini_pro_vision(messages, objective)] raise ModelNotRecognizedException(model) diff --git a/operate/dialog.py b/operate/dialog.py index 619940f..25dcd2e 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -17,12 +17,12 @@ style, ) from operate.utils.os import ( - keyboard_type, + keyboard, search, click, ) from operate.actions import get_next_action, summarize -from operate.utils.misc import parse_response +from operate.utils.misc import parse_operation # Load configuration config = Config() @@ -105,14 +105,10 @@ def main(model, terminal_prompt, voice_mode=False): if config.debug: print("[loop] messages before next action:\n\n\n", messages[1:]) try: - response = asyncio.run( + operations = asyncio.run( get_next_action(model, messages, objective, session_id) ) - action = parse_response(response) - action_type = action.get("type") - action_detail = action.get("data") - except ModelNotRecognizedException as e: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" @@ -124,7 +120,23 @@ def main(model, terminal_prompt, 
voice_mode=False): ) break - if action_type == "DONE": + stop = execute_operations(operations, messages, model, objective) + if stop: + break + + loop_count += 1 + if loop_count > 15: + break + + +def execute_operations(operations, messages, model, objective): + for operate in operations: + o = parse_operation(operate) + operation_type = o.get("type") + operation_detail = o.get("data") + function_response = "" + + if operation_type == "DONE": print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" ) @@ -132,31 +144,30 @@ def main(model, terminal_prompt, voice_mode=False): print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}" ) - break + return True - if action_type != "UNKNOWN": + if operation_type != "UNKNOWN": print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} {ANSI_RESET}{action_detail}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} {ANSI_RESET}{operation_detail}" ) - function_response = "" - if action_type == "SEARCH": - function_response = search(action_detail) - elif action_type == "TYPE": - function_response = keyboard_type(action_detail) - elif action_type == "CLICK": - function_response = click(action_detail) + if operation_type == "SEARCH": + function_response = search(operation_detail) + elif operation_type == "TYPE": + function_response = keyboard(operation_detail) + elif operation_type == "CLICK": + function_response = click(operation_detail) else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong :({ANSI_RESET}" ) print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{response}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{operate}" ) break print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} COMPLETE {ANSI_RESET}{function_response}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} COMPLETE {ANSI_RESET}{function_response}" ) message = { @@ -165,10 +176,6 @@ def main(model, terminal_prompt, voice_mode=False): } messages.append(message) - loop_count += 1 - if loop_count > 15: - break - def validation(model, voice_mode): """ diff --git a/operate/prompts.py b/operate/prompts.py index 0e6b88c..a4c4d6a 100644 --- a/operate/prompts.py +++ b/operate/prompts.py @@ -89,60 +89,6 @@ Please use this context as additional info to further refine the "percent" location in the CLICK action! """ -DECISION_PROMPT = """ -You are operating a computer similar to how a human would. Look at the screen and take the next best action to reach your objective. - -Here are your methods you can use to operating the computer. - -1. CLICK - Move mouse and click -2. TYPE - Type on the keyboard -3. SEARCH - Search for a program that is installed on Mac locally and open it -4. DONE - When you completed the task respond with the exact following phrase content - -Here are the response formats below. - -1. CLICK -Response: CLICK - -2. TYPE -Response: TYPE "value you want to type" - -2. SEARCH -Response: SEARCH "app you want to search for on Mac" - -3. DONE -Response: DONE - -Here are examples of how to respond. -__ -Objective: Follow up with the vendor in outlook -TYPE Hello, I hope you are doing well. 
I wanted to follow up -__ -Objective: Open Spotify and play the beatles -SEARCH Spotify -__ -Objective: Find an image of a banana -CLICK -__ -Objective: Go buy a book about the history of the internet -TYPE https://www.amazon.com/ -__ - -A few important notes: - -- Default to opening Google Chrome with SEARCH to find things that are on the Web. -- After you open Google Chrome you need to click on the address bar to find a website. -- Do not use SEARCH to look for websites like Google Docs or Linkedin. SEARCH only finds programs installed on the computer. -- After you click to enter a field you can go ahead and start typing! -- If you can see the field is active, go ahead and type! -- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. - -{previous_action} - -IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row. - -{objective} -""" LABELED_IMAGE_PROMPT = """ Your job is simple. Decide if there is an elements on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs. diff --git a/operate/utils/misc.py b/operate/utils/misc.py index 6959d4d..0c1076e 100644 --- a/operate/utils/misc.py +++ b/operate/utils/misc.py @@ -55,7 +55,7 @@ def extract_json_from_string(s): return None -def parse_response(response): +def parse_operation(response): """ Parses the given response and returns a dictionary with the type and data. diff --git a/operate/utils/os.py b/operate/utils/os.py index 98d05c1..9b1cd4b 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -6,7 +6,7 @@ from operate.utils.misc import convert_percent_to_decimal -def keyboard_type(text): +def keyboard(text): """ Types the given text using the keyboard. @@ -53,6 +53,20 @@ def search(text): return "Open program: " + text +def hotkey(hotkey): + print("[hotkey] ", hotkey) + keys = hotkey.split("+") + print("[hotkey] keys", keys) + for key in keys: + print("[hotkey] keydown", key) + pyautogui.keyDown(key) + time.sleep(0.1) + for key in keys: + print("[hotkey] keyup", key) + pyautogui.keyUp(key) + return True + + def click(click_detail): """ Perform a mouse click at the specified coordinates. From 639d3f64c25b75acb8a740d3e384abf96f56e218 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Thu, 11 Jan 2024 11:44:52 -0800 Subject: [PATCH 06/65] Add back `DECISION_PROMPT` --- operate/actions.py | 2 +- operate/prompts.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/operate/actions.py b/operate/actions.py index 8c4094b..92b93ca 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -55,7 +55,7 @@ async def get_next_action(model, messages, objective, session_id): if model == "gpt-4-with-som": return await [call_gpt_4_v_labeled(messages, objective)] elif model == "agent-1": - return call_agent_1(session_id, objective) + return [call_agent_1(session_id, objective)] elif model == "gemini-pro-vision": return [call_gemini_pro_vision(messages, objective)] diff --git a/operate/prompts.py b/operate/prompts.py index a4c4d6a..5673115 100644 --- a/operate/prompts.py +++ b/operate/prompts.py @@ -143,6 +143,48 @@ Display the results clearly: """ +DECISION_PROMPT = """ +You are operating a computer similar to how a human would. Look at the screen and take the next best action to reach your objective. +Here are your methods you can use to operating the computer. +1. CLICK - Move mouse and click +2. 
TYPE - Type on the keyboard +3. SEARCH - Search for a program that is installed on Mac locally and open it +4. DONE - When you completed the task respond with the exact following phrase content +Here are the response formats below. +1. CLICK +Response: CLICK +2. TYPE +Response: TYPE "value you want to type" +2. SEARCH +Response: SEARCH "app you want to search for on Mac" +3. DONE +Response: DONE +Here are examples of how to respond. +__ +Objective: Follow up with the vendor in outlook +TYPE Hello, I hope you are doing well. I wanted to follow up +__ +Objective: Open Spotify and play the beatles +SEARCH Spotify +__ +Objective: Find an image of a banana +CLICK +__ +Objective: Go buy a book about the history of the internet +TYPE https://www.amazon.com/ +__ +A few important notes: +- Default to opening Google Chrome with SEARCH to find things that are on the Web. +- After you open Google Chrome you need to click on the address bar to find a website. +- Do not use SEARCH to look for websites like Google Docs or Linkedin. SEARCH only finds programs installed on the computer. +- After you click to enter a field you can go ahead and start typing! +- If you can see the field is active, go ahead and type! +- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. +{previous_action} +IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row. +{objective} +""" + def format_summary_prompt(objective): """ From 48f17b17615cd56c5047bcc0cd28715998890ee1 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Thu, 11 Jan 2024 14:37:38 -0800 Subject: [PATCH 07/65] Fixes two bugs --- operate/actions.py | 4 +++- operate/dialog.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 92b93ca..52992e4 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -50,10 +50,12 @@ async def get_next_action(model, messages, objective, session_id): + print("[get_next_action]") if model == "gpt-4": return [call_gpt_4_v(messages, objective)] if model == "gpt-4-with-som": - return await [call_gpt_4_v_labeled(messages, objective)] + action = await call_gpt_4_v_labeled(messages, objective) + return [action] elif model == "agent-1": return [call_agent_1(session_id, objective)] elif model == "gemini-pro-vision": diff --git a/operate/dialog.py b/operate/dialog.py index 25dcd2e..4ebce09 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -164,7 +164,7 @@ def execute_operations(operations, messages, model, objective): print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{operate}" ) - break + return True print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} COMPLETE {ANSI_RESET}{function_response}" From ccf7edfdc7752a951818ac2776e7b67c2c36f2af Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 07:36:13 -0800 Subject: [PATCH 08/65] Iterate `execute_operations_new` --- operate/actions.py | 9 +-- operate/dialog.py | 129 +++++++++++++++++++++++++++++------------- operate/utils/misc.py | 2 +- operate/utils/os.py | 17 +++--- 4 files changed, 103 insertions(+), 54 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 52992e4..b962e97 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -54,10 +54,11 @@ async def get_next_action(model, messages, objective, session_id): if model == "gpt-4": return [call_gpt_4_v(messages, objective)] if 
model == "gpt-4-with-som": - action = await call_gpt_4_v_labeled(messages, objective) - return [action] + operation = await call_gpt_4_v_labeled(messages, objective) + return [operation] elif model == "agent-1": - return [call_agent_1(session_id, objective)] + operation = call_agent_1(session_id, objective) + return operation elif model == "gemini-pro-vision": return [call_gemini_pro_vision(messages, objective)] @@ -430,7 +431,7 @@ def fetch_agent_1_response(session_id, objective, base64_image): } response = requests.post(url, headers=headers, data=json.dumps(data)) - print("[call_agent_1][fetch_agent_1_response] response", response.text) + return response.text diff --git a/operate/dialog.py b/operate/dialog.py index 4ebce09..28acbe8 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -1,6 +1,8 @@ import sys +import json import os import platform +import time import asyncio from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt @@ -16,11 +18,7 @@ ANSI_BRIGHT_MAGENTA, style, ) -from operate.utils.os import ( - keyboard, - search, - click, -) +from operate.utils.os import keyboard, search, mouse, press from operate.actions import get_next_action, summarize from operate.utils.misc import parse_operation @@ -120,7 +118,7 @@ def main(model, terminal_prompt, voice_mode=False): ) break - stop = execute_operations(operations, messages, model, objective) + stop = execute_operations_new(operations, messages, model, objective) if stop: break @@ -129,37 +127,87 @@ def main(model, terminal_prompt, voice_mode=False): break -def execute_operations(operations, messages, model, objective): - for operate in operations: - o = parse_operation(operate) - operation_type = o.get("type") - operation_detail = o.get("data") - function_response = "" - - if operation_type == "DONE": - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" - ) - summary = summarize(model, messages, objective) - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}" - ) - return True - - if operation_type != "UNKNOWN": - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} {ANSI_RESET}{operation_detail}" - ) - - if operation_type == "SEARCH": - function_response = search(operation_detail) - elif operation_type == "TYPE": - function_response = keyboard(operation_detail) - elif operation_type == "CLICK": - function_response = click(operation_detail) +# def execute_operations(operations, messages, model, objective): +# for operate in operations: +# o = parse_operation(operate) +# operation_type = o.get("type") +# operation_detail = o.get("data") +# function_response = "" + +# if operation_type == "DONE": +# print( +# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" +# ) +# summary = summarize(model, messages, objective) +# print( +# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}" +# ) +# return True + +# if operation_type != "UNKNOWN": +# print( +# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} {ANSI_RESET}{operation_detail}" +# ) + +# if operation_type == "SEARCH": +# function_response = search(operation_detail) +# elif operation_type == "TYPE": +# function_response = keyboard(operation_detail) +# elif operation_type == "CLICK": +# function_response = click(operation_detail) +# else: +# print( +# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong 
:({ANSI_RESET}" +# ) +# print( +# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{operate}" +# ) +# return True + +# print( +# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} COMPLETE {ANSI_RESET}{function_response}" +# ) + +# message = { +# "role": "assistant", +# "content": function_response, +# } +# messages.append(message) + + +def execute_operations_new(operation, messages, model, objective): + print("[execute_operations_new] operations before", operation) + print("[execute_operations_new] type(operations) before", type(operation)) + try: + operation = json.loads(operation) + + print("[execute_operations_new] type(operations) after", type(operation)) + except Exception as e: + print("[execute_operations_new] error", e) + + for operate in operation: + # wait one second + time.sleep(5) + print("[execute_operations_new] operation", operation) + operation_type = operate.get("operation") + # print + print("[execute_operations_new] operation_type", operation_type) + # function_response = "" + + if operation_type == "press": + keys = operate.get("keys") + function_response = press(keys) + elif operation_type == "write": + content = operate.get("content") + function_response = keyboard(content) + elif operation_type == "mouse": + x = operate.get("x") + y = operate.get("y") + click_detail = {"x": x, "y": y} + function_response = mouse(click_detail) else: print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong :({ANSI_RESET}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" ) print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{operate}" @@ -170,11 +218,12 @@ def execute_operations(operations, messages, model, objective): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} COMPLETE {ANSI_RESET}{function_response}" ) - message = { - "role": "assistant", - "content": function_response, - } - messages.append(message) + # message = { + # "role": "assistant", + # "content": function_response, + # } + # messages.append(message) + return True def validation(model, voice_mode): diff --git a/operate/utils/misc.py b/operate/utils/misc.py index 0c1076e..c7544ae 100644 --- a/operate/utils/misc.py +++ b/operate/utils/misc.py @@ -55,7 +55,7 @@ def extract_json_from_string(s): return None -def parse_operation(response): +def parse_operations(response): """ Parses the given response and returns a dictionary with the type and data. diff --git a/operate/utils/os.py b/operate/utils/os.py index 9b1cd4b..fc31242 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -6,7 +6,7 @@ from operate.utils.misc import convert_percent_to_decimal -def keyboard(text): +def keyboard(content): """ Types the given text using the keyboard. @@ -16,11 +16,11 @@ def keyboard(text): Returns: str: A message indicating the typed text. 
""" - text = text.replace("\\n", "\n") - for char in text: + content = content.replace("\\n", "\n") + for char in content: pyautogui.write(char) - pyautogui.press("enter") - return "Type: " + text + # pyautogui.press("enter") + return "Type: " + content def search(text): @@ -53,9 +53,8 @@ def search(text): return "Open program: " + text -def hotkey(hotkey): - print("[hotkey] ", hotkey) - keys = hotkey.split("+") +def press(keys): + print("[hotkey] ") print("[hotkey] keys", keys) for key in keys: print("[hotkey] keydown", key) @@ -67,7 +66,7 @@ def hotkey(hotkey): return True -def click(click_detail): +def mouse(click_detail): """ Perform a mouse click at the specified coordinates. From 6855df68610de43f0a90cbcbf09799a41700e8cb Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 07:57:29 -0800 Subject: [PATCH 09/65] Update to `execute_operations1` --- operate/dialog.py | 59 ++++------------------------------------------- 1 file changed, 5 insertions(+), 54 deletions(-) diff --git a/operate/dialog.py b/operate/dialog.py index 28acbe8..29b2d1f 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -20,7 +20,6 @@ ) from operate.utils.os import keyboard, search, mouse, press from operate.actions import get_next_action, summarize -from operate.utils.misc import parse_operation # Load configuration config = Config() @@ -118,64 +117,16 @@ def main(model, terminal_prompt, voice_mode=False): ) break - stop = execute_operations_new(operations, messages, model, objective) + stop = execute_operations(operations, messages, model, objective) if stop: break loop_count += 1 - if loop_count > 15: + if loop_count > 3: break -# def execute_operations(operations, messages, model, objective): -# for operate in operations: -# o = parse_operation(operate) -# operation_type = o.get("type") -# operation_detail = o.get("data") -# function_response = "" - -# if operation_type == "DONE": -# print( -# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" -# ) -# summary = summarize(model, messages, objective) -# print( -# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}" -# ) -# return True - -# if operation_type != "UNKNOWN": -# print( -# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} {ANSI_RESET}{operation_detail}" -# ) - -# if operation_type == "SEARCH": -# function_response = search(operation_detail) -# elif operation_type == "TYPE": -# function_response = keyboard(operation_detail) -# elif operation_type == "CLICK": -# function_response = click(operation_detail) -# else: -# print( -# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong :({ANSI_RESET}" -# ) -# print( -# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{operate}" -# ) -# return True - -# print( -# f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} COMPLETE {ANSI_RESET}{function_response}" -# ) - -# message = { -# "role": "assistant", -# "content": function_response, -# } -# messages.append(message) - - -def execute_operations_new(operation, messages, model, objective): +def execute_operations(operation, messages, model, objective): print("[execute_operations_new] operations before", operation) print("[execute_operations_new] type(operations) before", type(operation)) try: @@ -187,7 +138,7 @@ def execute_operations_new(operation, messages, model, objective): for operate in operation: # wait one second - time.sleep(5) + time.sleep(3) 
print("[execute_operations_new] operation", operation) operation_type = operate.get("operation") # print @@ -223,7 +174,7 @@ def execute_operations_new(operation, messages, model, objective): # "content": function_response, # } # messages.append(message) - return True + return False def validation(model, voice_mode): From c6d92c814e5a9f33ca3fff2493417415606847b9 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 09:34:11 -0800 Subject: [PATCH 10/65] remove `ACCURATE_MODE_VISION_PROMPT` --- operate/actions.py | 51 +--------------------------------------------- operate/prompts.py | 33 ------------------------------ 2 files changed, 1 insertion(+), 83 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index b962e97..0d9a152 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -2,6 +2,7 @@ import time import json import base64 + import re import io import asyncio @@ -21,7 +22,6 @@ from operate.utils.os import get_last_assistant_message from operate.prompts import ( format_vision_prompt, - format_accurate_mode_vision_prompt, format_summary_prompt, format_decision_prompt, format_label_prompt, @@ -212,55 +212,6 @@ def call_gemini_pro_vision(messages, objective): return "Failed take action after looking at the screenshot" -# This function is not used. `-accurate` mode was removed for now until a new PR fixes it. -def accurate_mode_double_check(model, pseudo_messages, prev_x, prev_y): - """ - Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location - """ - try: - screenshot_filename = os.path.join("screenshots", "screenshot_mini.png") - capture_mini_screenshot_with_cursor( - file_path=screenshot_filename, x=prev_x, y=prev_y - ) - - new_screenshot_filename = os.path.join( - "screenshots", "screenshot_mini_with_grid.png" - ) - - with open(new_screenshot_filename, "rb") as img_file: - img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - - accurate_vision_prompt = format_accurate_mode_vision_prompt(prev_x, prev_y) - - accurate_mode_message = { - "role": "user", - "content": [ - {"type": "text", "text": accurate_vision_prompt}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, - }, - ], - } - - pseudo_messages.append(accurate_mode_message) - - response = client.chat.completions.create( - model="gpt-4-vision-preview", - messages=pseudo_messages, - presence_penalty=1, - frequency_penalty=1, - temperature=0.7, - max_tokens=300, - ) - - content = response.choices[0].message.content - - except Exception as e: - print(f"Error reprompting model for accurate_mode: {e}") - return "ERROR" - - def summarize(model, messages, objective): try: screenshots_dir = "screenshots" diff --git a/operate/prompts.py b/operate/prompts.py index 5673115..4728761 100644 --- a/operate/prompts.py +++ b/operate/prompts.py @@ -6,10 +6,6 @@ # General user Prompts USER_QUESTION = "Hello, I can help you with anything. What would you like done?" -# constants for the vision prompt -ACCURATE_PIXEL_COUNT = ( - 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big -) # ------------------------- # VISION PROMPT @@ -73,23 +69,6 @@ """ -# ---------------------------------- -# ACCURATE MODE VISION PROMPT -# ---------------------------------- -ACCURATE_MODE_VISION_PROMPT = """ -It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot. 
-As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action. -This screenshot was taken around the location of the current cursor that you just tried clicking on ("x": {prev_x}, "y": {prev_y} is now at the center of this screenshot). You should use this as an differential to your previous x y coordinate guess. - -If you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the "x" and subtract {height}% in the "y" to your previous answer. -Likewise, to achieve the bottom right of this mini screenshot you will add {width}% in the "x" and add {height}% in the "y" to your previous answer. - -There are four segmenting lines across each dimension, divided evenly. This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer. - -Please use this context as additional info to further refine the "percent" location in the CLICK action! -""" - - LABELED_IMAGE_PROMPT = """ Your job is simple. Decide if there is an elements on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs. @@ -206,18 +185,6 @@ def format_vision_prompt(objective, previous_action): return prompt -def format_accurate_mode_vision_prompt(prev_x, prev_y): - """ - Format the accurate mode vision prompt - """ - width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100 - height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100 - prompt = ACCURATE_MODE_VISION_PROMPT.format( - prev_x=prev_x, prev_y=prev_y, width=width, height=height - ) - return prompt - - def format_decision_prompt(objective, previous_action): """ Format the vision prompt From 6caa6a5dc4bf1eefaf8131f62b11aa7b1e39eb18 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 10:52:18 -0800 Subject: [PATCH 11/65] remove `capture_mini_screenshot_with_cursor`, small iteration to new `operate` method --- operate/actions.py | 20 ++++++++--- operate/dialog.py | 16 +++------ operate/utils/screenshot.py | 68 ------------------------------------- 3 files changed, 20 insertions(+), 84 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 0d9a152..fe54d48 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -17,7 +17,6 @@ from operate.utils.screenshot import ( capture_screen_with_cursor, add_grid_to_image, - capture_mini_screenshot_with_cursor, ) from operate.utils.os import get_last_assistant_message from operate.prompts import ( @@ -57,7 +56,7 @@ async def get_next_action(model, messages, objective, session_id): operation = await call_gpt_4_v_labeled(messages, objective) return [operation] elif model == "agent-1": - operation = call_agent_1(session_id, objective) + operation, session_id = call_agent_1(session_id, objective) return operation elif model == "gemini-pro-vision": return [call_gemini_pro_vision(messages, objective)] @@ -81,10 +80,12 @@ def call_agent_1(session_id, objective): base64_image = base64.b64encode(img_file.read()).decode("utf-8") print("[call_agent_1] about to fetch_agent_1_response") - response = fetch_agent_1_response(session_id, objective, base64_image) + response, session_id = fetch_agent_1_response( + session_id, objective, base64_image + ) print("[call_agent_1] response", response) - return response + return response, session_id except Exception as e: 
print(f"Error: {e}") return "Failed take action after looking at the screenshot" @@ -382,8 +383,17 @@ def fetch_agent_1_response(session_id, objective, base64_image): } response = requests.post(url, headers=headers, data=json.dumps(data)) + response_dict = response.json() + + print("[call_agent_1][fetch_agent_1_response] A response_dict", response_dict) + print( + "[call_agent_1][fetch_agent_1_response] A type(response_dict", + type(response_dict), + ) + operations = response_dict.get("operations") + session_id = response_dict.get("session_id") - return response.text + return operations, session_id async def fetch_openai_response_async(messages): diff --git a/operate/dialog.py b/operate/dialog.py index 29b2d1f..864ceef 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -126,20 +126,14 @@ def main(model, terminal_prompt, voice_mode=False): break -def execute_operations(operation, messages, model, objective): - print("[execute_operations_new] operations before", operation) - print("[execute_operations_new] type(operations) before", type(operation)) - try: - operation = json.loads(operation) +def execute_operations(operations, messages, model, objective): + print("[execute_operations_new] operations before", operations) + print("[execute_operations_new] type(operations) before", type(operations)) - print("[execute_operations_new] type(operations) after", type(operation)) - except Exception as e: - print("[execute_operations_new] error", e) - - for operate in operation: + for operate in operations: # wait one second time.sleep(3) - print("[execute_operations_new] operation", operation) + print("[execute_operations_new] operation", operations) operation_type = operate.get("operation") # print print("[execute_operations_new] operation_type", operation_type) diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py index 087416b..cc0f739 100644 --- a/operate/utils/screenshot.py +++ b/operate/utils/screenshot.py @@ -7,7 +7,6 @@ import Xlib.X import Xlib.Xutil # not sure if Xutil is necessary from operate.settings import Config -from operate.prompts import ACCURATE_PIXEL_COUNT # Load configuration config = Config() @@ -82,73 +81,6 @@ def draw_label_with_background( image.save(new_image_path) -def capture_mini_screenshot_with_cursor( - file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0 -): - """ - Capture a mini screenshot with the cursor at the specified coordinates. - - Args: - file_path (str, optional): The file path to save the screenshot. Defaults to "screenshots/screenshot_mini.png". - x (int or str, optional): The x-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0. - y (int or str, optional): The y-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0. - """ - user_platform = platform.system() - - if user_platform == "Linux": - x = float(x[:-1]) # convert x from "50%" to 50. 
- y = float(y[:-1]) - - x = (x / 100) * monitor_size[ - "width" - ] # convert x from 50 to 0.5 * monitor_width - y = (y / 100) * monitor_size["height"] - - # Define the coordinates for the rectangle - x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2) - x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2) - - screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2)) - screenshot = screenshot.resize( - (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS - ) # upscale the image so it's easier to see and percentage marks more visible - screenshot.save(file_path) - - screenshots_dir = "screenshots" - grid_screenshot_filename = os.path.join( - screenshots_dir, "screenshot_mini_with_grid.png" - ) - - add_grid_to_image( - file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2) - ) - elif user_platform == "Darwin": - x = float(x[:-1]) # convert x from "50%" to 50. - y = float(y[:-1]) - - x = (x / 100) * monitor_size[ - "width" - ] # convert x from 50 to 0.5 * monitor_width - y = (y / 100) * monitor_size["height"] - - x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2) - - width = ACCURATE_PIXEL_COUNT - height = ACCURATE_PIXEL_COUNT - # Use the screencapture utility to capture the screen with the cursor - rect = f"-R{x1},{y1},{width},{height}" - subprocess.run(["screencapture", "-C", rect, file_path]) - - screenshots_dir = "screenshots" - grid_screenshot_filename = os.path.join( - screenshots_dir, "screenshot_mini_with_grid.png" - ) - - add_grid_to_image( - file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2) - ) - - def capture_screen_with_cursor(file_path): """ Capture the screen with the cursor and save it to the specified file path. From d0f47e67248f62cae369ee86288492fb3aa766a1 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 10:54:21 -0800 Subject: [PATCH 12/65] Add `session_id` for `agent-1` api --- operate/actions.py | 8 ++++---- operate/dialog.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index fe54d48..11fc25d 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -51,15 +51,15 @@ async def get_next_action(model, messages, objective, session_id): print("[get_next_action]") if model == "gpt-4": - return [call_gpt_4_v(messages, objective)] + return [call_gpt_4_v(messages, objective)], None if model == "gpt-4-with-som": operation = await call_gpt_4_v_labeled(messages, objective) - return [operation] + return [operation], None elif model == "agent-1": operation, session_id = call_agent_1(session_id, objective) - return operation + return operation, session_id elif model == "gemini-pro-vision": - return [call_gemini_pro_vision(messages, objective)] + return [call_gemini_pro_vision(messages, objective)], None raise ModelNotRecognizedException(model) diff --git a/operate/dialog.py b/operate/dialog.py index 864ceef..71dbd24 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -102,7 +102,7 @@ def main(model, terminal_prompt, voice_mode=False): if config.debug: print("[loop] messages before next action:\n\n\n", messages[1:]) try: - operations = asyncio.run( + operations, session_id = asyncio.run( get_next_action(model, messages, objective, session_id) ) From 909efd7c529ead07dfab79b11e38100864ece04c Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 11:00:07 -0800 Subject: [PATCH 13/65] better prints some fixes --- operate/actions.py | 8 ++------ operate/dialog.py | 7 +++---- 2 files 
changed, 5 insertions(+), 10 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 11fc25d..58d9ae6 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -57,6 +57,8 @@ async def get_next_action(model, messages, objective, session_id): return [operation], None elif model == "agent-1": operation, session_id = call_agent_1(session_id, objective) + print("[get_next_action] returning operation", operation) + print("[get_next_action] returning session_id", session_id) return operation, session_id elif model == "gemini-pro-vision": return [call_gemini_pro_vision(messages, objective)], None @@ -384,12 +386,6 @@ def fetch_agent_1_response(session_id, objective, base64_image): response = requests.post(url, headers=headers, data=json.dumps(data)) response_dict = response.json() - - print("[call_agent_1][fetch_agent_1_response] A response_dict", response_dict) - print( - "[call_agent_1][fetch_agent_1_response] A type(response_dict", - type(response_dict), - ) operations = response_dict.get("operations") session_id = response_dict.get("session_id") diff --git a/operate/dialog.py b/operate/dialog.py index 71dbd24..d839c2b 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -99,12 +99,12 @@ def main(model, terminal_prompt, voice_mode=False): session_id = None while True: - if config.debug: - print("[loop] messages before next action:\n\n\n", messages[1:]) try: operations, session_id = asyncio.run( get_next_action(model, messages, objective, session_id) ) + print("[loop] asyncio.run get_next_action got operations", operations) + print("[loop] asyncio.run get_next_action got session_id", session_id) except ModelNotRecognizedException as e: print( @@ -127,8 +127,7 @@ def main(model, terminal_prompt, voice_mode=False): def execute_operations(operations, messages, model, objective): - print("[execute_operations_new] operations before", operations) - print("[execute_operations_new] type(operations) before", type(operations)) + print("[execute_operations_new] operations", operations) for operate in operations: # wait one second From 4e5168db3d5b506226157bfbdffc82aca1bdbd9e Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 12:17:25 -0800 Subject: [PATCH 14/65] Update `README.md` to temp version until updated --- README.md | 30 ++++++------------------------ operate/dialog.py | 2 +- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 54c043e..a59ad3d 100644 --- a/README.md +++ b/README.md @@ -45,43 +45,25 @@ Below are instructions to set up the Self-Operating Computer Framework locally o ### Option 1: Traditional Installation -1. **Clone the repo** to a directory on your computer: -``` -git clone https://github.com/OthersideAI/self-operating-computer.git -``` -2. **Cd into directory**: - ``` -cd self-operating-computer -``` - -3. **Create a Python virtual environment**. [Learn more about Python virtual environment](https://docs.python.org/3/library/venv.html). - -``` -python3 -m venv venv -``` -4. **Activate the virtual environment**: -``` -source venv/bin/activate -``` -5. **Install Project Requirements and Command-Line Interface: Instead of using `pip install .`, you can now install the project directly from PyPI with:** +1. **Install Project Requirements and Command-Line Interface: Instead of using `pip install .`, you can now install the project directly from PyPI with:** ``` pip install self-operating-computer ``` -6. **Then rename the `.example.env` file to `.env` so that you can save your OpenAI key in it.** +2. 
``` -mv .example.env .env + ``` -7. **Add your Open AI key to your new `.env` file. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**: +3. **Add your Open AI key to your new `.env` file. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**: ``` OPENAI_API_KEY='your-key-here' ``` -8. **Run it**! +4. **Run it**! ``` operate ``` -9. **Final Step**: As a last step, the Terminal app will ask for permission for "Screen Recording" and "Accessibility" in the "Security & Privacy" page of Mac's "System Preferences". +5. **Final Step**: As a last step, the Terminal app will ask for permission for "Screen Recording" and "Accessibility" in the "Security & Privacy" page of Mac's "System Preferences".
diff --git a/operate/dialog.py b/operate/dialog.py index d839c2b..64b6aa4 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -131,7 +131,7 @@ def execute_operations(operations, messages, model, objective): for operate in operations: # wait one second - time.sleep(3) + time.sleep(1) print("[execute_operations_new] operation", operations) operation_type = operate.get("operation") # print From 4e400016c84532b5255b533ad34557028ec8802d Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 14:06:59 -0800 Subject: [PATCH 15/65] Add `SYSTEM_PROMPT` --- operate/prompts.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/operate/prompts.py b/operate/prompts.py index 4728761..c3f2c53 100644 --- a/operate/prompts.py +++ b/operate/prompts.py @@ -69,6 +69,64 @@ """ +SYSTEM_PROMPT = """ +You are operating a computer, using the same operating system as a human. + +From looking at the screen, the objective, and your previous actions, take the next best series of action. + +You have 4 possible operation actions available to you which you use in the `pyautogui` library. Your output should always be valid `json` because it will be used in `json.loads` + +1. mouse - Move mouse and click +[{{ "decision": "information about the decision", "operation": "mouse", "x": "percent", "y": "percent" }}] # 'percent' refers to the percentage of the screen's dimensions + +2. write - Write with your keyboard +[{{ "decision": "information about the decision", "operation": "write", "content": "text to write here" }}] + +3. press - Use a hotkey or press key to operate the computer +[{{ "decision": "information about the decision", "operation": "press", "keys": ["keys to use"] }}] + +4. done - The objective is completed +[{{ "decision": "information about the decision", "operation": "done", "summary": "summary of what was completed" }}] + +Return the actions in array format `[]`. You can take just one action or multiple actions. + +Here are some helpful combinations: + +# Opens Spotlight Search on Mac +[ + {{ "decision": "Opening OS search to look for Google Chrome", "operation": "press", "keys": ["command", "space"] }}, + {{ "decision": "Writing 'Google Chrome' to search the computer for it", "operation": "write", "content": "Google Chrome" }}, + {{ "decision": "Pressing enter to open Chrome", "operation": "press", "keys": ["enter"] }} +] + +# Focuses on the address bar in a browser before typing a website +[ + {{ "decision": "Focusing on the address bar in the browser", "operation": "press", "keys": ["command", "l"] }}, + {{ "decision": "Writing the hacker news URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, + {{ "decision": "Pressing enter to go to the URL", "operation": "press", "keys": ["enter"] }} +] + +A few important notes: + +- Go to Google Docs and Google Sheets by typing in the Chrome Address bar +- Remember you only have those 4 operations available to you. +- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. + +Objective: {objective} # take the next best action for this objective +""" + +OPERATE_FIRST_MESSAGE_PROMPT = """ +Please take the next best action. Remember you only have the following 4 operations available: mouse, write, press, done + +Right now you are probably in the terminal because the human just started up. + +Action:""" + +OPERATE_PROMPT = """ +Please take the next best action. 
Remember you only have the following 4 operations available: mouse, write, press, done +Action:""" + + LABELED_IMAGE_PROMPT = """ Your job is simple. Decide if there is an elements on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs. @@ -205,3 +263,21 @@ def format_label_prompt(objective): """ prompt = LABELED_IMAGE_PROMPT.format(objective=objective) return prompt + + +def get_system_prompt(objective): + """ + Format the vision prompt + """ + prompt = SYSTEM_PROMPT.format(objective=objective) + return prompt + + +def get_user_prompt(): + prompt = OPERATE_PROMPT + return prompt + + +def get_user_first_message_prompt(): + prompt = OPERATE_FIRST_MESSAGE_PROMPT + return prompt From d5046791f3f307dbafd304e8b288f7417fb34fbf Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 14:07:16 -0800 Subject: [PATCH 16/65] Update `call_gpt_4_v` for keycommands --- operate/actions.py | 48 ++++++++++++++++++++++----------------------- operate/dialog.py | 11 ++++------- operate/utils/os.py | 2 +- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 58d9ae6..0491698 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -24,6 +24,7 @@ format_summary_prompt, format_decision_prompt, format_label_prompt, + get_user_first_message_prompt, ) @@ -51,7 +52,7 @@ async def get_next_action(model, messages, objective, session_id): print("[get_next_action]") if model == "gpt-4": - return [call_gpt_4_v(messages, objective)], None + return call_gpt_4_v(messages), None if model == "gpt-4-with-som": operation = await call_gpt_4_v_labeled(messages, objective) return [operation], None @@ -93,12 +94,12 @@ def call_agent_1(session_id, objective): return "Failed take action after looking at the screenshot" -def call_gpt_4_v(messages, objective): +def call_gpt_4_v(messages): """ Get the next action for Self-Operating Computer """ - # sleep for a second - time.sleep(1) + print("[call_gpt_4_v]") + try: screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): @@ -111,51 +112,48 @@ def call_gpt_4_v(messages, objective): new_screenshot_filename = os.path.join( "screenshots", "screenshot_with_grid.png" ) + print("[call_gpt_4_v] new_screenshot_filename", new_screenshot_filename) add_grid_to_image(screenshot_filename, new_screenshot_filename, 500) - # sleep for a second - time.sleep(1) with open(new_screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - previous_action = get_last_assistant_message(messages) + user_prompt = get_user_first_message_prompt() - vision_prompt = format_vision_prompt(objective, previous_action) + print("[call_gpt_4_v] user_message", user_prompt) vision_message = { "role": "user", "content": [ - {"type": "text", "text": vision_prompt}, + {"type": "text", "text": user_prompt}, { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, }, ], } - - # create a copy of messages and save to pseudo_messages - pseudo_messages = messages.copy() - pseudo_messages.append(vision_message) + messages.append(vision_message) response = client.chat.completions.create( model="gpt-4-vision-preview", - messages=pseudo_messages, + messages=messages, presence_penalty=1, frequency_penalty=1, temperature=0.7, max_tokens=300, ) - - messages.append( - { - "role": "user", - "content": "`screenshot.png`", - } - ) + print("[call_gpt_4_v] response", response) content = response.choices[0].message.content + if 
content.startswith("```json"): + content = content[len("```json") :] # Remove starting ```json + if content.endswith("```"): + content = content[: -len("```")] # Remove ending + + content = json.loads(content) + return content except Exception as e: @@ -349,7 +347,7 @@ async def call_gpt_4_v_labeled(messages, objective): print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}" ) - return call_gpt_4_v(messages, objective) + return call_gpt_4_v(messages) x_percent = f"{click_position_percent[0]:.2f}%" y_percent = f"{click_position_percent[1]:.2f}%" @@ -359,7 +357,7 @@ async def call_gpt_4_v_labeled(messages, objective): print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] No label found. Trying another method {ANSI_RESET}" ) - return call_gpt_4_v(messages, objective) + return call_gpt_4_v(messages) return click_action @@ -367,12 +365,12 @@ async def call_gpt_4_v_labeled(messages, objective): print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}" ) - return call_gpt_4_v(messages, objective) + return call_gpt_4_v(messages) def fetch_agent_1_response(session_id, objective, base64_image): print("[call_agent_1][fetch_agent_1_response]") - url = "http://127.0.0.1:5000/agent/v1/action" + url = "http://127.0.0.1:5000/agent/v2/action" api_token = os.environ.get("AGENT_API_KEY") headers = { "Content-Type": "application/json", diff --git a/operate/dialog.py b/operate/dialog.py index 64b6aa4..81dd86b 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -7,7 +7,7 @@ from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt from operate.exceptions import ModelNotRecognizedException -from operate.prompts import USER_QUESTION +from operate.prompts import USER_QUESTION, get_system_prompt from operate.settings import Config from operate.utils.style import ( ANSI_GREEN, @@ -87,12 +87,9 @@ def main(model, terminal_prompt, voice_mode=False): print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") objective = prompt(style=style) - assistant_message = {"role": "assistant", "content": USER_QUESTION} - user_message = { - "role": "user", - "content": f"Objective: {objective}", - } - messages = [assistant_message, user_message] + system_prompt = get_system_prompt(objective) + system_message = {"role": "system", "content": system_prompt} + messages = [system_message] loop_count = 0 diff --git a/operate/utils/os.py b/operate/utils/os.py index fc31242..fbe8578 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -82,7 +82,7 @@ def mouse(click_detail): if click_detail and isinstance(x, float) and isinstance(y, float): click_at_percentage(x, y) - return click_detail["description"] + return "we clicked successfully" else: return "We failed to click" From 2291810faf96c695a7811aa1749462458debbbbc Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 14:44:57 -0800 Subject: [PATCH 17/65] Update `execute_operations` & remove search --- operate/actions.py | 5 ++++- operate/dialog.py | 24 +++++++++++------------- operate/utils/os.py | 30 ------------------------------ 3 files changed, 15 insertions(+), 44 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 0491698..c3adc6b 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -143,7 +143,6 @@ def call_gpt_4_v(messages): temperature=0.7, max_tokens=300, ) - print("[call_gpt_4_v] response", response) content = response.choices[0].message.content @@ -152,6 
+151,10 @@ def call_gpt_4_v(messages): if content.endswith("```"): content = content[: -len("```")] # Remove ending + assistant_message = {"role": "assistant", "content": content} + print("[call_gpt_4_v] message.append(assistant_message)", assistant_message) + messages.append(assistant_message) + content = json.loads(content) return content diff --git a/operate/dialog.py b/operate/dialog.py index 81dd86b..36cf555 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -100,8 +100,7 @@ def main(model, terminal_prompt, voice_mode=False): operations, session_id = asyncio.run( get_next_action(model, messages, objective, session_id) ) - print("[loop] asyncio.run get_next_action got operations", operations) - print("[loop] asyncio.run get_next_action got session_id", session_id) + print("[loop] operations", operations) except ModelNotRecognizedException as e: print( @@ -114,7 +113,7 @@ def main(model, terminal_prompt, voice_mode=False): ) break - stop = execute_operations(operations, messages, model, objective) + stop = execute_operations(operations) if stop: break @@ -123,25 +122,24 @@ def main(model, terminal_prompt, voice_mode=False): break -def execute_operations(operations, messages, model, objective): - print("[execute_operations_new] operations", operations) - +def execute_operations(operations): for operate in operations: # wait one second time.sleep(1) - print("[execute_operations_new] operation", operations) - operation_type = operate.get("operation") + print("[execute_operations_new] operate", operate) + operate_type = operate.get("operation").lower() + # print - print("[execute_operations_new] operation_type", operation_type) + print("[execute_operations_new] operation_type", operate_type) # function_response = "" - if operation_type == "press": + if operate_type == "press" or operate_type == "hotkey": keys = operate.get("keys") function_response = press(keys) - elif operation_type == "write": + elif operate_type == "write": content = operate.get("content") function_response = keyboard(content) - elif operation_type == "mouse": + elif operate_type == "mouse": x = operate.get("x") y = operate.get("y") click_detail = {"x": x, "y": y} @@ -156,7 +154,7 @@ def execute_operations(operations, messages, model, objective): return True print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operation_type} COMPLETE {ANSI_RESET}{function_response}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operate_type} COMPLETE {ANSI_RESET}{function_response}" ) # message = { diff --git a/operate/utils/os.py b/operate/utils/os.py index fbe8578..fd1ede0 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -23,36 +23,6 @@ def keyboard(content): return "Type: " + content -def search(text): - """ - Searches for a program or file by typing the given text in the search bar and pressing Enter. - - Args: - text (str): The text to be searched. - - Returns: - str: A message indicating that the program or file has been opened. 
- """ - if platform.system() == "Windows": - pyautogui.press("win") - elif platform.system() == "Linux": - pyautogui.press("win") - else: - # Press and release Command and Space separately - pyautogui.keyDown("command") - pyautogui.press("space") - pyautogui.keyUp("command") - - time.sleep(1) - - # Now type the text - for char in text: - pyautogui.write(char) - - pyautogui.press("enter") - return "Open program: " + text - - def press(keys): print("[hotkey] ") print("[hotkey] keys", keys) From 52d01c0f066d8e653a7df4ad74bb7f222aa8acab Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 14:48:48 -0800 Subject: [PATCH 18/65] Remove `add_grid_to_image` approach --- operate/actions.py | 9 +-------- operate/utils/os.py | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index c3adc6b..621346f 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -109,14 +109,7 @@ def call_gpt_4_v(messages): # Call the function to capture the screen with the cursor capture_screen_with_cursor(screenshot_filename) - new_screenshot_filename = os.path.join( - "screenshots", "screenshot_with_grid.png" - ) - print("[call_gpt_4_v] new_screenshot_filename", new_screenshot_filename) - - add_grid_to_image(screenshot_filename, new_screenshot_filename, 500) - - with open(new_screenshot_filename, "rb") as img_file: + with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") user_prompt = get_user_first_message_prompt() diff --git a/operate/utils/os.py b/operate/utils/os.py index fd1ede0..c6c2284 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -19,7 +19,7 @@ def keyboard(content): content = content.replace("\\n", "\n") for char in content: pyautogui.write(char) - # pyautogui.press("enter") + return "Type: " + content From 1068ab08c6acd21eeb34792f8292db2bb806a1db Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 14:49:49 -0800 Subject: [PATCH 19/65] Remove `add_grid_to_image` --- operate/actions.py | 9 +---- operate/utils/screenshot.py | 68 ------------------------------------- 2 files changed, 1 insertion(+), 76 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 621346f..3054843 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -16,7 +16,6 @@ from operate.exceptions import ModelNotRecognizedException from operate.utils.screenshot import ( capture_screen_with_cursor, - add_grid_to_image, ) from operate.utils.os import get_last_assistant_message from operate.prompts import ( @@ -171,12 +170,6 @@ def call_gemini_pro_vision(messages, objective): screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") # Call the function to capture the screen with the cursor capture_screen_with_cursor(screenshot_filename) - - new_screenshot_filename = os.path.join( - "screenshots", "screenshot_with_grid.png" - ) - - add_grid_to_image(screenshot_filename, new_screenshot_filename, 500) # sleep for a second time.sleep(1) @@ -187,7 +180,7 @@ def call_gemini_pro_vision(messages, objective): model = genai.GenerativeModel("gemini-pro-vision") response = model.generate_content( - [vision_prompt, Image.open(new_screenshot_filename)] + [vision_prompt, Image.open(screenshot_filename)] ) # create a copy of messages and save to pseudo_messages diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py index cc0f739..c9c3ac2 100644 --- a/operate/utils/screenshot.py +++ b/operate/utils/screenshot.py @@ -13,74 +13,6 @@ monitor_size = 
config.monitor_size -def add_grid_to_image(original_image_path, new_image_path, grid_interval): - """ - Add a grid to an image. - - Args: - original_image_path (str): The file path of the original image. - new_image_path (str): The file path to save the new image with the grid. - grid_interval (int): The interval between grid lines in pixels. - - Returns: - None: The function saves the new image with the grid at the specified path. - """ - # Load the image - image = Image.open(original_image_path) - - # Create a drawing object - draw = ImageDraw.Draw(image) - - # Get the image size - width, height = image.size - - # Reduce the font size a bit - font_size = int(grid_interval / 10) # Reduced font size - - # Calculate the background size based on the font size - bg_width = int(font_size * 4.2) # Adjust as necessary - bg_height = int(font_size * 1.2) # Adjust as necessary - - # Function to draw text with a white rectangle background - def draw_label_with_background( - position, text, draw, font_size, bg_width, bg_height - ): - # Adjust the position based on the background size - text_position = (position[0] + bg_width // 2, position[1] + bg_height // 2) - # Draw the text background - draw.rectangle( - [position[0], position[1], position[0] + bg_width, position[1] + bg_height], - fill="white", - ) - # Draw the text - draw.text(text_position, text, fill="black", font_size=font_size, anchor="mm") - - # Draw vertical lines and labels at every `grid_interval` pixels - for x in range(grid_interval, width, grid_interval): - line = ((x, 0), (x, height)) - draw.line(line, fill="blue") - for y in range(grid_interval, height, grid_interval): - # Calculate the percentage of the width and height - x_percent = round((x / width) * 100) - y_percent = round((y / height) * 100) - draw_label_with_background( - (x - bg_width // 2, y - bg_height // 2), - f"{x_percent}%,{y_percent}%", - draw, - font_size, - bg_width, - bg_height, - ) - - # Draw horizontal lines - labels are already added with vertical lines - for y in range(grid_interval, height, grid_interval): - line = ((0, y), (width, y)) - draw.line(line, fill="blue") - - # Save the image with the grid - image.save(new_image_path) - - def capture_screen_with_cursor(file_path): """ Capture the screen with the cursor and save it to the specified file path. From a204ca051d623444c5cfd1252854545e5bfcc027 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 15:08:10 -0800 Subject: [PATCH 20/65] Update `message_dialog` --- operate/dialog.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/operate/dialog.py b/operate/dialog.py index 36cf555..d4d597a 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -1,5 +1,4 @@ import sys -import json import os import platform import time @@ -12,14 +11,13 @@ from operate.utils.style import ( ANSI_GREEN, ANSI_RESET, - ANSI_BLUE, ANSI_YELLOW, ANSI_RED, ANSI_BRIGHT_MAGENTA, style, ) -from operate.utils.os import keyboard, search, mouse, press -from operate.actions import get_next_action, summarize +from operate.utils.os import keyboard, mouse, press +from operate.actions import get_next_action # Load configuration config = Config() @@ -58,7 +56,7 @@ def main(model, terminal_prompt, voice_mode=False): if not terminal_prompt: message_dialog( title="Self-Operating Computer", - text="Ask a computer to do anything.", + text="Ask a computer to do anything! 
This is an experimental framework to enable multimodal models to operate computers", style=style, ).run() else: @@ -126,11 +124,11 @@ def execute_operations(operations): for operate in operations: # wait one second time.sleep(1) - print("[execute_operations_new] operate", operate) + print("[execute_operations] operate", operate) operate_type = operate.get("operation").lower() # print - print("[execute_operations_new] operation_type", operate_type) + print("[execute_operations] operation_type", operate_type) # function_response = "" if operate_type == "press" or operate_type == "hotkey": From df002b468862ec1b99b05ea46f022d7ab5598c42 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 15:16:51 -0800 Subject: [PATCH 21/65] Fix `convert_percent_to_decimal` to match new prompt --- operate/prompts.py | 93 ++++++++++++++++++++++--------------------- operate/utils/misc.py | 33 ++------------- operate/utils/os.py | 8 ++-- 3 files changed, 56 insertions(+), 78 deletions(-) diff --git a/operate/prompts.py b/operate/prompts.py index c3f2c53..6c03b91 100644 --- a/operate/prompts.py +++ b/operate/prompts.py @@ -7,6 +7,53 @@ USER_QUESTION = "Hello, I can help you with anything. What would you like done?" +SYSTEM_PROMPT = """ +You are operating a computer, using the same operating system as a human. + +From looking at the screen, the objective, and your previous actions, take the next best series of action. + +You have 4 possible operation actions available to you which you use in the `pyautogui` library. Your output should always be valid `json` because it will be used in `json.loads` + +1. mouse - Move mouse and click +[{{ "decision": "information about the decision", "operation": "mouse", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format + +2. write - Write with your keyboard +[{{ "decision": "information about the decision", "operation": "write", "content": "text to write here" }}] + +3. press - Use a hotkey or press key to operate the computer +[{{ "decision": "information about the decision", "operation": "press", "keys": ["keys to use"] }}] + +4. done - The objective is completed +[{{ "decision": "information about the decision", "operation": "done", "summary": "summary of what was completed" }}] + +Return the actions in array format `[]`. You can take just one action or multiple actions. + +Here are some helpful combinations: + +# Opens Spotlight Search on Mac +[ + {{ "decision": "Opening OS search to look for Google Chrome", "operation": "press", "keys": ["command", "space"] }}, + {{ "decision": "Writing 'Google Chrome' to search the computer for it", "operation": "write", "content": "Google Chrome" }}, + {{ "decision": "Pressing enter to open Chrome", "operation": "press", "keys": ["enter"] }} +] + +# Focuses on the address bar in a browser before typing a website +[ + {{ "decision": "Focusing on the address bar in the browser", "operation": "press", "keys": ["command", "l"] }}, + {{ "decision": "Writing the hacker news URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, + {{ "decision": "Pressing enter to go to the URL", "operation": "press", "keys": ["enter"] }} +] + +A few important notes: + +- Go to Google Docs and Google Sheets by typing in the Chrome Address bar +- Remember you only have those 4 operations available to you. +- Don't respond saying you're unable to assist with requests. 
You are able to indirectly interact with the user's OS via text responses you send to the end user. + +Objective: {objective} # take the next best action for this objective +""" + + # ------------------------- # VISION PROMPT # ------------------------- @@ -69,52 +116,6 @@ """ -SYSTEM_PROMPT = """ -You are operating a computer, using the same operating system as a human. - -From looking at the screen, the objective, and your previous actions, take the next best series of action. - -You have 4 possible operation actions available to you which you use in the `pyautogui` library. Your output should always be valid `json` because it will be used in `json.loads` - -1. mouse - Move mouse and click -[{{ "decision": "information about the decision", "operation": "mouse", "x": "percent", "y": "percent" }}] # 'percent' refers to the percentage of the screen's dimensions - -2. write - Write with your keyboard -[{{ "decision": "information about the decision", "operation": "write", "content": "text to write here" }}] - -3. press - Use a hotkey or press key to operate the computer -[{{ "decision": "information about the decision", "operation": "press", "keys": ["keys to use"] }}] - -4. done - The objective is completed -[{{ "decision": "information about the decision", "operation": "done", "summary": "summary of what was completed" }}] - -Return the actions in array format `[]`. You can take just one action or multiple actions. - -Here are some helpful combinations: - -# Opens Spotlight Search on Mac -[ - {{ "decision": "Opening OS search to look for Google Chrome", "operation": "press", "keys": ["command", "space"] }}, - {{ "decision": "Writing 'Google Chrome' to search the computer for it", "operation": "write", "content": "Google Chrome" }}, - {{ "decision": "Pressing enter to open Chrome", "operation": "press", "keys": ["enter"] }} -] - -# Focuses on the address bar in a browser before typing a website -[ - {{ "decision": "Focusing on the address bar in the browser", "operation": "press", "keys": ["command", "l"] }}, - {{ "decision": "Writing the hacker news URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, - {{ "decision": "Pressing enter to go to the URL", "operation": "press", "keys": ["enter"] }} -] - -A few important notes: - -- Go to Google Docs and Google Sheets by typing in the Chrome Address bar -- Remember you only have those 4 operations available to you. -- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. - -Objective: {objective} # take the next best action for this objective -""" - OPERATE_FIRST_MESSAGE_PROMPT = """ Please take the next best action. Remember you only have the following 4 operations available: mouse, write, press, done diff --git a/operate/utils/misc.py b/operate/utils/misc.py index c7544ae..18bbeff 100644 --- a/operate/utils/misc.py +++ b/operate/utils/misc.py @@ -2,7 +2,7 @@ import re -def convert_percent_to_decimal(percent_str): +def convert_percent_to_decimal(percent): """ Converts a percentage string to a decimal value. 
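# With the new SYSTEM_PROMPT the model returns x/y screen coordinates as bare
# numbers rather than "NN%" strings (patch 23 makes the decimal form explicit
# with "e.g. 0.10"), so the hunk below drops both the "%"-stripping and the
# divide-by-100. A minimal before/after sketch of the changed contract
# (the values are illustrative, not taken from the diff):
#
#     convert_percent_to_decimal("20%")   # old behavior -> 0.2
#     convert_percent_to_decimal("0.20")  # new behavior -> 0.2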
@@ -21,37 +21,12 @@ def convert_percent_to_decimal(percent_str): """ try: # Remove the '%' sign and convert to float - decimal_value = float(percent_str.strip("%")) + decimal_value = float(percent) # Convert to decimal (e.g., 20% -> 0.20) - return decimal_value / 100 + return decimal_value except ValueError as e: - print(f"Error converting percent to decimal: {e}") - return None - - -def extract_json_from_string(s): - """ - Extracts a JSON structure from a string and returns it as a dictionary. - - Args: - s (str): The input string. - - Returns: - dict: The extracted JSON structure as a dictionary, or None if no JSON structure is found or if there is an error parsing the JSON. - - """ - try: - # Find the start of the JSON structure - json_start = s.find("{") - if json_start == -1: - return None - - # Extract the JSON part and convert it to a dictionary - json_str = s[json_start:] - return json.loads(json_str) - except Exception as e: - print(f"Error parsing JSON: {e}") + print(f"[convert_percent_to_decimal] error: {e}") return None diff --git a/operate/utils/os.py b/operate/utils/os.py index c6c2284..800b6ba 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -47,8 +47,10 @@ def mouse(click_detail): str: The description of the click if successful, otherwise "We failed to click". """ try: - x = convert_percent_to_decimal(click_detail["x"]) - y = convert_percent_to_decimal(click_detail["y"]) + print("[mouse]") + print("[mouse] click_detail", click_detail) + x = convert_percent_to_decimal(click_detail.get("x")) + y = convert_percent_to_decimal(click_detail.get("y")) if click_detail and isinstance(x, float) and isinstance(y, float): click_at_percentage(x, y) @@ -57,7 +59,7 @@ def mouse(click_detail): return "We failed to click" except Exception as e: - print(f"Error parsing JSON: {e}") + print(f"[mouse] error {e}") return "We failed to click" From b2a63dec95c6a206f677c6204240c4d30a266945 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 15:18:13 -0800 Subject: [PATCH 22/65] Move `get_last_assistant_message` --- operate/actions.py | 15 ++++++++++++++- operate/utils/misc.py | 35 ----------------------------------- operate/utils/os.py | 14 -------------- 3 files changed, 14 insertions(+), 50 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 3054843..6297e8f 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -17,7 +17,6 @@ from operate.utils.screenshot import ( capture_screen_with_cursor, ) -from operate.utils.os import get_last_assistant_message from operate.prompts import ( format_vision_prompt, format_summary_prompt, @@ -399,3 +398,17 @@ async def fetch_openai_response_async(messages): url, headers=headers, data=json.dumps(data) ) as response: return await response.json() + + +def get_last_assistant_message(messages): + """ + Retrieve the last message from the assistant in the messages array. + If the last assistant message is the first message in the array, return None. + """ + for index in reversed(range(len(messages))): + if messages[index]["role"] == "assistant": + if index == 0: # Check if the assistant message is the first in the array + return None + else: + return messages[index] + return None # Return None if no assistant message is found diff --git a/operate/utils/misc.py b/operate/utils/misc.py index 18bbeff..6fb1f17 100644 --- a/operate/utils/misc.py +++ b/operate/utils/misc.py @@ -3,22 +3,6 @@ def convert_percent_to_decimal(percent): - """ - Converts a percentage string to a decimal value. 
- - Args: - percent_str (str): The percentage string to be converted. - - Returns: - float: The decimal value equivalent to the percentage. - - Raises: - ValueError: If the input string cannot be converted to a float. - - Example: - >>> convert_percent_to_decimal("20%") - 0.2 - """ try: # Remove the '%' sign and convert to float decimal_value = float(percent) @@ -31,25 +15,6 @@ def convert_percent_to_decimal(percent): def parse_operations(response): - """ - Parses the given response and returns a dictionary with the type and data. - - Args: - response (str): The response to parse. - - Returns: - dict: A dictionary with the type and data extracted from the response. - The dictionary has the following structure: - { - "type": , - "data": - } - If the response is "DONE", the type is "DONE" and the data is None. - If the response starts with "CLICK", the type is "CLICK" and the data is a JSON object. - If the response starts with "TYPE", the type is "TYPE" and the data is the text to type. - If the response starts with "SEARCH", the type is "SEARCH" and the data is the search query. - If the response doesn't match any of the above patterns, the type is "UNKNOWN" and the data is the original response. - """ if response == "DONE": return {"type": "DONE", "data": None} elif response.startswith("CLICK"): diff --git a/operate/utils/os.py b/operate/utils/os.py index 800b6ba..d5f21bb 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -100,17 +100,3 @@ def click_at_percentage( # Finally, click pyautogui.click(x_pixel, y_pixel) return "Successfully clicked" - - -def get_last_assistant_message(messages): - """ - Retrieve the last message from the assistant in the messages array. - If the last assistant message is the first message in the array, return None. - """ - for index in reversed(range(len(messages))): - if messages[index]["role"] == "assistant": - if index == 0: # Check if the assistant message is the first in the array - return None - else: - return messages[index] - return None # Return None if no assistant message is found From cab5912f61d8d3366742658335e2e9d9f2571caa Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 16:00:24 -0800 Subject: [PATCH 23/65] `SYSTEM_PROMPT` updates --- operate/prompts.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/operate/prompts.py b/operate/prompts.py index 6c03b91..39204ed 100644 --- a/operate/prompts.py +++ b/operate/prompts.py @@ -15,33 +15,33 @@ You have 4 possible operation actions available to you which you use in the `pyautogui` library. Your output should always be valid `json` because it will be used in `json.loads` 1. mouse - Move mouse and click -[{{ "decision": "information about the decision", "operation": "mouse", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format +[{{ "decision": "information about the decision", "thought": "a thought", "operation": "mouse", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format 2. write - Write with your keyboard -[{{ "decision": "information about the decision", "operation": "write", "content": "text to write here" }}] +[{{ "decision": "information about the decision", "thought": "a thought", "operation": "write", "content": "text to write here" }}] 3. 
press - Use a hotkey or press key to operate the computer -[{{ "decision": "information about the decision", "operation": "press", "keys": ["keys to use"] }}] +[{{ "decision": "information about the decision", "thought": "a thought", "operation": "press", "keys": ["keys to use"] }}] 4. done - The objective is completed -[{{ "decision": "information about the decision", "operation": "done", "summary": "summary of what was completed" }}] +[{{ "decision": "information about the decision", "thought": "a thought", "operation": "done", "summary": "summary of what was completed" }}] Return the actions in array format `[]`. You can take just one action or multiple actions. Here are some helpful combinations: -# Opens Spotlight Search on Mac +# Opens Spotlight Search on Mac (leaving ... for conciseness in examples) [ - {{ "decision": "Opening OS search to look for Google Chrome", "operation": "press", "keys": ["command", "space"] }}, - {{ "decision": "Writing 'Google Chrome' to search the computer for it", "operation": "write", "content": "Google Chrome" }}, - {{ "decision": "Pressing enter to open Chrome", "operation": "press", "keys": ["enter"] }} + {{ "decision": "Opening OS search to look for Google Chrome", "thought": "It appears I am in terminal, this is the right next step", "operation": "press", "keys": ["command", "space"] }}, + {{ "decision": "...", "thought": "...", "operation": "write", "content": "Google Chrome" }}, + {{ "decision": "...", "thought": "...", "operation": "press", "keys": ["enter"] }} ] -# Focuses on the address bar in a browser before typing a website +# Focuses on the address bar in a browser before typing a website (leaving ... for conciseness in examples) [ - {{ "decision": "Focusing on the address bar in the browser", "operation": "press", "keys": ["command", "l"] }}, - {{ "decision": "Writing the hacker news URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, - {{ "decision": "Pressing enter to go to the URL", "operation": "press", "keys": ["enter"] }} + {{ "decision": "Focusing on the address bar in the browser", "thought": "I can see the browser is open already so this should be safe to try", "operation": "press", "keys": ["command", "l"] }}, + {{ "decision": "...", "thought": "...", "operation": "write", "content": "https://news.ycombinator.com/" }}, + {{ "decision": "...", "thought": "...", "operation": "press", "keys": ["enter"] }} ] A few important notes: From d19a376ed4c81c5e0d53383cb0e7e0058e3e6464 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 12 Jan 2024 16:00:34 -0800 Subject: [PATCH 24/65] Create `OperatingSystem` class --- operate/actions.py | 2 +- operate/dialog.py | 11 ++-- operate/utils/os.py | 149 +++++++++++++++++--------------------------- 3 files changed, 64 insertions(+), 98 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 6297e8f..91dee96 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -97,7 +97,7 @@ def call_gpt_4_v(messages): Get the next action for Self-Operating Computer """ print("[call_gpt_4_v]") - + time.sleep(1) try: screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): diff --git a/operate/dialog.py b/operate/dialog.py index d4d597a..854c8eb 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -16,11 +16,12 @@ ANSI_BRIGHT_MAGENTA, style, ) -from operate.utils.os import keyboard, mouse, press +from operate.utils.os import OperatingSystem from operate.actions import get_next_action # Load configuration config = Config() +operating_system = 
OperatingSystem() def main(model, terminal_prompt, voice_mode=False): @@ -56,7 +57,7 @@ def main(model, terminal_prompt, voice_mode=False): if not terminal_prompt: message_dialog( title="Self-Operating Computer", - text="Ask a computer to do anything! This is an experimental framework to enable multimodal models to operate computers", + text="An experimental framework to enable multimodal models to operate computers", style=style, ).run() else: @@ -133,15 +134,15 @@ def execute_operations(operations): if operate_type == "press" or operate_type == "hotkey": keys = operate.get("keys") - function_response = press(keys) + function_response = operating_system.press(keys) elif operate_type == "write": content = operate.get("content") - function_response = keyboard(content) + function_response = operating_system.write(content) elif operate_type == "mouse": x = operate.get("x") y = operate.get("y") click_detail = {"x": x, "y": y} - function_response = mouse(click_detail) + function_response = operating_system.mouse(click_detail) else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" diff --git a/operate/utils/os.py b/operate/utils/os.py index d5f21bb..5c240e1 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -6,97 +6,62 @@ from operate.utils.misc import convert_percent_to_decimal -def keyboard(content): - """ - Types the given text using the keyboard. - - Args: - text (str): The text to be typed. - - Returns: - str: A message indicating the typed text. - """ - content = content.replace("\\n", "\n") - for char in content: - pyautogui.write(char) - - return "Type: " + content - - -def press(keys): - print("[hotkey] ") - print("[hotkey] keys", keys) - for key in keys: - print("[hotkey] keydown", key) - pyautogui.keyDown(key) - time.sleep(0.1) - for key in keys: - print("[hotkey] keyup", key) - pyautogui.keyUp(key) - return True - - -def mouse(click_detail): - """ - Perform a mouse click at the specified coordinates. - - Args: - click_detail (dict): A dictionary containing the coordinates of the click. - - Returns: - str: The description of the click if successful, otherwise "We failed to click". 
- """ - try: - print("[mouse]") - print("[mouse] click_detail", click_detail) - x = convert_percent_to_decimal(click_detail.get("x")) - y = convert_percent_to_decimal(click_detail.get("y")) - - if click_detail and isinstance(x, float) and isinstance(y, float): - click_at_percentage(x, y) - return "we clicked successfully" - else: +class OperatingSystem: + def write(self, content): + content = content.replace("\\n", "\n") + for char in content: + pyautogui.write(char) + return "Type: " + content + + def press(self, keys): + print("[hotkey] ") + print("[hotkey] keys", keys) + for key in keys: + print("[hotkey] keydown", key) + pyautogui.keyDown(key) + time.sleep(0.1) + for key in keys: + print("[hotkey] keyup", key) + pyautogui.keyUp(key) + return True + + def mouse(self, click_detail): + try: + print("[mouse]") + print("[mouse] click_detail", click_detail) + x = convert_percent_to_decimal(click_detail.get("x")) + y = convert_percent_to_decimal(click_detail.get("y")) + + if click_detail and isinstance(x, float) and isinstance(y, float): + self._click_at_percentage(x, y) + return "we clicked successfully" + else: + return "We failed to click" + + except Exception as e: + print(f"[mouse] error {e}") return "We failed to click" - except Exception as e: - print(f"[mouse] error {e}") - return "We failed to click" - - -def click_at_percentage( - x_percentage, y_percentage, duration=0.2, circle_radius=50, circle_duration=0.5 -): - """ - Moves the mouse cursor to a specified percentage of the screen and performs a circular movement before clicking. - - Args: - x_percentage (float): The x-coordinate percentage of the screen to move the cursor to. - y_percentage (float): The y-coordinate percentage of the screen to move the cursor to. - duration (float, optional): The duration (in seconds) of the smooth cursor movement. Defaults to 0.2. - circle_radius (int, optional): The radius of the circular movement. Defaults to 50. - circle_duration (float, optional): The duration (in seconds) of the circular movement. Defaults to 0.5. - - Returns: - str: A message indicating that the click was successful. 
- """ - # Get the size of the primary monitor - screen_width, screen_height = pyautogui.size() - - # Calculate the x and y coordinates in pixels - x_pixel = int(screen_width * float(x_percentage)) - y_pixel = int(screen_height * float(y_percentage)) - - # Move to the position smoothly - pyautogui.moveTo(x_pixel, y_pixel, duration=duration) - - # Circular movement - start_time = time.time() - while time.time() - start_time < circle_duration: - angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi - x = x_pixel + math.cos(angle) * circle_radius - y = y_pixel + math.sin(angle) * circle_radius - pyautogui.moveTo(x, y, duration=0.1) - - # Finally, click - pyautogui.click(x_pixel, y_pixel) - return "Successfully clicked" + def _click_at_percentage( + self, + x_percentage, + y_percentage, + duration=0.2, + circle_radius=50, + circle_duration=0.5, + ): + screen_width, screen_height = pyautogui.size() + x_pixel = int(screen_width * float(x_percentage)) + y_pixel = int(screen_height * float(y_percentage)) + + pyautogui.moveTo(x_pixel, y_pixel, duration=duration) + + start_time = time.time() + while time.time() - start_time < circle_duration: + angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi + x = x_pixel + math.cos(angle) * circle_radius + y = y_pixel + math.sin(angle) * circle_radius + pyautogui.moveTo(x, y, duration=0.1) + + pyautogui.click(x_pixel, y_pixel) + return "Successfully clicked" From 6a1666c305d00607c38b2b698d2905b99b6d29c9 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 06:07:10 -0800 Subject: [PATCH 25/65] Update `operating` --- operate/actions.py | 4 ++-- operate/prompts.py | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/operate/actions.py b/operate/actions.py index 91dee96..4dfe458 100644 --- a/operate/actions.py +++ b/operate/actions.py @@ -132,7 +132,7 @@ def call_gpt_4_v(messages): presence_penalty=1, frequency_penalty=1, temperature=0.7, - max_tokens=300, + max_tokens=1000, ) content = response.choices[0].message.content @@ -143,7 +143,7 @@ def call_gpt_4_v(messages): content = content[: -len("```")] # Remove ending assistant_message = {"role": "assistant", "content": content} - print("[call_gpt_4_v] message.append(assistant_message)", assistant_message) + print("[call_gpt_4_v] content", content) messages.append(assistant_message) content = json.loads(content) diff --git a/operate/prompts.py b/operate/prompts.py index 39204ed..0307755 100644 --- a/operate/prompts.py +++ b/operate/prompts.py @@ -15,33 +15,33 @@ You have 4 possible operation actions available to you which you use in the `pyautogui` library. Your output should always be valid `json` because it will be used in `json.loads` 1. mouse - Move mouse and click -[{{ "decision": "information about the decision", "thought": "a thought", "operation": "mouse", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format +[{{ "thought": "write a thought here", "operation": "mouse", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format 2. write - Write with your keyboard -[{{ "decision": "information about the decision", "thought": "a thought", "operation": "write", "content": "text to write here" }}] +[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] 3. 
press - Use a hotkey or press key to operate the computer
-[{{ "decision": "information about the decision", "thought": "a thought", "operation": "press", "keys": ["keys to use"] }}]
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
 
 4. done - The objective is completed
-[{{ "decision": "information about the decision", "thought": "a thought", "operation": "done", "summary": "summary of what was completed" }}]
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
 
 Return the actions in array format `[]`. You can take just one action or multiple actions.
 
 Here are some helpful combinations:
 
-# Opens Spotlight Search on Mac (leaving ... for conciseness in examples)
+# Opens Spotlight Search on Mac
 [
-    {{ "decision": "Opening OS search to look for Google Chrome", "thought": "It appears I am in terminal, this is the right next step", "operation": "press", "keys": ["command", "space"] }},
-    {{ "decision": "...", "thought": "...", "operation": "write", "content": "Google Chrome" }},
-    {{ "decision": "...", "thought": "...", "operation": "press", "keys": ["enter"] }}
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
 ]
 
-# Focuses on the address bar in a browser before typing a website (leaving ... for conciseness in examples)
+# Focuses on the address bar in a browser before typing a website
 [
-    {{ "decision": "Focusing on the address bar in the browser", "thought": "I can see the browser is open already so this should be safe to try", "operation": "press", "keys": ["command", "l"] }},
-    {{ "decision": "...", "thought": "...", "operation": "write", "content": "https://news.ycombinator.com/" }},
-    {{ "decision": "...", "thought": "...", "operation": "press", "keys": ["enter"] }}
+    {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["command", "l"] }},
+    {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
+    {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
 ]
 
 A few important notes:

From d19a376ed4c81c5e0d53383cb0e7e0058e3e6464 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sat, 13 Jan 2024 06:10:37 -0800
Subject: [PATCH 26/65] Name updates, `operate()`, etc.
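
The `operate()` loop consumes the JSON array of operations that the system
prompt above requests from the model. A minimal sketch of that round trip,
assuming the `OperatingSystem` instance from patch 24 and a hand-written
model response (the string below is illustrative, not captured output):

    import json

    operations = json.loads(
        '[{"thought": "Focus the address bar", "operation": "press",'
        ' "keys": ["command", "l"]},'
        ' {"thought": "Type the URL", "operation": "write",'
        ' "content": "https://news.ycombinator.com/"}]'
    )
    for operation in operations:
        operate_type = operation.get("operation").lower()
        if operate_type == "press" or operate_type == "hotkey":
            operating_system.press(operation.get("keys"))
        elif operate_type == "write":
            operating_system.write(operation.get("content"))
        elif operate_type == "mouse":
            operating_system.mouse({"x": operation.get("x"), "y": operation.get("y")})
        elif operate_type == "done":
            break  # objective reported complete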
--- operate/dialog.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/operate/dialog.py b/operate/dialog.py index 854c8eb..4fd4a23 100644 --- a/operate/dialog.py +++ b/operate/dialog.py @@ -99,7 +99,6 @@ def main(model, terminal_prompt, voice_mode=False): operations, session_id = asyncio.run( get_next_action(model, messages, objective, session_id) ) - print("[loop] operations", operations) except ModelNotRecognizedException as e: print( @@ -112,7 +111,7 @@ def main(model, terminal_prompt, voice_mode=False): ) break - stop = execute_operations(operations) + stop = operate(operations) if stop: break @@ -121,26 +120,26 @@ def main(model, terminal_prompt, voice_mode=False): break -def execute_operations(operations): - for operate in operations: +def operate(operations): + for operation in operations: # wait one second time.sleep(1) - print("[execute_operations] operate", operate) - operate_type = operate.get("operation").lower() + print("[execute_operations] operation", operation) + operate_type = operation.get("operation").lower() # print print("[execute_operations] operation_type", operate_type) # function_response = "" if operate_type == "press" or operate_type == "hotkey": - keys = operate.get("keys") + keys = operation.get("keys") function_response = operating_system.press(keys) elif operate_type == "write": - content = operate.get("content") + content = operation.get("content") function_response = operating_system.write(content) elif operate_type == "mouse": - x = operate.get("x") - y = operate.get("y") + x = operation.get("x") + y = operation.get("y") click_detail = {"x": x, "y": y} function_response = operating_system.mouse(click_detail) else: @@ -148,7 +147,7 @@ def execute_operations(operations): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" ) print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{operate}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{operation}" ) return True From e5df2b761b18281307461c80ee60cffd4b0b2ea7 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 06:15:32 -0800 Subject: [PATCH 27/65] Add `operation.get("summmary")` --- operate/main.py | 2 +- operate/{dialog.py => operate.py} | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) rename operate/{dialog.py => operate.py} (93%) diff --git a/operate/main.py b/operate/main.py index 8b2df0c..3cf991d 100644 --- a/operate/main.py +++ b/operate/main.py @@ -3,7 +3,7 @@ """ import argparse from operate.utils.style import ANSI_BRIGHT_MAGENTA -from operate.dialog import main +from operate.operate import main def main_entry(): diff --git a/operate/dialog.py b/operate/operate.py similarity index 93% rename from operate/dialog.py rename to operate/operate.py index 4fd4a23..9e82618 100644 --- a/operate/dialog.py +++ b/operate/operate.py @@ -142,6 +142,16 @@ def operate(operations): y = operation.get("y") click_detail = {"x": x, "y": y} function_response = operating_system.mouse(click_detail) + elif operate_type == "done": + summary = operation.get("summmary") + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" + ) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}" + ) + return True + else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" From 196e0917979adf965e47409b9451cd113fc40409 Mon 
Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 06:15:49 -0800 Subject: [PATCH 28/65] Add `ANSI_BLUE` --- operate/operate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/operate/operate.py b/operate/operate.py index 9e82618..1f1554f 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -14,6 +14,7 @@ ANSI_YELLOW, ANSI_RED, ANSI_BRIGHT_MAGENTA, + ANSI_BLUE, style, ) from operate.utils.os import OperatingSystem From 6624da5eab9f803c2f6e9fbfddcf7dfc251fc6ba Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 06:33:41 -0800 Subject: [PATCH 29/65] Update some file names, add `get_user_prompt` --- operate/{actions.py => models/apis.py} | 10 ++++-- operate/{ => models}/prompts.py | 0 operate/{model => models}/weights/best.pt | Bin operate/operate.py | 42 +++++++++++++++------- 4 files changed, 37 insertions(+), 15 deletions(-) rename operate/{actions.py => models/apis.py} (97%) rename operate/{ => models}/prompts.py (100%) rename operate/{model => models}/weights/best.pt (100%) diff --git a/operate/actions.py b/operate/models/apis.py similarity index 97% rename from operate/actions.py rename to operate/models/apis.py index 4dfe458..c2f4c77 100644 --- a/operate/actions.py +++ b/operate/models/apis.py @@ -17,12 +17,13 @@ from operate.utils.screenshot import ( capture_screen_with_cursor, ) -from operate.prompts import ( +from operate.models.prompts import ( format_vision_prompt, format_summary_prompt, format_decision_prompt, format_label_prompt, get_user_first_message_prompt, + get_user_prompt, ) @@ -44,7 +45,7 @@ client = config.initialize_openai_client() -yolo_model = YOLO("./operate/model/weights/best.pt") # Load your trained model +yolo_model = YOLO("./operate/models/weights/best.pt") # Load your trained model async def get_next_action(model, messages, objective, session_id): @@ -110,7 +111,10 @@ def call_gpt_4_v(messages): with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - user_prompt = get_user_first_message_prompt() + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() print("[call_gpt_4_v] user_message", user_prompt) diff --git a/operate/prompts.py b/operate/models/prompts.py similarity index 100% rename from operate/prompts.py rename to operate/models/prompts.py diff --git a/operate/model/weights/best.pt b/operate/models/weights/best.pt similarity index 100% rename from operate/model/weights/best.pt rename to operate/models/weights/best.pt diff --git a/operate/operate.py b/operate/operate.py index 1f1554f..57d47ce 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -6,7 +6,7 @@ from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt from operate.exceptions import ModelNotRecognizedException -from operate.prompts import USER_QUESTION, get_system_prompt +from operate.models.prompts import USER_QUESTION, get_system_prompt from operate.settings import Config from operate.utils.style import ( ANSI_GREEN, @@ -18,12 +18,14 @@ style, ) from operate.utils.os import OperatingSystem -from operate.actions import get_next_action +from operate.models.apis import get_next_action # Load configuration config = Config() operating_system = OperatingSystem() +VERBOSE = True + def main(model, terminal_prompt, voice_mode=False): """ @@ -62,9 +64,9 @@ def main(model, terminal_prompt, voice_mode=False): style=style, ).run() else: - print("Running direct prompt...") + if VERBOSE: + print("Running direct prompt...") 
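# The VERBOSE flag gating these prints is still a module-level constant here
# (patch 29 adds "VERBOSE = True" near the top of operate/operate.py); patch
# 32 below moves it onto the config object. A minimal sketch of that later
# wiring, with the import assumed from operate/settings.py:
#
#     from operate.settings import Config
#     config = Config()
#     VERBOSE = config.verbose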
- print("SYSTEM", platform.system()) # Clear the console if platform.system() == "Windows": os.system("cls") @@ -96,11 +98,21 @@ def main(model, terminal_prompt, voice_mode=False): session_id = None while True: + if VERBOSE: + print(f"[Self Operating Computer]") + print(f"[Self Operating Computer] loop_count", loop_count) try: operations, session_id = asyncio.run( get_next_action(model, messages, objective, session_id) ) + stop = operate(operations) + if stop: + break + + loop_count += 1 + if loop_count > 3: + break except ModelNotRecognizedException as e: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" @@ -112,21 +124,18 @@ def main(model, terminal_prompt, voice_mode=False): ) break - stop = operate(operations) - if stop: - break - - loop_count += 1 - if loop_count > 3: - break - def operate(operations): + if VERBOSE: + print(f"[Self Operating Computer][operate]") for operation in operations: # wait one second time.sleep(1) print("[execute_operations] operation", operation) operate_type = operation.get("operation").lower() + if VERBOSE: + print(f"[Self Operating Computer][operate] operation", operation) + print(f"[Self Operating Computer][operate] operate_type", operate_type) # print print("[execute_operations] operation_type", operate_type) @@ -134,17 +143,26 @@ def operate(operations): if operate_type == "press" or operate_type == "hotkey": keys = operation.get("keys") + if VERBOSE: + print(f"[Self Operating Computer][operate] keys", keys) + function_response = operating_system.press(keys) elif operate_type == "write": content = operation.get("content") + if VERBOSE: + print(f"[Self Operating Computer][operate] content", content) function_response = operating_system.write(content) elif operate_type == "mouse": x = operation.get("x") y = operation.get("y") click_detail = {"x": x, "y": y} + if VERBOSE: + print(f"[Self Operating Computer][operate] click_detail", click_detail) function_response = operating_system.mouse(click_detail) elif operate_type == "done": summary = operation.get("summmary") + if VERBOSE: + print(f"[Self Operating Computer][operate] summary", summary) print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" ) From 35a0047be4885a1e7a908228f93d213202a6fa4f Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 06:36:13 -0800 Subject: [PATCH 30/65] Remove `monitor_size`, no longer used --- operate/models/prompts.py | 5 ----- operate/settings.py | 5 ----- operate/utils/screenshot.py | 7 ------- 3 files changed, 17 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 0307755..a708c9b 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -1,8 +1,3 @@ -from operate.settings import Config - -config = Config() -monitor_size = config.monitor_size - # General user Prompts USER_QUESTION = "Hello, I can help you with anything. What would you like done?" diff --git a/operate/settings.py b/operate/settings.py index 61b52fd..48e8df0 100644 --- a/operate/settings.py +++ b/operate/settings.py @@ -11,7 +11,6 @@ class Config: debug (bool): Flag indicating whether debug mode is enabled. openai_api_key (str): API key for OpenAI. google_api_key (str): API key for Google. - monitor_size (dict): Dictionary containing the width and height of the monitor. 
""" def __init__(self): @@ -19,10 +18,6 @@ def __init__(self): self.debug = False self.openai_api_key = os.getenv("OPENAI_API_KEY") self.google_api_key = os.getenv("GOOGLE_API_KEY") - self.monitor_size = { - "width": 1920, - "height": 1080, - } def initialize_openai_client(self): """ diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py index c9c3ac2..8ee9227 100644 --- a/operate/utils/screenshot.py +++ b/operate/utils/screenshot.py @@ -6,11 +6,6 @@ import Xlib.display import Xlib.X import Xlib.Xutil # not sure if Xutil is necessary -from operate.settings import Config - -# Load configuration -config = Config() -monitor_size = config.monitor_size def capture_screen_with_cursor(file_path): @@ -35,8 +30,6 @@ def capture_screen_with_cursor(file_path): # Use xlib to prevent scrot dependency for Linux screen = Xlib.display.Display().screen() size = screen.width_in_pixels, screen.height_in_pixels - monitor_size["width"] = size[0] - monitor_size["height"] = size[1] screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1])) screenshot.save(file_path) elif user_platform == "Darwin": # (Mac OS) From 1444c1a0c01e698e1653fce36ec0acd55b033bd6 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 06:41:47 -0800 Subject: [PATCH 31/65] Add missing `__init__.py` --- operate/models/__init__.py | 0 operate/operate.py | 12 ++---------- 2 files changed, 2 insertions(+), 10 deletions(-) create mode 100644 operate/models/__init__.py diff --git a/operate/models/__init__.py b/operate/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/operate/operate.py b/operate/operate.py index 57d47ce..d423914 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -6,6 +6,8 @@ from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt from operate.exceptions import ModelNotRecognizedException + +# from operate.models.prompts import USER_QUESTION, get_system_prompt from operate.models.prompts import USER_QUESTION, get_system_prompt from operate.settings import Config from operate.utils.style import ( @@ -131,16 +133,11 @@ def operate(operations): for operation in operations: # wait one second time.sleep(1) - print("[execute_operations] operation", operation) operate_type = operation.get("operation").lower() if VERBOSE: print(f"[Self Operating Computer][operate] operation", operation) print(f"[Self Operating Computer][operate] operate_type", operate_type) - # print - print("[execute_operations] operation_type", operate_type) - # function_response = "" - if operate_type == "press" or operate_type == "hotkey": keys = operation.get("keys") if VERBOSE: @@ -184,11 +181,6 @@ def operate(operations): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operate_type} COMPLETE {ANSI_RESET}{function_response}" ) - # message = { - # "role": "assistant", - # "content": function_response, - # } - # messages.append(message) return False From b31c39cad4de33d3b532884b6d3149b4cb84b7c8 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 06:55:46 -0800 Subject: [PATCH 32/65] Add `config.verbose` and better `print` --- operate/models/apis.py | 21 +++++++++++++------ operate/operate.py | 41 +++++++++++++++++++------------------ operate/settings.py | 2 +- operate/utils/os.py | 6 ------ operate/utils/screenshot.py | 12 ----------- 5 files changed, 37 insertions(+), 45 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index c2f4c77..0437161 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -42,6 
+42,7 @@ # Load configuration config = Config() +VERBOSE = config.verbose client = config.initialize_openai_client() @@ -49,7 +50,8 @@ async def get_next_action(model, messages, objective, session_id): - print("[get_next_action]") + if VERBOSE: + print("[Self Operating Computer][get_next_action]") if model == "gpt-4": return call_gpt_4_v(messages), None if model == "gpt-4-with-som": @@ -57,8 +59,6 @@ async def get_next_action(model, messages, objective, session_id): return [operation], None elif model == "agent-1": operation, session_id = call_agent_1(session_id, objective) - print("[get_next_action] returning operation", operation) - print("[get_next_action] returning session_id", session_id) return operation, session_id elif model == "gemini-pro-vision": return [call_gemini_pro_vision(messages, objective)], None @@ -97,7 +97,8 @@ def call_gpt_4_v(messages): """ Get the next action for Self-Operating Computer """ - print("[call_gpt_4_v]") + if VERBOSE: + print("[Self Operating Computer][get_next_action][call_gpt_4_v]") time.sleep(1) try: screenshots_dir = "screenshots" @@ -116,7 +117,11 @@ def call_gpt_4_v(messages): else: user_prompt = get_user_prompt() - print("[call_gpt_4_v] user_message", user_prompt) + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_v] user_prompt", + user_prompt, + ) vision_message = { "role": "user", @@ -147,7 +152,11 @@ def call_gpt_4_v(messages): content = content[: -len("```")] # Remove ending assistant_message = {"role": "assistant", "content": content} - print("[call_gpt_4_v] content", content) + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_v] content", + content, + ) messages.append(assistant_message) content = json.loads(content) diff --git a/operate/operate.py b/operate/operate.py index d423914..c2ee0b2 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -26,7 +26,7 @@ config = Config() operating_system = OperatingSystem() -VERBOSE = True +VERBOSE = config.verbose def main(model, terminal_prompt, voice_mode=False): @@ -101,8 +101,8 @@ def main(model, terminal_prompt, voice_mode=False): while True: if VERBOSE: - print(f"[Self Operating Computer]") - print(f"[Self Operating Computer] loop_count", loop_count) + print("[Self Operating Computer]") + print("[Self Operating Computer] loop_count", loop_count) try: operations, session_id = asyncio.run( get_next_action(model, messages, objective, session_id) @@ -129,37 +129,35 @@ def main(model, terminal_prompt, voice_mode=False): def operate(operations): if VERBOSE: - print(f"[Self Operating Computer][operate]") + print("[Self Operating Computer][operate]") for operation in operations: # wait one second time.sleep(1) operate_type = operation.get("operation").lower() + operate_thought = operation.get("thought") + operate_detail = "" if VERBOSE: - print(f"[Self Operating Computer][operate] operation", operation) - print(f"[Self Operating Computer][operate] operate_type", operate_type) + print("[Self Operating Computer][operate] operation", operation) + print("[Self Operating Computer][operate] operate_type", operate_type) if operate_type == "press" or operate_type == "hotkey": keys = operation.get("keys") - if VERBOSE: - print(f"[Self Operating Computer][operate] keys", keys) - - function_response = operating_system.press(keys) + operate_detail = keys + operating_system.press(keys) elif operate_type == "write": content = operation.get("content") - if VERBOSE: - print(f"[Self Operating Computer][operate] content", content) - function_response = 
operating_system.write(content) + operate_detail = content + operating_system.write(content) elif operate_type == "mouse": x = operation.get("x") y = operation.get("y") click_detail = {"x": x, "y": y} - if VERBOSE: - print(f"[Self Operating Computer][operate] click_detail", click_detail) - function_response = operating_system.mouse(click_detail) + operate_detail = click_detail + + operating_system.mouse(click_detail) elif operate_type == "done": - summary = operation.get("summmary") - if VERBOSE: - print(f"[Self Operating Computer][operate] summary", summary) + summary = operation.get("summary") + print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" ) @@ -178,7 +176,10 @@ def operate(operations): return True print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {operate_type} COMPLETE {ANSI_RESET}{function_response}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Operate] Thought {ANSI_RESET} {operate_thought}" + ) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Operate] {operate_type} {ANSI_RESET} {operate_detail}" ) return False diff --git a/operate/settings.py b/operate/settings.py index 48e8df0..0bccf55 100644 --- a/operate/settings.py +++ b/operate/settings.py @@ -15,7 +15,7 @@ class Config: def __init__(self): load_dotenv() - self.debug = False + self.verbose = True self.openai_api_key = os.getenv("OPENAI_API_KEY") self.google_api_key = os.getenv("GOOGLE_API_KEY") diff --git a/operate/utils/os.py b/operate/utils/os.py index 5c240e1..566591f 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -11,7 +11,6 @@ def write(self, content): content = content.replace("\\n", "\n") for char in content: pyautogui.write(char) - return "Type: " + content def press(self, keys): print("[hotkey] ") @@ -23,7 +22,6 @@ def press(self, keys): for key in keys: print("[hotkey] keyup", key) pyautogui.keyUp(key) - return True def mouse(self, click_detail): try: @@ -34,9 +32,6 @@ def mouse(self, click_detail): if click_detail and isinstance(x, float) and isinstance(y, float): self._click_at_percentage(x, y) - return "we clicked successfully" - else: - return "We failed to click" except Exception as e: print(f"[mouse] error {e}") @@ -64,4 +59,3 @@ def _click_at_percentage( pyautogui.moveTo(x, y, duration=0.1) pyautogui.click(x_pixel, y_pixel) - return "Successfully clicked" diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py index 8ee9227..597911a 100644 --- a/operate/utils/screenshot.py +++ b/operate/utils/screenshot.py @@ -9,18 +9,6 @@ def capture_screen_with_cursor(file_path): - """ - Capture the screen with the cursor and save it to the specified file path. - - Args: - file_path (str): The file path where the screenshot will be saved. 
- - Raises: - None - - Returns: - None - """ user_platform = platform.system() if user_platform == "Windows": From 27c62840d46d6bfb742b0286919ca71687978325 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 07:01:16 -0800 Subject: [PATCH 33/65] Increase loop max --- operate/operate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index c2ee0b2..d6f68b3 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -113,7 +113,7 @@ def main(model, terminal_prompt, voice_mode=False): break loop_count += 1 - if loop_count > 3: + if loop_count > 10: break except ModelNotRecognizedException as e: print( @@ -159,10 +159,10 @@ def operate(operations): summary = operation.get("summary") print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective Completed {ANSI_RESET}" ) print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary {ANSI_RESET}{summary}" ) return True @@ -171,7 +171,7 @@ def operate(operations): f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" ) print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{operation}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response {ANSI_RESET}{operation}" ) return True From 3fddff3e7e399f73cb945a810f0111c44bb2ace3 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 07:03:31 -0800 Subject: [PATCH 34/65] Add better error handling for `OperatingSystem` --- operate/utils/os.py | 60 +++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/operate/utils/os.py b/operate/utils/os.py index 566591f..6ba1436 100644 --- a/operate/utils/os.py +++ b/operate/utils/os.py @@ -8,36 +8,35 @@ class OperatingSystem: def write(self, content): - content = content.replace("\\n", "\n") - for char in content: - pyautogui.write(char) + try: + content = content.replace("\\n", "\n") + for char in content: + pyautogui.write(char) + except Exception as e: + print("[OperatingSystem][write] error:", e) def press(self, keys): - print("[hotkey] ") - print("[hotkey] keys", keys) - for key in keys: - print("[hotkey] keydown", key) - pyautogui.keyDown(key) - time.sleep(0.1) - for key in keys: - print("[hotkey] keyup", key) - pyautogui.keyUp(key) + try: + for key in keys: + pyautogui.keyDown(key) + time.sleep(0.1) + for key in keys: + pyautogui.keyUp(key) + except Exception as e: + print("[OperatingSystem][press] error:", e) def mouse(self, click_detail): try: - print("[mouse]") - print("[mouse] click_detail", click_detail) x = convert_percent_to_decimal(click_detail.get("x")) y = convert_percent_to_decimal(click_detail.get("y")) if click_detail and isinstance(x, float) and isinstance(y, float): - self._click_at_percentage(x, y) + self.click_at_percentage(x, y) except Exception as e: - print(f"[mouse] error {e}") - return "We failed to click" + print("[OperatingSystem][mouse] error:", e) - def _click_at_percentage( + def click_at_percentage( self, x_percentage, y_percentage, @@ -45,17 +44,20 @@ def _click_at_percentage( circle_radius=50, circle_duration=0.5, ): - screen_width, screen_height = pyautogui.size() - x_pixel = int(screen_width * float(x_percentage)) - y_pixel = int(screen_height * float(y_percentage)) + try: + screen_width, screen_height = 
pyautogui.size() + x_pixel = int(screen_width * float(x_percentage)) + y_pixel = int(screen_height * float(y_percentage)) - pyautogui.moveTo(x_pixel, y_pixel, duration=duration) + pyautogui.moveTo(x_pixel, y_pixel, duration=duration) - start_time = time.time() - while time.time() - start_time < circle_duration: - angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi - x = x_pixel + math.cos(angle) * circle_radius - y = y_pixel + math.sin(angle) * circle_radius - pyautogui.moveTo(x, y, duration=0.1) + start_time = time.time() + while time.time() - start_time < circle_duration: + angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi + x = x_pixel + math.cos(angle) * circle_radius + y = y_pixel + math.sin(angle) * circle_radius + pyautogui.moveTo(x, y, duration=0.1) - pyautogui.click(x_pixel, y_pixel) + pyautogui.click(x_pixel, y_pixel) + except Exception as e: + print("[OperatingSystem][click_at_percentage] error:", e) From cee42ce7f4fee08f96338d14c548cb637061f253 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sat, 13 Jan 2024 07:04:09 -0800 Subject: [PATCH 35/65] Update to `operating_system.py` --- operate/operate.py | 2 +- operate/utils/{os.py => operating_system.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename operate/utils/{os.py => operating_system.py} (100%) diff --git a/operate/operate.py b/operate/operate.py index d6f68b3..0004539 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -19,7 +19,7 @@ ANSI_BLUE, style, ) -from operate.utils.os import OperatingSystem +from operate.utils.operating_system import OperatingSystem from operate.models.apis import get_next_action # Load configuration diff --git a/operate/utils/os.py b/operate/utils/operating_system.py similarity index 100% rename from operate/utils/os.py rename to operate/utils/operating_system.py From 6cd36fda4ce81a86715f7cf5e09f44d480a97958 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 05:16:12 -0800 Subject: [PATCH 36/65] Iterate `call_gpt_4_vision_preview_labeled` --- operate/models/apis.py | 212 ++++++++++++++++---------------------- operate/models/prompts.py | 88 ++++++++++------ operate/operate.py | 12 ++- operate/utils/label.py | 24 ----- 4 files changed, 155 insertions(+), 181 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 0437161..80a8504 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -19,9 +19,6 @@ ) from operate.models.prompts import ( format_vision_prompt, - format_summary_prompt, - format_decision_prompt, - format_label_prompt, get_user_first_message_prompt, get_user_prompt, ) @@ -29,7 +26,6 @@ from operate.utils.label import ( add_labels, - parse_click_content, get_click_position_in_percent, get_label_coordinates, ) @@ -50,13 +46,11 @@ async def get_next_action(model, messages, objective, session_id): - if VERBOSE: - print("[Self Operating Computer][get_next_action]") if model == "gpt-4": - return call_gpt_4_v(messages), None + return call_gpt_4_vision_preview(messages), None if model == "gpt-4-with-som": - operation = await call_gpt_4_v_labeled(messages, objective) - return [operation], None + operation = await call_gpt_4_vision_preview_labeled(messages, objective) + return operation, None elif model == "agent-1": operation, session_id = call_agent_1(session_id, objective) return operation, session_id @@ -93,7 +87,7 @@ def call_agent_1(session_id, objective): return "Failed take action after looking at the screenshot" -def call_gpt_4_v(messages): +def call_gpt_4_vision_preview(messages): 
""" Get the next action for Self-Operating Computer """ @@ -214,56 +208,7 @@ def call_gemini_pro_vision(messages, objective): return "Failed take action after looking at the screenshot" -def summarize(model, messages, objective): - try: - screenshots_dir = "screenshots" - if not os.path.exists(screenshots_dir): - os.makedirs(screenshots_dir) - - screenshot_filename = os.path.join(screenshots_dir, "summary_screenshot.png") - # Call the function to capture the screen with the cursor - capture_screen_with_cursor(screenshot_filename) - - summary_prompt = format_summary_prompt(objective) - - if model == "gpt-4-vision-preview": - with open(screenshot_filename, "rb") as img_file: - img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - - summary_message = { - "role": "user", - "content": [ - {"type": "text", "text": summary_prompt}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, - }, - ], - } - # create a copy of messages and save to pseudo_messages - messages.append(summary_message) - - response = client.chat.completions.create( - model="gpt-4-vision-preview", - messages=messages, - max_tokens=500, - ) - - content = response.choices[0].message.content - elif model == "gemini-pro-vision": - model = genai.GenerativeModel("gemini-pro-vision") - summary_message = model.generate_content( - [summary_prompt, Image.open(screenshot_filename)] - ) - content = summary_message.text - return content - - except Exception as e: - print(f"Error in summarize: {e}") - return "Failed to summarize the workflow" - - -async def call_gpt_4_v_labeled(messages, objective): +async def call_gpt_4_vision_preview_labeled(messages, objective): time.sleep(1) try: screenshots_dir = "screenshots" @@ -277,19 +222,25 @@ async def call_gpt_4_v_labeled(messages, objective): with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - previous_action = get_last_assistant_message(messages) - img_base64_labeled, img_base64_original, label_coordinates = add_labels( img_base64, yolo_model ) - decision_prompt = format_decision_prompt(objective, previous_action) - labeled_click_prompt = format_label_prompt(objective) + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() - click_message = { + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] user_prompt", + user_prompt, + ) + + vision_message = { "role": "user", "content": [ - {"type": "text", "text": labeled_click_prompt}, + {"type": "text", "text": user_prompt}, { "type": "image_url", "image_url": { @@ -298,75 +249,94 @@ async def call_gpt_4_v_labeled(messages, objective): }, ], } - decision_message = { - "role": "user", - "content": [ - {"type": "text", "text": decision_prompt}, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{img_base64_original}" - }, - }, - ], - } - - click_messages = messages.copy() - click_messages.append(click_message) - decision_messages = messages.copy() - decision_messages.append(decision_message) - - click_future = fetch_openai_response_async(click_messages) - decision_future = fetch_openai_response_async(decision_messages) - - click_response, decision_response = await asyncio.gather( - click_future, decision_future - ) - - # Extracting the message content from the ChatCompletionMessage object - click_content = click_response.get("choices")[0].get("message").get("content") + messages.append(vision_message) - 
decision_content = ( - decision_response.get("choices")[0].get("message").get("content") + response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=messages, + presence_penalty=1, + frequency_penalty=1, + temperature=0.7, + max_tokens=1000, ) - if not decision_content.startswith("CLICK"): - return decision_content + content = response.choices[0].message.content - label_data = parse_click_content(click_content) + if content.startswith("```json"): + content = content[len("```json") :] # Remove starting ```json + if content.endswith("```"): + content = content[: -len("```")] # Remove ending - if label_data and "label" in label_data: - coordinates = get_label_coordinates(label_data["label"], label_coordinates) - image = Image.open( - io.BytesIO(base64.b64decode(img_base64)) - ) # Load the image to get its size - image_size = image.size # Get the size of the image (width, height) - click_position_percent = get_click_position_in_percent( - coordinates, image_size + assistant_message = {"role": "assistant", "content": content} + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] content", + content, ) - if not click_position_percent: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}" - ) - return call_gpt_4_v(messages) + messages.append(assistant_message) - x_percent = f"{click_position_percent[0]:.2f}%" - y_percent = f"{click_position_percent[1]:.2f}%" - click_action = f'CLICK {{ "x": "{x_percent}", "y": "{y_percent}", "description": "{label_data["decision"]}", "reason": "{label_data["reason"]}" }}' + content = json.loads(content) - else: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] No label found. Trying another method {ANSI_RESET}" - ) - return call_gpt_4_v(messages) + processed_content = [] + + for operation in content: + if operation.get("operation") == "mouse": + label = operation.get("label") + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] label", + label, + ) + + coordinates = get_label_coordinates(label, label_coordinates) + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] coordinates", + coordinates, + ) + image = Image.open( + io.BytesIO(base64.b64decode(img_base64)) + ) # Load the image to get its size + image_size = image.size # Get the size of the image (width, height) + click_position_percent = get_click_position_in_percent( + coordinates, image_size + ) + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] click_position_percent", + click_position_percent, + ) + if not click_position_percent: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. 
Trying another method {ANSI_RESET}" + ) + return call_gpt_4_vision_preview(messages) + + x_percent = f"{click_position_percent[0]:.2f/100}" + y_percent = f"{click_position_percent[1]:.2f/100}" + operation["x"] = x_percent + operation["y"] = y_percent + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new click operation", + operation, + ) + processed_content.append(operation) + else: + processed_content.append(operation) - return click_action + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gpt_4_vision_preview_labeled] new processed_content", + processed_content, + ) + return processed_content except: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}" ) - return call_gpt_4_v(messages) + return call_gpt_4_vision_preview(messages) def fetch_agent_1_response(session_id, objective, base64_image): diff --git a/operate/models/prompts.py b/operate/models/prompts.py index a708c9b..2a7c2e4 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -7,7 +7,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you which you use in the `pyautogui` library. Your output should always be valid `json` because it will be used in `json.loads` +You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. mouse - Move mouse and click [{{ "thought": "write a thought here", "operation": "mouse", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format @@ -42,7 +42,51 @@ A few important notes: - Go to Google Docs and Google Sheets by typing in the Chrome Address bar -- Remember you only have those 4 operations available to you. +- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. + +Objective: {objective} # take the next best action for this objective +""" + + +SYSTEM_PROMPT_LABELED = """ +You are operating a computer, using the same operating system as a human. + +From looking at the screen, the objective, and your previous actions, take the next best series of action. + +You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. + +1. mouse - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +[{{ "thought": "write a thought here", "operation": "mouse", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format + +2. write - Write with your keyboard +[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] + +3. press - Use a hotkey or press key to operate the computer +[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] + +4. done - The objective is completed +[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] + +Return the actions in array format `[]`. You can take just one action or multiple actions. 
+ +Here are some helpful combinations: + +# Opens Spotlight Search on Mac +[ + {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }}, + {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, + {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }} +] + +# Send a "Hello World" message in the chat +[ + {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "mouse", "label": "~34" }}, + {{ "thought": "Now that I am focused on the message field, I'll go ahead and write ", "operation": "write", "content": "Hello World" }}, +] + +A few important notes: + +- Go to Google Docs and Google Sheets by typing in the Chrome Address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. Objective: {objective} # take the next best action for this objective @@ -123,38 +167,6 @@ Action:""" -LABELED_IMAGE_PROMPT = """ -Your job is simple. Decide if there is an elements on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs. - -Important to remember, you can only click on labeled elements. - -Label IDs are in the following format with `x` being a number: `~x` - -The labels are placed just above the bounding boxes so that they can be read clearly. - -Response formats below. - -1. CLICK - If there is a label that gets you closer to the objective, go ahead and click it. -Response: {{ "decision": "~decision here~", "reason": "~reason here~", "label": "~x" }} - -Here are examples of how to respond. -__ -Objective: Follow up with the vendor in outlook -{{ "decision": "Click the Outlook send button", "reason": "I can see the email is already written and now I just need to send it.", "label": "~27" }} -__ -Objective: Play the Holiday music on YouTube -{{ "decision": "Click on the Play button", "reason": "It appears there is a row with a holiday song available in the Spotify UI", "label": "~3" }} -__ - -A few important notes: -- When navigating the web you'll need to click on the address bar first. Look closely to find the address bar's label it could be any number. -- The IDs number has NO SIGNIFICANCE. For instance if ID is ~0 or ~1 it does not mean it is first or on top. CHOOSE THE ID BASED ON THE CONTEXT OF THE IMAGE AND IF IT HELPS REACH THE OBJECTIVE. -- Do not preappend with ```json, just return the JSON object. 
- -{objective} -""" - - # ------------------------- # SUMMARY PROMPT # ------------------------- @@ -269,6 +281,14 @@ def get_system_prompt(objective): return prompt +def get_system_prompt_labeled(objective): + """ + Format the vision prompt + """ + prompt = SYSTEM_PROMPT_LABELED.format(objective=objective) + return prompt + + def get_user_prompt(): prompt = OPERATE_PROMPT return prompt diff --git a/operate/operate.py b/operate/operate.py index 0004539..579df02 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -8,7 +8,11 @@ from operate.exceptions import ModelNotRecognizedException # from operate.models.prompts import USER_QUESTION, get_system_prompt -from operate.models.prompts import USER_QUESTION, get_system_prompt +from operate.models.prompts import ( + USER_QUESTION, + get_system_prompt, + get_system_prompt_labeled, +) from operate.settings import Config from operate.utils.style import ( ANSI_GREEN, @@ -91,7 +95,11 @@ def main(model, terminal_prompt, voice_mode=False): print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") objective = prompt(style=style) - system_prompt = get_system_prompt(objective) + if model == "gpt-4-with-som": + system_prompt = get_system_prompt_labeled(objective) + print("labeled prompt", system_prompt) + else: + system_prompt = get_system_prompt(objective) system_message = {"role": "system", "content": system_prompt} messages = [system_message] diff --git a/operate/utils/label.py b/operate/utils/label.py index 2d3674f..7565dcc 100644 --- a/operate/utils/label.py +++ b/operate/utils/label.py @@ -136,30 +136,6 @@ def add_labels(base64_data, yolo_model): return img_base64_labeled, img_base64_original, label_coordinates -def parse_click_content(message_content): - """ - Parses the response message to determine if it's a CLICK or NONE action and returns the appropriate data. - - :param message_content: The content of the response message. - :return: A dictionary with the relevant data or a message indicating a NONE action. - """ - try: - # Check for and remove erroneous ```json at the start and ``` at the end - if message_content.startswith("```json"): - message_content = message_content[ - len("```json") : - ] # Remove starting ```json - if message_content.endswith("```"): - message_content = message_content[: -len("```")] # Remove ending ``` - - # Convert JSON string to dictionary - return json.loads(message_content.strip()) - except json.JSONDecodeError as e: - return {"error": "Invalid JSON format"} - - return {"error": "Invalid response format"} - - def get_click_position_in_percent(coordinates, image_size): """ Calculates the click position at the center of the bounding box and converts it to percentages. 
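The labeled flow this patch sets up boils down to: parse the model's JSON operations array, then, for any label-based click, resolve the label's bounding box to a screen position expressed as a decimal fraction (the `x percent (e.g. 0.10)` form the new prompt specifies). Below is a minimal sketch of that flow; the model output, label box, and screen size are invented for illustration, and note that the `:.2f/100` format spec added in this patch is not a valid Python format specifier and is corrected to `:.2f` in PATCH 38.

```python
import json


def parse_operations(content):
    # The model may wrap its JSON array in markdown fences, so strip them
    # before json.loads, mirroring the cleanup in call_gpt_4_vision_preview.
    if content.startswith("```json"):
        content = content[len("```json"):]
    if content.endswith("```"):
        content = content[: -len("```")]
    return json.loads(content.strip())


def bbox_center_in_percent(bbox, image_size):
    # bbox is (x0, y0, x1, y1) in pixels; return the center of the box as
    # decimal fractions of the image dimensions, the form the prompt asks for.
    x_center = (bbox[0] + bbox[2]) / 2
    y_center = (bbox[1] + bbox[3]) / 2
    return x_center / image_size[0], y_center / image_size[1]


# Hypothetical example data: one labeled-click operation and its bounding box
content = '[{"thought": "click the search field", "operation": "mouse", "label": "~3"}]'
label_coordinates = {"~3": (860, 520, 1060, 560)}  # assumed pixel bounding box
screen_size = (1920, 1080)  # assumed screen resolution

operations = parse_operations(content)
for operation in operations:
    if operation.get("operation") == "mouse":
        x, y = bbox_center_in_percent(label_coordinates[operation["label"]], screen_size)
        operation["x"], operation["y"] = f"{x:.2f}", f"{y:.2f}"

print(operations)
# [{'thought': 'click the search field', 'operation': 'mouse', 'label': '~3', 'x': '0.50', 'y': '0.50'}]
```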
From 42a8786616bef59680cc43eb674eaba8ab72b741 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 05:16:29 -0800 Subject: [PATCH 37/65] remove `format_label_prompt` --- operate/models/prompts.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 2a7c2e4..3353852 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -265,14 +265,6 @@ def format_decision_prompt(objective, previous_action): return prompt -def format_label_prompt(objective): - """ - Format the vision prompt - """ - prompt = LABELED_IMAGE_PROMPT.format(objective=objective) - return prompt - - def get_system_prompt(objective): """ Format the vision prompt From 84742274d3b7a45f100920d1bb9883d9f470957c Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 05:21:01 -0800 Subject: [PATCH 38/65] fix `get_click_position_in_percent` --- operate/models/apis.py | 9 +++++---- operate/utils/label.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 80a8504..af6cd04 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -312,8 +312,8 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): ) return call_gpt_4_vision_preview(messages) - x_percent = f"{click_position_percent[0]:.2f/100}" - y_percent = f"{click_position_percent[1]:.2f/100}" + x_percent = f"{click_position_percent[0]:.2f}" + y_percent = f"{click_position_percent[1]:.2f}" operation["x"] = x_percent operation["y"] = y_percent if VERBOSE: @@ -332,9 +332,10 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): ) return processed_content - except: + except Exception as e: print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}", + e, ) return call_gpt_4_vision_preview(messages) diff --git a/operate/utils/label.py b/operate/utils/label.py index 7565dcc..4dddcc9 100644 --- a/operate/utils/label.py +++ b/operate/utils/label.py @@ -152,7 +152,7 @@ def get_click_position_in_percent(coordinates, image_size): y_center = (coordinates[1] + coordinates[3]) / 2 # Convert to percentages - x_percent = (x_center / image_size[0]) * 100 - y_percent = (y_center / image_size[1]) * 100 + x_percent = x_center / image_size[0] + y_percent = y_center / image_size[1] return x_percent, y_percent From 182accee747441e1b7ff0be5af42037f3babb1db Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 05:31:45 -0800 Subject: [PATCH 39/65] Remove `img_base64_original` from `add_labels` function --- operate/models/apis.py | 4 +--- operate/utils/label.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index af6cd04..52517e2 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -222,9 +222,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - img_base64_labeled, img_base64_original, label_coordinates = add_labels( - img_base64, yolo_model - ) + img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model) if len(messages) == 1: user_prompt = get_user_first_message_prompt() diff --git a/operate/utils/label.py b/operate/utils/label.py index 4dddcc9..1423239 100644 --- a/operate/utils/label.py +++ b/operate/utils/label.py @@ -133,7 +133,7 @@ def add_labels(base64_data, yolo_model): image_labeled.save(buffered_labeled, format="PNG") # I guess this is needed img_base64_labeled = base64.b64encode(buffered_labeled.getvalue()).decode("utf-8") - return img_base64_labeled, img_base64_original, label_coordinates + return img_base64_labeled, label_coordinates def get_click_position_in_percent(coordinates, image_size): From 76c28c911858b3c959de12e03880177e434709bd Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 06:03:14 -0800 Subject: [PATCH 40/65] Update `call_gemini_pro_vision` to new method --- operate/models/apis.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 52517e2..03801cd 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -21,6 +21,7 @@ format_vision_prompt, get_user_first_message_prompt, get_user_prompt, + get_system_prompt, ) @@ -181,31 +182,25 @@ def call_gemini_pro_vision(messages, objective): previous_action = get_last_assistant_message(messages) - vision_prompt = format_vision_prompt(objective, previous_action) + # vision_prompt = format_vision_prompt(objective, previous_action) + prompt = get_system_prompt(objective) model = genai.GenerativeModel("gemini-pro-vision") - response = model.generate_content( - [vision_prompt, Image.open(screenshot_filename)] - ) - - # create a copy of messages and save to pseudo_messages - pseudo_messages = messages.copy() - pseudo_messages.append(response.text) + response = model.generate_content([prompt, Image.open(screenshot_filename)]) - messages.append( - { - "role": "user", - "content": "`screenshot.png`", - } - ) content = response.text[1:] + print("content", content) + content = json.loads(content) return content except Exception as e: - print(f"Error: {e}") - return "Failed take action after looking 
at the screenshot" + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}", + e, + ) + return call_gpt_4_vision_preview(messages) async def call_gpt_4_vision_preview_labeled(messages, objective): From e36d774d60e6964e2a86ae2616300ae04a12f788 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 06:05:56 -0800 Subject: [PATCH 41/65] remove extra list dimension in `call_gemini_pro_vision` --- operate/models/apis.py | 12 +++++++----- operate/operate.py | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 03801cd..afac188 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -56,7 +56,7 @@ async def get_next_action(model, messages, objective, session_id): operation, session_id = call_agent_1(session_id, objective) return operation, session_id elif model == "gemini-pro-vision": - return [call_gemini_pro_vision(messages, objective)], None + return call_gemini_pro_vision(messages, objective), None raise ModelNotRecognizedException(model) @@ -179,9 +179,6 @@ def call_gemini_pro_vision(messages, objective): capture_screen_with_cursor(screenshot_filename) # sleep for a second time.sleep(1) - - previous_action = get_last_assistant_message(messages) - # vision_prompt = format_vision_prompt(objective, previous_action) prompt = get_system_prompt(objective) @@ -190,8 +187,13 @@ def call_gemini_pro_vision(messages, objective): response = model.generate_content([prompt, Image.open(screenshot_filename)]) content = response.text[1:] - print("content", content) + content = json.loads(content) + if VERBOSE: + print( + "[Self Operating Computer][get_next_action][call_gemini_pro_vision] content", + content, + ) return content diff --git a/operate/operate.py b/operate/operate.py index 579df02..b4460f5 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -139,13 +139,14 @@ def operate(operations): if VERBOSE: print("[Self Operating Computer][operate]") for operation in operations: + if VERBOSE: + print("[Self Operating Computer][operate] operation", operation) # wait one second time.sleep(1) operate_type = operation.get("operation").lower() operate_thought = operation.get("thought") operate_detail = "" if VERBOSE: - print("[Self Operating Computer][operate] operation", operation) print("[Self Operating Computer][operate] operate_type", operate_type) if operate_type == "press" or operate_type == "hotkey": From e15bbd69b101d4759086eefce9433e3c0ac1364a Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 06:11:55 -0800 Subject: [PATCH 42/65] Change `mouse` to `click` as it is clear what the purpose is --- operate/models/apis.py | 3 ++- operate/models/prompts.py | 10 +++++----- operate/settings.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index afac188..8c1770d 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -336,7 +336,8 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): def fetch_agent_1_response(session_id, objective, base64_image): - print("[call_agent_1][fetch_agent_1_response]") + if VERBOSE: + print("[call_agent_1][fetch_agent_1_response]") url = "http://127.0.0.1:5000/agent/v2/action" api_token = os.environ.get("AGENT_API_KEY") headers = { diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 3353852..6dc9a2c 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -9,8 +9,8 
@@ You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. -1. mouse - Move mouse and click -[{{ "thought": "write a thought here", "operation": "mouse", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format +1. click - Move mouse and click +[{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format 2. write - Write with your keyboard [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] @@ -55,8 +55,8 @@ You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. -1. mouse - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` -[{{ "thought": "write a thought here", "operation": "mouse", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format +1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +[{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format 2. write - Write with your keyboard [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] @@ -80,7 +80,7 @@ # Send a "Hello World" message in the chat [ - {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "mouse", "label": "~34" }}, + {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, {{ "thought": "Now that I am focused on the message field, I'll go ahead and write ", "operation": "write", "content": "Hello World" }}, ] diff --git a/operate/settings.py b/operate/settings.py index 0bccf55..1dbc91f 100644 --- a/operate/settings.py +++ b/operate/settings.py @@ -21,10 +21,10 @@ def __init__(self): def initialize_openai_client(self): """ - Initializes and returns an OpenAI client with the configured API key. + Initializes and returns an OpenAI client with the configured API key. Returns: - OpenAI or None: An instance of the OpenAI client if the API key is provided, else None. + OpenAI or None: An instance of the OpenAI client if the API key is provided, else None. 
""" if self.openai_api_key: client = OpenAI() From 4aa05bcf64d15f8c32ed92259657d39744c186b6 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 06:13:03 -0800 Subject: [PATCH 43/65] update `operate_type == "click"` condition --- operate/operate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/operate.py b/operate/operate.py index b4460f5..a051b47 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -157,7 +157,7 @@ def operate(operations): content = operation.get("content") operate_detail = content operating_system.write(content) - elif operate_type == "mouse": + elif operate_type == "click": x = operation.get("x") y = operation.get("y") click_detail = {"x": x, "y": y} From c79b06521182ee5fa370524bcb16e46dabc7b2c2 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 06:18:19 -0800 Subject: [PATCH 44/65] Remove unused prompts, `VISION_PROMPT`, etc. --- operate/models/apis.py | 9 ++- operate/models/prompts.py | 164 +------------------------------------- 2 files changed, 7 insertions(+), 166 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 8c1770d..4f482f8 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -18,7 +18,6 @@ capture_screen_with_cursor, ) from operate.models.prompts import ( - format_vision_prompt, get_user_first_message_prompt, get_user_prompt, get_system_prompt, @@ -159,8 +158,11 @@ def call_gpt_4_vision_preview(messages): return content except Exception as e: - print(f"Error: {e}") - return "Failed take action after looking at the screenshot" + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying again {ANSI_RESET}", + e, + ) + return call_gpt_4_vision_preview(messages) def call_gemini_pro_vision(messages, objective): @@ -179,7 +181,6 @@ def call_gemini_pro_vision(messages, objective): capture_screen_with_cursor(screenshot_filename) # sleep for a second time.sleep(1) - # vision_prompt = format_vision_prompt(objective, previous_action) prompt = get_system_prompt(objective) model = genai.GenerativeModel("gemini-pro-vision") diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 6dc9a2c..4735ce6 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -93,178 +93,18 @@ """ -# ------------------------- -# VISION PROMPT -# ------------------------- -VISION_PROMPT = """ -You are a Self-Operating Computer. You use the same operating system as a human. - -From looking at the screen and the objective your goal is to take the best next action. - -To operate the computer you have the four options below. - -1. CLICK - Move mouse and click -2. TYPE - Type on the keyboard -3. SEARCH - Search for a program on Mac and open it -4. DONE - When you completed the task respond with the exact following phrase content - -Here are the response formats below. - -1. CLICK -Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }} -Note that the percents work where the top left corner is "x": "0%" and "y": "0%" and the bottom right corner is "x": "100%" and "y": "100%" - -2. TYPE -Response: TYPE - -2. SEARCH -Response: SEARCH - -3. DONE -Response: DONE - -Here are examples of how to respond. -__ -Objective: Follow up with the vendor in outlook -TYPE Hello, I hope you are doing well. 
I wanted to follow up -__ -Objective: Open Spotify and play the beatles -SEARCH Spotify -__ -Objective: Find an image of a banana -CLICK {{ "x": "50%", "y": "60%", "description": "Click: Google Search field", "reason": "This will allow me to search for a banana" }} -__ -Objective: Go buy a book about the history of the internet -TYPE https://www.amazon.com/ -__ - -A few important notes: - -- Default to opening Google Chrome with SEARCH to find things that are on the internet. -- Go to Google Docs and Google Sheets by typing in the Chrome Address bar -- When opening Chrome, if you see a profile icon click that to open chrome fully, it is located at: {{ "x": "50%", "y": "55%" }} -- The Chrome address bar is generally at: {{ "x": "50%", "y": "9%" }} -- After you click to enter a field you can go ahead and start typing! -- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. - -{previous_action} - -IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row. - -Objective: {objective} -""" - - OPERATE_FIRST_MESSAGE_PROMPT = """ -Please take the next best action. Remember you only have the following 4 operations available: mouse, write, press, done +Please take the next best action. Remember you only have the following 4 operations available: click, write, press, done Right now you are probably in the terminal because the human just started up. Action:""" OPERATE_PROMPT = """ -Please take the next best action. Remember you only have the following 4 operations available: mouse, write, press, done +Please take the next best action. Remember you only have the following 4 operations available: click, write, press, done Action:""" -# ------------------------- -# SUMMARY PROMPT -# ------------------------- -SUMMARY_PROMPT = """ -You are a Self-Operating Computer. A user request has been executed. Present the results succinctly. - -Include the following key contexts of the completed request: - -1. State the original objective. -2. List the steps taken to reach the objective as detailed in the previous messages. -3. Reference the screenshot that was used. - -Summarize the actions taken to fulfill the objective. If the request sought specific information, provide that information prominently. NOTE: Address directly any question posed by the user. - -Remember: The user will not interact with this summary. You are solely reporting the outcomes. - -Original objective: {objective} - -Display the results clearly: -""" - -DECISION_PROMPT = """ -You are operating a computer similar to how a human would. Look at the screen and take the next best action to reach your objective. -Here are your methods you can use to operating the computer. -1. CLICK - Move mouse and click -2. TYPE - Type on the keyboard -3. SEARCH - Search for a program that is installed on Mac locally and open it -4. DONE - When you completed the task respond with the exact following phrase content -Here are the response formats below. -1. CLICK -Response: CLICK -2. TYPE -Response: TYPE "value you want to type" -2. SEARCH -Response: SEARCH "app you want to search for on Mac" -3. DONE -Response: DONE -Here are examples of how to respond. -__ -Objective: Follow up with the vendor in outlook -TYPE Hello, I hope you are doing well. 
I wanted to follow up -__ -Objective: Open Spotify and play the beatles -SEARCH Spotify -__ -Objective: Find an image of a banana -CLICK -__ -Objective: Go buy a book about the history of the internet -TYPE https://www.amazon.com/ -__ -A few important notes: -- Default to opening Google Chrome with SEARCH to find things that are on the Web. -- After you open Google Chrome you need to click on the address bar to find a website. -- Do not use SEARCH to look for websites like Google Docs or Linkedin. SEARCH only finds programs installed on the computer. -- After you click to enter a field you can go ahead and start typing! -- If you can see the field is active, go ahead and type! -- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. -{previous_action} -IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row. -{objective} -""" - - -def format_summary_prompt(objective): - """ - Format the summary prompt - """ - prompt = SUMMARY_PROMPT.format(objective=objective) - return prompt - - -def format_vision_prompt(objective, previous_action): - """ - Format the vision prompt - """ - if previous_action: - previous_action = f"Here was the previous action you took: {previous_action}" - else: - previous_action = "" - prompt = VISION_PROMPT.format(objective=objective, previous_action=previous_action) - return prompt - - -def format_decision_prompt(objective, previous_action): - """ - Format the vision prompt - """ - if previous_action: - previous_action = f"Here was the previous action you took: {previous_action}" - else: - previous_action = "" - prompt = DECISION_PROMPT.format( - objective=objective, previous_action=previous_action - ) - return prompt - - def get_system_prompt(objective): """ Format the vision prompt From f0b606cf7b6661403a63341e18ad1b92bbd1fc90 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Sun, 14 Jan 2024 06:20:04 -0800 Subject: [PATCH 45/65] improve `OPERATE_FIRST_MESSAGE_PROMPT` and `OPERATE_PROMPT` --- operate/models/prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 4735ce6..3abfa32 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -94,14 +94,14 @@ OPERATE_FIRST_MESSAGE_PROMPT = """ -Please take the next best action. Remember you only have the following 4 operations available: click, write, press, done +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done -Right now you are probably in the terminal because the human just started up. +Right now you are probably in the terminal because the human just started up. Remember Action:""" OPERATE_PROMPT = """ -Please take the next best action. Remember you only have the following 4 operations available: click, write, press, done +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 
Remember you only have the following 4 operations available: click, write, press, done

Action:"""

From 9ce0aa11c54ba0766eb80927a832bf334048947c Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 14 Jan 2024 06:57:06 -0800
Subject: [PATCH 46/65] fix `call_gpt_4_vision_preview_labeled`

---
 operate/models/apis.py    | 2 +-
 operate/models/prompts.py | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index 4f482f8..5b358d0 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -276,7 +276,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
         processed_content = []

         for operation in content:
-            if operation.get("operation") == "mouse":
+            if operation.get("operation") == "click":
                 label = operation.get("label")
                 if VERBOSE:
                     print(
diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index 3abfa32..af5725f 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -75,7 +75,13 @@
 [
     {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
     {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
-    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
+]
+
+# Focuses on the address bar in a browser before typing a website
+[
+    {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["command", "l"] }},
+    {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
+    {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
 ]

 # Send a "Hello World" message in the chat

From cd8e6e3673b17e787ffad5b13de48479e358fd2f Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 14 Jan 2024 07:12:44 -0800
Subject: [PATCH 47/65] bump up `max_tokens`

---
 operate/models/apis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index 5b358d0..b343e65 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -135,7 +135,7 @@ def call_gpt_4_vision_preview(messages):
             presence_penalty=1,
             frequency_penalty=1,
             temperature=0.7,
-            max_tokens=1000,
+            max_tokens=2000,
         )

         content = response.choices[0].message.content

From 66043001d649adef032cbc3639cc907fc7d6c139 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 14 Jan 2024 07:21:01 -0800
Subject: [PATCH 48/65] increase `max_tokens`

---
 operate/models/apis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index b343e65..4897344 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -135,7 +135,7 @@ def call_gpt_4_vision_preview(messages):
             presence_penalty=1,
             frequency_penalty=1,
             temperature=0.7,
-            max_tokens=2000,
+            max_tokens=3000,
         )

         content = response.choices[0].message.content

From 0f54222e98ec775750ee1ad91c864ca092758af9 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 14 Jan 2024 07:42:48 -0800
Subject: [PATCH 49/65] Update `README.md`

---
 README.md | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index a59ad3d..2f3b431 100644
--- a/README.md
+++ b/README.md
@@ -43,23
+43,22 @@ https://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-

 ## Quick Start Instructions
 Below are instructions to set up the Self-Operating Computer Framework locally on your computer.

-### Option 1: Traditional Installation
-
-```
-1. **Install Project Requirements and Command-Line Interface: Instead of using `pip install .`, you can now install the project directly from PyPI with:**
+### Option 1: Quick Start
+Install `self-operating-computer`
+1. **Install `self-operating-computer`**:
 ```
 pip install self-operating-computer
 ```
-2.
+2. **Add your OpenAI key. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**:
+For Mac,
 ```
-
+export OPENAI_API_KEY='your-key-here'
 ```
-3. **Add your Open AI key to your new `.env` file. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**:
+For Windows
 ```
-OPENAI_API_KEY='your-key-here'
+set OPENAI_API_KEY='your-key-here'
 ```
-
-4. **Run it**!
+3. **Run it**!
 ```
 operate
 ```

From e5903232a8b3ea3bcc3826a73a6c5491057c8c9c Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 14 Jan 2024 07:44:32 -0800
Subject: [PATCH 50/65] Clean up `README.md`

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2f3b431..0ed4bbc 100644
--- a/README.md
+++ b/README.md
@@ -44,17 +44,17 @@ https://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-
 Below are instructions to set up the Self-Operating Computer Framework locally on your computer.

 ### Option 1: Quick Start
-Install `self-operating-computer`
-1. **Install `self-operating-computer`**:
+1. **Install the project locally**:
 ```
 pip install self-operating-computer
 ```
 2. **Add your OpenAI key. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**:
-For Mac,
+
+For Mac:
 ```
 export OPENAI_API_KEY='your-key-here'
 ```
-For Windows
+For Windows:
 ```
 set OPENAI_API_KEY='your-key-here'
 ```

From a071bb3a315956dd467b7ee11d09a239b1c8a1a4 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Sun, 14 Jan 2024 07:45:31 -0800
Subject: [PATCH 51/65] Remove `Additional Thoughts` now that hotkeys are added

---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index 0ed4bbc..c960275 100644
--- a/README.md
+++ b/README.md
@@ -33,8 +33,6 @@
 We will soon be offering API access to our Agent-1-Vision model.

 If you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com).

-### Additional Thoughts
-We recognize that some operating system functions may be more efficiently executed with hotkeys such as entering the Browser Address bar using `command + L` rather than by simulating a mouse click at the correct XY location. We plan to make these improvements over time. However, it's important to note that many actions require the accurate selection of visual elements on the screen, necessitating precise XY mouse click locations. A primary focus of this project is to refine the accuracy of determining these click locations. We believe this is essential for achieving a fully self-operating computer in the current technological landscape.
## Demo

https://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-c76a-46fb-9b13-03678b3c67e0

From 00720939fdcd46e70b5c0c0bdf7f1a6ae46d713a Mon Sep 17 00:00:00 2001
From: Michael Hogue
Date: Sun, 14 Jan 2024 13:10:47 -0500
Subject: [PATCH 52/65] Add multi-platform system prompts

---
 operate/models/prompts.py | 113 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 109 insertions(+), 4 deletions(-)

diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index af5725f..f0bd519 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -1,8 +1,10 @@
+import platform
+
 # General user Prompts
 USER_QUESTION = "Hello, I can help you with anything. What would you like done?"

-SYSTEM_PROMPT = """
+SYSTEM_PROMPT_MAC = """
 You are operating a computer, using the same operating system as a human.

 From looking at the screen, the objective, and your previous actions, take the next best series of action.
@@ -47,8 +49,53 @@
 Objective: {objective} # take the next best action for this objective
 """

+SYSTEM_PROMPT_WIN_LINUX = """
+You are operating a computer, using the same operating system as a human.
+
+From looking at the screen, the objective, and your previous actions, take the next best series of action.
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
+
+1. click - Move mouse and click
+[{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format
+
+2. write - Write with your keyboard
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+
+3. press - Use a hotkey or press key to operate the computer
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+
+4. done - The objective is completed
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here are some helpful combinations:
+
+# Opens Menu Search on Windows and Linux
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+    {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
+]
+
+# Focuses on the address bar in a browser before typing a website
+[
+    {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["ctrl", "l"] }},
+    {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
+    {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
+]
+
+A few important notes:
+
+- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
+- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective} # take the next best action for this objective
+"""
+

-SYSTEM_PROMPT_LABELED = """
+SYSTEM_PROMPT_LABELED_MAC = """
 You are operating a computer, using the same operating system as a human.

 From looking at the screen, the objective, and your previous actions, take the next best series of action.
@@ -98,6 +145,56 @@
 Objective: {objective} # take the next best action for this objective
 """

+SYSTEM_PROMPT_LABELED_WIN_LINUX = """
+You are operating a computer, using the same operating system as a human.
+
+From looking at the screen, the objective, and your previous actions, take the next best series of action.
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
+
+1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x`
+[{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format
+
+2. write - Write with your keyboard
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+
+3. press - Use a hotkey or press key to operate the computer
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+
+4. done - The objective is completed
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here are some helpful combinations:
+
+# Opens Menu Search on Windows and Linux
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
+]
+
+# Focuses on the address bar in a browser before typing a website
+[
+    {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["ctrl", "l"] }},
+    {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
+    {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
+]
+
+# Send a "Hello World" message in the chat
+[
+    {{ "thought": "I see a message field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }},
+    {{ "thought": "Now that I am focused on the message field, I'll go ahead and write ", "operation": "write", "content": "Hello World" }},
+]
+
+A few important notes:
+
+- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
+- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective} # take the next best action for this objective
+"""
+

 OPERATE_FIRST_MESSAGE_PROMPT = """
 Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
Remember you only have the following 4 operations available: click, write, press, done @@ -115,7 +212,11 @@ def get_system_prompt(objective): """ Format the vision prompt """ - prompt = SYSTEM_PROMPT.format(objective=objective) + if platform.system() == "Darwin": + prompt = SYSTEM_PROMPT_MAC.format(objective=objective) + else: + prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective) + return prompt @@ -123,7 +224,11 @@ def get_system_prompt_labeled(objective): """ Format the vision prompt """ - prompt = SYSTEM_PROMPT_LABELED.format(objective=objective) + if platform.system() == "Darwin": + prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective) + else: + prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective) + return prompt From ec891c9af27886355c36afe68db831063daf8f0e Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 15 Jan 2024 07:47:44 -0800 Subject: [PATCH 53/65] remove `fetch_agent_1_response` for now --- operate/models/apis.py | 53 +----------------------------------------- 1 file changed, 1 insertion(+), 52 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 4897344..4c5f5af 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -52,41 +52,13 @@ async def get_next_action(model, messages, objective, session_id): operation = await call_gpt_4_vision_preview_labeled(messages, objective) return operation, None elif model == "agent-1": - operation, session_id = call_agent_1(session_id, objective) - return operation, session_id + return "coming soon" elif model == "gemini-pro-vision": return call_gemini_pro_vision(messages, objective), None raise ModelNotRecognizedException(model) -def call_agent_1(session_id, objective): - print("[call_agent_1]") - time.sleep(1) - try: - screenshots_dir = "screenshots" - if not os.path.exists(screenshots_dir): - os.makedirs(screenshots_dir) - - screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") - - capture_screen_with_cursor(screenshot_filename) - - with open(screenshot_filename, "rb") as img_file: - base64_image = base64.b64encode(img_file.read()).decode("utf-8") - - print("[call_agent_1] about to fetch_agent_1_response") - response, session_id = fetch_agent_1_response( - session_id, objective, base64_image - ) - print("[call_agent_1] response", response) - - return response, session_id - except Exception as e: - print(f"Error: {e}") - return "Failed take action after looking at the screenshot" - - def call_gpt_4_vision_preview(messages): """ Get the next action for Self-Operating Computer @@ -336,29 +308,6 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): return call_gpt_4_vision_preview(messages) -def fetch_agent_1_response(session_id, objective, base64_image): - if VERBOSE: - print("[call_agent_1][fetch_agent_1_response]") - url = "http://127.0.0.1:5000/agent/v2/action" - api_token = os.environ.get("AGENT_API_KEY") - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {api_token}", - } - data = { - "session_id": session_id, - "objective": objective, - "image": f"data:image/jpeg;base64,{base64_image}", - } - - response = requests.post(url, headers=headers, data=json.dumps(data)) - response_dict = response.json() - operations = response_dict.get("operations") - session_id = response_dict.get("session_id") - - return operations, session_id - - async def fetch_openai_response_async(messages): url = "https://api.openai.com/v1/chat/completions" headers = { From de93bb452277681734e29bd028b5a343fc6c2506 Mon Sep 17 00:00:00 
2001 From: Josh Bickett Date: Mon, 15 Jan 2024 09:46:47 -0800 Subject: [PATCH 54/65] update name to `config` --- operate/config.py | 71 +++++++++++++++++++++++++++++++++++++++++++++ operate/settings.py | 34 ---------------------- 2 files changed, 71 insertions(+), 34 deletions(-) create mode 100644 operate/config.py delete mode 100644 operate/settings.py diff --git a/operate/config.py b/operate/config.py new file mode 100644 index 0000000..87eb1e5 --- /dev/null +++ b/operate/config.py @@ -0,0 +1,71 @@ +import os +from dotenv import load_dotenv +from openai import OpenAI +import sys + + +class Config: + """ + Configuration class for managing settings. + + Attributes: + debug (bool): Flag indicating whether debug mode is enabled. + openai_api_key (str): API key for OpenAI. + google_api_key (str): API key for Google. + """ + + def __init__(self): + load_dotenv() + self.verbose = True + self.openai_api_key = os.getenv("OPENAI_API_KEY") + self.google_api_key = os.getenv("GOOGLE_API_KEY") + + def initialize_apis(self): + """ + Initializes and returns an OpenAI client with the configured API key. + + Returns: + OpenAI or None: An instance of the OpenAI client if the API key is provided, else None. + """ + try: + if self.openai_api_key: + client = OpenAI() + print("setting openai key") + client.api_key = self.openai_api_key + client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) + return client + return "Failed to initialize OpenAI" + except Exception as e: + print("[Config] Failed to initialize OpenAI", e) + return "Failed to initialize OpenAI" + + def validation(self, model, voice_mode): + """ + Validate the input parameters for the dialog operation. + + Args: + model (str): The model to be used for the dialog operation. + voice_mode (bool): Flag indicating whether to use voice mode. + + Raises: + SystemExit: If the input parameters are invalid. + + """ + print("[validation]") + print("[validation] self.openai_api_key", self.openai_api_key) + + if model == "gpt-4-vision-preview" and not self.openai_api_key: + print("Please add your OpenAI API key before running `operate`") + sys.exit(1) + + if voice_mode and not self.openai_api_key: + print("To use voice mode, please add an OpenAI API key") + sys.exit(1) + + if model == "gpt-4-vision-preview" and not self.openai_api_key: + print("To use `gpt-4-vision-preview` add an OpenAI API key") + sys.exit(1) + + if model == "gemini-pro-vision" and not self.google_api_key: + print("To use `gemini-pro-vision` add a Google API key") + sys.exit(1) diff --git a/operate/settings.py b/operate/settings.py deleted file mode 100644 index 1dbc91f..0000000 --- a/operate/settings.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from dotenv import load_dotenv -from openai import OpenAI - - -class Config: - """ - Configuration class for managing settings. - - Attributes: - debug (bool): Flag indicating whether debug mode is enabled. - openai_api_key (str): API key for OpenAI. - google_api_key (str): API key for Google. - """ - - def __init__(self): - load_dotenv() - self.verbose = True - self.openai_api_key = os.getenv("OPENAI_API_KEY") - self.google_api_key = os.getenv("GOOGLE_API_KEY") - - def initialize_openai_client(self): - """ - Initializes and returns an OpenAI client with the configured API key. - - Returns: - OpenAI or None: An instance of the OpenAI client if the API key is provided, else None. 
- """ - if self.openai_api_key: - client = OpenAI() - client.api_key = self.openai_api_key - client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) - return client - return None From 6975d1672553d57cfd37e2d3dd37996817170c07 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 15 Jan 2024 09:58:20 -0800 Subject: [PATCH 55/65] fix validation bug --- operate/config.py | 6 +----- operate/models/apis.py | 4 ++-- operate/operate.py | 31 +++---------------------------- 3 files changed, 6 insertions(+), 35 deletions(-) diff --git a/operate/config.py b/operate/config.py index 87eb1e5..37f03f8 100644 --- a/operate/config.py +++ b/operate/config.py @@ -54,15 +54,11 @@ def validation(self, model, voice_mode): print("[validation]") print("[validation] self.openai_api_key", self.openai_api_key) - if model == "gpt-4-vision-preview" and not self.openai_api_key: - print("Please add your OpenAI API key before running `operate`") - sys.exit(1) - if voice_mode and not self.openai_api_key: print("To use voice mode, please add an OpenAI API key") sys.exit(1) - if model == "gpt-4-vision-preview" and not self.openai_api_key: + if model == "gpt-4" and not self.openai_api_key: print("To use `gpt-4-vision-preview` add an OpenAI API key") sys.exit(1) diff --git a/operate/models/apis.py b/operate/models/apis.py index 4c5f5af..e4f029f 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -12,7 +12,7 @@ from PIL import Image from ultralytics import YOLO import google.generativeai as genai -from operate.settings import Config +from operate.config import Config from operate.exceptions import ModelNotRecognizedException from operate.utils.screenshot import ( capture_screen_with_cursor, @@ -40,7 +40,7 @@ config = Config() VERBOSE = config.verbose -client = config.initialize_openai_client() +client = config.initialize_apis() yolo_model = YOLO("./operate/models/weights/best.pt") # Load your trained model diff --git a/operate/operate.py b/operate/operate.py index a051b47..3124d61 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -13,7 +13,7 @@ get_system_prompt, get_system_prompt_labeled, ) -from operate.settings import Config +from operate.config import Config from operate.utils.style import ( ANSI_GREEN, ANSI_RESET, @@ -48,7 +48,7 @@ def main(model, terminal_prompt, voice_mode=False): mic = None # Initialize `WhisperMic`, if `voice_mode` is True - validation(model, voice_mode) + config.validation(model, voice_mode) if voice_mode: try: @@ -69,6 +69,7 @@ def main(model, terminal_prompt, voice_mode=False): text="An experimental framework to enable multimodal models to operate computers", style=style, ).run() + else: if VERBOSE: print("Running direct prompt...") @@ -192,29 +193,3 @@ def operate(operations): ) return False - - -def validation(model, voice_mode): - """ - Validate the input parameters for the dialog operation. - - Args: - model (str): The model to be used for the dialog operation. - voice_mode (bool): Flag indicating whether to use voice mode. - - Raises: - SystemExit: If the input parameters are invalid. 
-
-    """
-
-    if voice_mode and not config.openai_api_key:
-        print("To use voice mode, please add an OpenAI API key")
-        sys.exit(1)
-
-    if model == "gpt-4-vision-preview" and not config.openai_api_key:
-        print("To use `gpt-4-vision-preview` add an OpenAI API key")
-        sys.exit(1)
-
-    if model == "gemini-pro-vision" and not config.google_api_key:
-        print("To use `gemini-pro-vision` add a Google API key")
-        sys.exit(1)

From 62fe6a9d06813c7654b8589c5cf102a3086dacca Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Mon, 15 Jan 2024 10:03:45 -0800
Subject: [PATCH 56/65] update `initialize_apis1`

---
 operate/config.py | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/operate/config.py b/operate/config.py
index 37f03f8..18d9c3e 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -27,17 +27,12 @@ def initialize_apis(self):
         Returns:
             OpenAI or None: An instance of the OpenAI client if the API key is provided, else None.
         """
-        try:
-            if self.openai_api_key:
-                client = OpenAI()
-                print("setting openai key")
-                client.api_key = self.openai_api_key
-                client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
-                return client
-            return "Failed to initialize OpenAI"
-        except Exception as e:
-            print("[Config] Failed to initialize OpenAI", e)
-            return "Failed to initialize OpenAI"
+        if self.openai_api_key:
+            client = OpenAI()
+            client.api_key = self.openai_api_key
+            client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
+            return client
+        return None

     def validation(self, model, voice_mode):
         """
@@ -51,8 +46,6 @@ def validation(self, model, voice_mode):
         SystemExit: If the input parameters are invalid.

         """
-        print("[validation]")
-        print("[validation] self.openai_api_key", self.openai_api_key)

         if voice_mode and not self.openai_api_key:
             print("To use voice mode, please add an OpenAI API key")
             sys.exit(1)

         if model == "gpt-4" and not self.openai_api_key:
             print("To use `gpt-4-vision-preview` add an OpenAI API key")
             sys.exit(1)
-
+        print("self.google_api_key", self.google_api_key)
         if model == "gemini-pro-vision" and not self.google_api_key:
-            print("To use `gemini-pro-vision` add a Google API key")
             sys.exit(1)

From 5b3c75ca6b271060a7730185a65ce7c310113b35 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Mon, 15 Jan 2024 10:08:16 -0800
Subject: [PATCH 57/65] move `yolo_model` to only be required if using that mode

---
 operate/config.py      | 1 -
 operate/models/apis.py | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/operate/config.py b/operate/config.py
index 18d9c3e..dc00753 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -54,6 +54,5 @@ def validation(self, model, voice_mode):
         if model == "gpt-4" and not self.openai_api_key:
             print("To use `gpt-4-vision-preview` add an OpenAI API key")
             sys.exit(1)
-        print("self.google_api_key", self.google_api_key)
         if model == "gemini-pro-vision" and not self.google_api_key:
             sys.exit(1)
diff --git a/operate/models/apis.py b/operate/models/apis.py
index e4f029f..cd865da 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -42,8 +42,6 @@
 client = config.initialize_apis()

-yolo_model = YOLO("./operate/models/weights/best.pt")  # Load your trained model
-

 async def get_next_action(model, messages, objective, session_id):
     if model == "gpt-4":
@@ -181,6 +179,7 @@ def call_gemini_pro_vision(messages, objective):
 async def call_gpt_4_vision_preview_labeled(messages, objective):
     time.sleep(1)
     try:
+        yolo_model = YOLO("./operate/models/weights/best.pt")  # Load your trained model
         screenshots_dir = "screenshots"
         if not os.path.exists(screenshots_dir):
             os.makedirs(screenshots_dir)

From c010fc3ca8373faa3693daebfc7cd990a4f5deb3 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Mon, 15 Jan 2024 10:50:28 -0800
Subject: [PATCH 58/65] Fix `Config` bug

---
 operate/config.py      | 67 ++++++++++++++++++++++++++++--------------
 operate/models/apis.py | 16 ++++------
 operate/operate.py     | 22 +++++++-------
 3 files changed, 62 insertions(+), 43 deletions(-)

diff --git a/operate/config.py b/operate/config.py
index dc00753..db69e4f 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -1,7 +1,8 @@
 import os
+import sys
 from dotenv import load_dotenv
 from openai import OpenAI
-import sys
+from prompt_toolkit.shortcuts import input_dialog


 class Config:
@@ -20,16 +21,14 @@ def __init__(self):
         self.openai_api_key = os.getenv("OPENAI_API_KEY")
         self.google_api_key = os.getenv("GOOGLE_API_KEY")

-    def initialize_apis(self):
-        """
-        Initializes and returns an OpenAI client with the configured API key.
-
-        Returns:
-            OpenAI or None: An instance of the OpenAI client if the API key is provided, else None.
-        """
+    def initialize_openai(self):
+        print("[initialize_openai]")
+        print("[initialize_openai] self.openai_api_key", self.openai_api_key)
         if self.openai_api_key:
             client = OpenAI()
+            print("[initialize_openai] client = OpenAI()")
             client.api_key = self.openai_api_key
+            print("[initialize_openai] client.api_key", client.api_key)
             client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
             return client
         return None

     def validation(self, model, voice_mode):
         """
         Validate the input parameters for the dialog operation.
+        """
+        self.require_api_key(
+            "OPENAI_API_KEY", "OpenAI API key", model == "gpt-4" or voice_mode
+        )
+        self.require_api_key(
+            "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision"
+        )

-        Args:
-            model (str): The model to be used for the dialog operation.
-            voice_mode (bool): Flag indicating whether to use voice mode.

+    def require_api_key(self, key_name, key_description, is_required):
+        if is_required and not getattr(self, key_name.lower()):
+            self.prompt_and_save_api_key(key_name, key_description)

-        Raises:
-            SystemExit: If the input parameters are invalid.
+ def prompt_and_save_api_key(self, key_name, key_description): + key_value = input_dialog( + title="API Key Required", text=f"Please enter your {key_description}:" + ).run() + print("[prompt_and_save_api_key]") + print("[prompt_and_save_api_key] key_value", key_value) - """ + if key_value is None: # User pressed cancel or closed the dialog + sys.exit("Operation cancelled by user.") + + if key_value: + self.save_api_key_to_env(key_name, key_value) + load_dotenv() # Reload environment variables + # Update the instance attribute with the new key + print("[prompt_and_save_api_key] key_name", key_name) + print("[prompt_and_save_api_key] key_value", key_value) - if voice_mode and not self.openai_api_key: - print("To use voice mode, please add an OpenAI API key") - sys.exit(1) + if key_value: + self.save_api_key_to_env(key_name, key_value) + load_dotenv() # Reload environment variables + setattr(self, key_name.lower(), key_value) + print("[prompt_and_save_api_key] self.openai_api_key ", self.openai_api_key) + print("[prompt_and_save_api_key] self.google_api_key ", self.google_api_key) - if model == "gpt-4" and not self.openai_api_key: - print("To use `gpt-4-vision-preview` add an OpenAI API key") - sys.exit(1) - if model == "gemini-pro-vision" and not self.google_api_key: - sys.exit(1) + @staticmethod + def save_api_key_to_env(key_name, key_value): + print("[save_api_key_to_env]") + print("[save_api_key_to_env] key_name", key_name) + print("[save_api_key_to_env] key_value", key_value) + with open(".env", "a") as file: + file.write(f"\n{key_name}='{key_value}'") diff --git a/operate/models/apis.py b/operate/models/apis.py index cd865da..376beaf 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -3,11 +3,9 @@ import json import base64 -import re import io -import asyncio import aiohttp -import requests + from PIL import Image from ultralytics import YOLO @@ -37,10 +35,7 @@ # Load configuration -config = Config() -VERBOSE = config.verbose - -client = config.initialize_apis() +VERBOSE = Config().verbose async def get_next_action(model, messages, objective, session_id): @@ -58,9 +53,8 @@ async def get_next_action(model, messages, objective, session_id): def call_gpt_4_vision_preview(messages): - """ - Get the next action for Self-Operating Computer - """ + config = Config() + client = config.initialize_openai() if VERBOSE: print("[Self Operating Computer][get_next_action][call_gpt_4_v]") time.sleep(1) @@ -177,6 +171,8 @@ def call_gemini_pro_vision(messages, objective): async def call_gpt_4_vision_preview_labeled(messages, objective): + config = Config() + client = config.initialize_openai() time.sleep(1) try: yolo_model = YOLO("./operate/models/weights/best.pt") # Load your trained model diff --git a/operate/operate.py b/operate/operate.py index 3124d61..96aa18b 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -1,6 +1,5 @@ import sys import os -import platform import time import asyncio from prompt_toolkit.shortcuts import message_dialog @@ -64,21 +63,22 @@ def main(model, terminal_prompt, voice_mode=False): # Skip message dialog if prompt was given directly if not terminal_prompt: - message_dialog( - title="Self-Operating Computer", - text="An experimental framework to enable multimodal models to operate computers", - style=style, - ).run() + # message_dialog( + # title="Self-Operating Computer", + # text="An experimental framework to enable multimodal models to operate computers", + # style=style, + # ).run() + pass else: if VERBOSE: print("Running direct prompt...") - # 
Clear the console - if platform.system() == "Windows": - os.system("cls") - else: - print("\033c", end="") + # # Clear the console + # if platform.system() == "Windows": + # os.system("cls") + # else: + # print("\033c", end="") if terminal_prompt: # Skip objective prompt if it was given as an argument objective = terminal_prompt From 849ce90fbde807f543a364e3520789a9c63945de Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 15 Jan 2024 11:13:33 -0800 Subject: [PATCH 59/65] remove `print` --- operate/config.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/operate/config.py b/operate/config.py index db69e4f..e04b65d 100644 --- a/operate/config.py +++ b/operate/config.py @@ -22,13 +22,9 @@ def __init__(self): self.google_api_key = os.getenv("GOOGLE_API_KEY") def initialize_openai(self): - print("[initialize_openai]") - print("[initialize_openai] self.openai_api_key", self.openai_api_key) if self.openai_api_key: client = OpenAI() - print("[initialize_openai] client = OpenAI()") client.api_key = self.openai_api_key - print("[initialize_openai] client.api_key", client.api_key) client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) return client return None @@ -52,8 +48,6 @@ def prompt_and_save_api_key(self, key_name, key_description): key_value = input_dialog( title="API Key Required", text=f"Please enter your {key_description}:" ).run() - print("[prompt_and_save_api_key]") - print("[prompt_and_save_api_key] key_value", key_value) if key_value is None: # User pressed cancel or closed the dialog sys.exit("Operation cancelled by user.") @@ -62,20 +56,13 @@ def prompt_and_save_api_key(self, key_name, key_description): self.save_api_key_to_env(key_name, key_value) load_dotenv() # Reload environment variables # Update the instance attribute with the new key - print("[prompt_and_save_api_key] key_name", key_name) - print("[prompt_and_save_api_key] key_value", key_value) if key_value: self.save_api_key_to_env(key_name, key_value) load_dotenv() # Reload environment variables setattr(self, key_name.lower(), key_value) - print("[prompt_and_save_api_key] self.openai_api_key ", self.openai_api_key) - print("[prompt_and_save_api_key] self.google_api_key ", self.google_api_key) @staticmethod def save_api_key_to_env(key_name, key_value): - print("[save_api_key_to_env]") - print("[save_api_key_to_env] key_name", key_name) - print("[save_api_key_to_env] key_value", key_value) with open(".env", "a") as file: file.write(f"\n{key_name}='{key_value}'") From 5bf795c70172025a81fc49193712a9fb52d1d2f1 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 15 Jan 2024 11:16:38 -0800 Subject: [PATCH 60/65] add back clearing --- operate/operate.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index 96aa18b..e653ea7 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -5,6 +5,7 @@ from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt from operate.exceptions import ModelNotRecognizedException +import platform # from operate.models.prompts import USER_QUESTION, get_system_prompt from operate.models.prompts import ( @@ -63,22 +64,21 @@ def main(model, terminal_prompt, voice_mode=False): # Skip message dialog if prompt was given directly if not terminal_prompt: - # message_dialog( - # title="Self-Operating Computer", - # text="An experimental framework to enable multimodal models to operate computers", - # style=style, - # ).run() - pass + message_dialog( + 
title="Self-Operating Computer", + text="An experimental framework to enable multimodal models to operate computers", + style=style, + ).run() else: if VERBOSE: print("Running direct prompt...") # # Clear the console - # if platform.system() == "Windows": - # os.system("cls") - # else: - # print("\033c", end="") + if platform.system() == "Windows": + os.system("cls") + else: + print("\033c", end="") if terminal_prompt: # Skip objective prompt if it was given as an argument objective = terminal_prompt From b16d21aa842be1952b2ca344660fba61d491f756 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 15 Jan 2024 11:17:06 -0800 Subject: [PATCH 61/65] remove unused `fetch_openai_response_async` --- operate/models/apis.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 376beaf..c21b24e 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -303,28 +303,6 @@ async def call_gpt_4_vision_preview_labeled(messages, objective): return call_gpt_4_vision_preview(messages) -async def fetch_openai_response_async(messages): - url = "https://api.openai.com/v1/chat/completions" - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {config.openai_api_key}", - } - data = { - "model": "gpt-4-vision-preview", - "messages": messages, - "frequency_penalty": 1, - "presence_penalty": 1, - "temperature": 0.7, - "max_tokens": 300, - } - - async with aiohttp.ClientSession() as session: - async with session.post( - url, headers=headers, data=json.dumps(data) - ) as response: - return await response.json() - - def get_last_assistant_message(messages): """ Retrieve the last message from the assistant in the messages array. From 47b058177fb12eda06fa0ea1cb67f5264bfd2539 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Mon, 15 Jan 2024 11:22:13 -0800 Subject: [PATCH 62/65] turn off `verbose` --- operate/config.py | 2 +- operate/models/apis.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/operate/config.py b/operate/config.py index e04b65d..28b6a9a 100644 --- a/operate/config.py +++ b/operate/config.py @@ -17,7 +17,7 @@ class Config: def __init__(self): load_dotenv() - self.verbose = True + self.verbose = False self.openai_api_key = os.getenv("OPENAI_API_KEY") self.google_api_key = os.getenv("GOOGLE_API_KEY") diff --git a/operate/models/apis.py b/operate/models/apis.py index c21b24e..8ce2785 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -4,7 +4,6 @@ import base64 import io -import aiohttp from PIL import Image From e9bef10337921141f69ed823492f60b1656cfdc7 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Tue, 16 Jan 2024 05:52:34 -0800 Subject: [PATCH 63/65] Switch back `README.md` for now --- README.md | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index c960275..54c043e 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ We will soon be offering API access to our Agent-1-Vision model. If you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com). +### Additional Thoughts +We recognize that some operating system functions may be more efficiently executed with hotkeys such as entering the Browser Address bar using `command + L` rather than by simulating a mouse click at the correct XY location. We plan to make these improvements over time. 
However, it's important to note that many actions require the accurate selection of visual elements on the screen, necessitating precise XY mouse click locations. A primary focus of this project is to refine the accuracy of determining these click locations. We believe this is essential for achieving a fully self-operating computer in the current technological landscape. ## Demo https://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-c76a-46fb-9b13-03678b3c67e0 @@ -41,26 +43,45 @@ https://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96- ## Quick Start Instructions Below are instructions to set up the Self-Operating Computer Framework locally on your computer. -### Option 1: Quick Start -1. **Install the project locally**: +### Option 1: Traditional Installation + +1. **Clone the repo** to a directory on your computer: ``` -pip install self-operating-computer +git clone https://github.com/OthersideAI/self-operating-computer.git +``` +2. **Cd into directory**: + +``` +cd self-operating-computer ``` -2. **Add your Open AI key. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**: -For Mac: +3. **Create a Python virtual environment**. [Learn more about Python virtual environment](https://docs.python.org/3/library/venv.html). + +``` +python3 -m venv venv +``` +4. **Activate the virtual environment**: +``` +source venv/bin/activate +``` +5. **Install Project Requirements and Command-Line Interface: Instead of using `pip install .`, you can now install the project directly from PyPI with:** ``` -export OPENAI_API_KEY='your-key-here' +pip install self-operating-computer +``` +6. **Then rename the `.example.env` file to `.env` so that you can save your OpenAI key in it.** +``` +mv .example.env .env ``` -For Windows: +7. **Add your Open AI key to your new `.env` file. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**: ``` -set GOOGLE_API_KEY='your_api_key' +OPENAI_API_KEY='your-key-here' ``` -3. **Run it**! + +8. **Run it**! ``` operate ``` -5. **Final Step**: As a last step, the Terminal app will ask for permission for "Screen Recording" and "Accessibility" in the "Security & Privacy" page of Mac's "System Preferences". +9. **Final Step**: As a last step, the Terminal app will ask for permission for "Screen Recording" and "Accessibility" in the "Security & Privacy" page of Mac's "System Preferences".
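A note on the configuration flow that patches 54–58 converge on: API keys are read from `.env` via `python-dotenv`, an `OpenAI` client is constructed on demand, and a missing key is requested interactively and appended back to `.env`. The sketch below condenses the client-bootstrap half of that pattern; it assumes `python-dotenv` and `openai` are installed, and `make_openai_client` is an illustrative name, not a function in the repository.

```python
# Minimal sketch of the Config client bootstrap, under the assumptions above.
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # pull OPENAI_API_KEY (and friends) out of .env


def make_openai_client():
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return None  # the caller can prompt for a key and append it to .env
    client = OpenAI()
    client.api_key = api_key
    # Reading the base URL from the environment, with the client default as
    # fallback, lets a compatible endpoint be swapped in without code changes.
    client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
    return client
```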
From 12b29d0667687d925f4657aebf986b42a7eb7914 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Tue, 16 Jan 2024 06:29:06 -0800
Subject: [PATCH 64/65] Add another useful `print`

---
 operate/models/apis.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/operate/models/apis.py b/operate/models/apis.py
index 8ce2785..da5de6e 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -2,7 +2,7 @@
 import time
 import json
 import base64
-
+import traceback

 import io

@@ -114,10 +114,10 @@ def call_gpt_4_vision_preview(messages):
                 "[Self Operating Computer][get_next_action][call_gpt_4_v] content",
                 content,
             )
-        messages.append(assistant_message)
-
         content = json.loads(content)

+        messages.append(assistant_message)
+
         return content
     except Exception as e:
         print(
             f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying again {ANSI_RESET}",
             e,
         )
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
+            content,
+        )
+        traceback.print_exc()
         return call_gpt_4_vision_preview(messages)

From 01bd8504906876dd557dd2d82c14c0f106d53e08 Mon Sep 17 00:00:00 2001
From: Josh Bickett
Date: Tue, 16 Jan 2024 06:36:30 -0800
Subject: [PATCH 65/65] Minor prompt improvements

---
 operate/models/prompts.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/operate/models/prompts.py b/operate/models/prompts.py
index f0bd519..e3c9f71 100644
--- a/operate/models/prompts.py
+++ b/operate/models/prompts.py
@@ -12,7 +12,7 @@
 You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.

 1. click - Move mouse and click
-[{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format
+[{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format

 2. write - Write with your keyboard
 [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
@@ -49,7 +49,7 @@
 Objective: {objective} # take the next best action for this objective
 """

-SYSTEM_PROMPT_WIN_LINUX= """
+SYSTEM_PROMPT_WIN_LINUX = """
 You are operating a computer, using the same operating system as a human.

 From looking at the screen, the objective, and your previous actions, take the next best series of action.

 You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.

 1. click - Move mouse and click
-[{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format
+[{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format

 2. write - Write with your keyboard
 [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
@@ -216,7 +216,7 @@ def get_system_prompt(objective):
         prompt = SYSTEM_PROMPT_MAC.format(objective=objective)
     else:
         prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective)
-
+
     return prompt


@@ -228,7 +228,7 @@ def get_system_prompt_labeled(objective):
         prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
     else:
         prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
-
+
     return prompt
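The `'percent'` → `"percent"` wording this final patch normalizes points at an operational detail: the model returns click coordinates as decimal fractions of the screen, and they have to be scaled to pixels before `pyautogui` can act on them. A rough sketch of that conversion, assuming a single display; `execute_click` is a hypothetical helper, not project code:

```python
# Sketch: turn a percent-style click operation into a real pyautogui click.
# `execute_click` is illustrative only; the operation shape follows the
# system prompts above.
import pyautogui


def execute_click(operation: dict) -> None:
    # e.g. {"thought": "...", "operation": "click", "x": "0.10", "y": "0.13"}
    width, height = pyautogui.size()  # primary screen size in pixels
    x = int(float(operation["x"]) * width)
    y = int(float(operation["y"]) * height)
    pyautogui.click(x, y)  # moves the cursor to (x, y) and clicks
```

Keeping the coordinates resolution-independent means the same model output stays valid across different screen sizes, which is why the prompts insist on decimals rather than raw pixels.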
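And since `get_system_prompt` now branches on `platform.system()`, both templates can be exercised from a single machine by stubbing that call. A quick sanity check along these lines, assuming the package is importable as `operate.models.prompts`:

```python
# Sketch: verify the platform dispatch without switching machines.
from unittest import mock

from operate.models.prompts import get_system_prompt

with mock.patch("platform.system", return_value="Windows"):
    win_prompt = get_system_prompt("open a browser")
with mock.patch("platform.system", return_value="Darwin"):
    mac_prompt = get_system_prompt("open a browser")

# The Windows/Linux template carries its own hotkey examples.
assert "Opens Menu Search on Windows and Linux" in win_prompt
assert win_prompt != mac_prompt
```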