diff --git a/GUI_README.md b/GUI_README.md new file mode 100644 index 00000000..c819af77 --- /dev/null +++ b/GUI_README.md @@ -0,0 +1,121 @@ +# Self-Operating Computer GUI + +A graphical user interface for the Self-Operating Computer, allowing easy interaction with AI models to automate computer tasks. + +## Features + +- **Intuitive Chat Interface**: Communicate with the Self-Operating Computer through a familiar chat interface +- **Live Screenshot Preview**: See what the AI sees in real-time +- **Model Selection**: Choose from multiple AI models including GPT-4, Claude, Qwen, and more +- **Voice Control**: Speak your commands using the built-in voice recognition (requires whisper_mic) +- **Real-time Logs**: Monitor detailed logs of operations in real-time +- **Multi-platform**: Works on Windows, macOS, and Linux + +## Installation + +### Prerequisites + +- Python 3.8 or higher +- Self-Operating Computer installed and configured +- pip (Python package manager) + +### Required Packages + +```bash +pip install PyQt5 +pip install whisper_mic # Optional, for voice commands +``` + +## Usage + +### Running the GUI + +From the Self-Operating Computer directory: + +```bash +python gui_main.py +``` + +### Command Line Options + +``` +usage: gui_main.py [-h] [-m MODEL] [--verbose] [--light] + +Run the Self-Operating Computer GUI with a specified model. + +optional arguments: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Specify the default model to use + --verbose Run with verbose logging + --light Use light mode instead of dark mode +``` + +### Examples + +```bash +# Run with GPT-4 model and verbose logging +python gui_main.py -m gpt-4-vision --verbose + +# Run with Claude 3 model in light mode +python gui_main.py -m claude-3 --light +``` + +## Interface Guide + +The GUI is divided into several sections: + +1. **Top Bar**: Contains model selection dropdown and verbose mode toggle +2. **Left Panel**: Displays the current screenshot that the AI sees +3. **Right Panel - Top**: Chat history showing your requests and system messages +4. **Right Panel - Bottom**: Detailed logs of operations in real-time +5. **Bottom Input**: Text field for typing tasks, Send button, and voice recording button + +## Model Support + +The GUI supports all models that the Self-Operating Computer supports: + +- GPT-4 Vision +- GPT-4 with SOM (Set-of-Mark prompting) +- GPT-4 with OCR +- Claude 3 +- Claude 3.7 +- Qwen-VL +- O1 with OCR +- Gemini Pro Vision +- LLaVA + +## API Keys + +The GUI uses the same API key configuration as the main Self-Operating Computer. If a required API key is missing, a prompt will appear asking you to enter it. + +## Troubleshooting + +### Voice Recognition Not Working + +Make sure you have installed whisper_mic: +```bash +pip install whisper_mic +``` + +### GUI Not Launching + +Check that PyQt5 is properly installed: +```bash +pip install PyQt5 +``` + +### Model Not Responding + +Ensure your API keys are properly configured in the Self-Operating Computer settings.
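+
+### Example API key configuration
+
+As a minimal sketch, keys live in a `.env` file at the project root, using the variable names referenced in `operate/config.py` (`GOOGLE_API_KEY`, `ANTHROPIC_API_KEY`, `QWEN_API_KEY`) plus the usual `OPENAI_API_KEY` for the GPT-4 based models. The values below are placeholders, and you only need the keys for the models you actually plan to run:
+
+```bash
+OPENAI_API_KEY=your-openai-key        # GPT-4 / OCR / SOM models
+GOOGLE_API_KEY=your-google-key        # gemini-pro-vision
+ANTHROPIC_API_KEY=your-anthropic-key  # claude-3 and claude-3.7
+QWEN_API_KEY=your-qwen-key            # qwen-vl
+```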
+ +## Integration with Existing Codebase + +The GUI integrates seamlessly with the existing Self-Operating Computer codebase: + +- It uses the same `operate.py` functions for executing tasks +- It leverages the same model APIs from `apis.py` +- It inherits configuration from `config.py` +- It preserves the same prompt formats from `prompts.py` + +The UI simply provides a graphical wrapper around these core components, making them more accessible to users who prefer not to use the command line. \ No newline at end of file diff --git a/README.md b/README.md index ab24691c..108bafa6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ome
- A framework to enable multimodal models to operate a computer. + A framework to enable multimodal models to operate a computer, now with a GUI and added double-click, right-click, scroll, and wait operations.
Using the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective. Released Nov 2023, the Self-Operating Computer Framework was one of the first examples of using a multimodal model to view the screen and operate a computer. @@ -20,7 +20,7 @@ ome ## Key Features - **Compatibility**: Designed for various multimodal models. -- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3 and LLaVa.** +- **Integration**: Currently integrated with **GPT-4o, o1, Claude 3.7, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.** - **Future Plans**: Support for additional models. ## Demo @@ -62,6 +62,14 @@ operate -m o1-with-ocr ### Multimodal Models `-m` + +#### Try Claude 3.7 `-m claude-3.7` +Use Claude 3.7 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Anthropic dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. + +``` +operate -m claude-3.7 +``` + Try Google's `gemini-pro-vision` by following the instructions below. Start `operate` with the Gemini model ``` operate -m gemini-pro-vision @@ -76,6 +84,13 @@ Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a operate -m claude-3 ``` +#### Try Qwen-VL `-m qwen-vl` +Use Qwen-VL with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Qwen dashboard](https://bailian.console.aliyun.com/) to get an API key and run the command below to try it. + +``` +operate -m qwen-vl +``` + #### Try LLaVa Hosted Through Ollama `-m llava` If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama! *Note: Ollama currently only supports MacOS and Linux.
Windows now in Preview* diff --git a/gui.py b/gui.py new file mode 100644 index 00000000..ac176bfc --- /dev/null +++ b/gui.py @@ -0,0 +1,563 @@ +import sys +import os +import time +import threading +import asyncio +import platform +import json +import base64 +from PyQt5.QtWidgets import QSizePolicy +from PyQt5.QtWidgets import ( + QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, + QTextEdit, QLineEdit, QPushButton, QComboBox, QCheckBox, + QLabel, QScrollArea, QFrame, QSplitter, QMessageBox, QProgressBar +) +from PyQt5.QtCore import Qt, QThread, pyqtSignal, pyqtSlot, QSize, QTimer +from PyQt5.QtGui import QFont, QIcon, QTextCursor, QColor, QPalette, QPixmap + +# Import directly from local modules +from operate.models.prompts import USER_QUESTION, get_system_prompt +from operate.config import Config +from operate.models.apis import get_next_action +from operate.utils.screenshot import capture_screen_with_cursor +from operate.exceptions import ModelNotRecognizedException +from operate.operate import operate, get_scaling_factor + +# Setup config +config = Config() + +# Define available models - match the models in apis.py +AVAILABLE_MODELS = [ + "gpt-4-vision", + "gpt-4-with-som", + "gpt-4-with-ocr", + "claude-3", + "claude-3.7", + "qwen-vl", + "o1-with-ocr", + "gemini-pro-vision", + "llava" +] + + +class LogRedirector: + """Redirects print output to the GUI log window""" + + def __init__(self, text_widget): + self.text_widget = text_widget + self.original_stdout = sys.stdout + self.original_stderr = sys.stderr + + def write(self, text): + self.original_stdout.write(text) + self.text_widget.append(text) + # Auto-scroll to bottom + self.text_widget.moveCursor(QTextCursor.End) + + def flush(self): + self.original_stdout.flush() + QApplication.processEvents() + + +class RecordButton(QPushButton): + """Custom button for voice recording that changes appearance when pressed""" + + def __init__(self, parent=None): + super().__init__(parent) + self.setText("Hold to Record") + self.setCheckable(True) + self.setStyleSheet(""" + QPushButton { + background-color: #f0f0f0; + border: 2px solid #c0c0c0; + border-radius: 15px; + padding: 8px; + color: #404040; + } + QPushButton:checked { + background-color: #ff4444; + color: white; + border: 2px solid #dd2222; + } + """) + self.mic = None + + +class ScreenshotDisplay(QLabel): + """Widget to display the current screenshot""" + + def __init__(self, parent=None): + super().__init__(parent) + self.setAlignment(Qt.AlignCenter) + self.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding) + self.setMinimumHeight(200) + self.setStyleSheet("background-color: #121212; border: 1px solid #333;") + self.setText("No screenshot available") + + def update_screenshot(self, filename): + if os.path.exists(filename): + pixmap = QPixmap(filename) + # Scale pixmap to fit widget while maintaining aspect ratio + scaled_pixmap = pixmap.scaled( + self.width(), self.height(), + Qt.KeepAspectRatio, Qt.SmoothTransformation + ) + self.setPixmap(scaled_pixmap) + else: + self.setText("Screenshot not found") + + def resizeEvent(self, event): + # If we have a pixmap, rescale it when the widget is resized + if hasattr(self, 'pixmap') and self.pixmap(): + scaled_pixmap = self.pixmap().scaled( + self.width(), self.height(), + Qt.KeepAspectRatio, Qt.SmoothTransformation + ) + self.setPixmap(scaled_pixmap) + super().resizeEvent(event) + + +class OperateThread(QThread): + update_signal = pyqtSignal(str) + completed_signal = pyqtSignal() + error_signal = pyqtSignal(str) + 
screenshot_signal = pyqtSignal(str) + + def __init__(self, model, objective, voice_mode=False, verbose_mode=False): + super().__init__() + self.model = model + self.objective = objective + self.voice_mode = voice_mode + self.verbose_mode = verbose_mode + self.running = True + + def run(self): + try: + config.verbose = self.verbose_mode + config.validation(self.model, self.voice_mode) + + mic = None + if self.voice_mode: + try: + from whisper_mic import WhisperMic + mic = WhisperMic() + self.update_signal.emit("Voice recognition initialized.") + except ImportError: + self.error_signal.emit( + "Voice mode requires 'whisper_mic' module. Install with 'pip install -r requirements-audio.txt'") + return + + system_prompt = get_system_prompt(self.model, self.objective) + system_message = {"role": "system", "content": system_prompt} + messages = [system_message] + loop_count = 0 + session_id = None + + self.update_signal.emit(f"Starting task: {self.objective}") + + task_completed = False + while not task_completed and self.running: + if config.verbose: + self.update_signal.emit(f"[Self Operating Computer] loop_count {loop_count}") + + # Capture screenshot for UI + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + capture_screen_with_cursor(screenshot_filename) + self.screenshot_signal.emit(screenshot_filename) + + # Get next action from the model + operations, session_id = self.run_async( + get_next_action(self.model, messages, self.objective, session_id) + ) + + # Process the operations and update task_completed accordingly + task_completed = operate(operations, session_id, self.model) + + loop_count += 1 + if loop_count > 10: + task_completed = True + self.update_signal.emit("[Self-Operating Computer] Max loop count reached. 
Task considered complete.") + + # If the thread was stopped by the user, we can check the running flag: + if not self.running: + self.update_signal.emit("Task stopped by the user.") + else: + self.update_signal.emit("Task completed.") + self.completed_signal.emit() + + except Exception as e: + self.error_signal.emit(f"Thread error: {str(e)}") + + def stop(self): + self.running = False + + def run_async(self, coroutine): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(coroutine) + finally: + loop.close() + + + +class VoiceRecordingThread(QThread): + finished_signal = pyqtSignal(str) + + def __init__(self, mic): + super().__init__() + self.mic = mic + + def run(self): + try: + # Call listen() without a stop_flag since it's not supported + result = self.mic.listen() + self.finished_signal.emit(result) + except Exception as e: + self.finished_signal.emit(f"Error: {str(e)}") + +class MessageWidget(QFrame): + """Widget to display a single message in the chat view""" + + def __init__(self, text, is_user=False, parent=None): + super().__init__(parent) + self.setFrameShape(QFrame.StyledPanel) + self.setStyleSheet( + "background-color: #c8c8c8; border-radius: 10px; margin: 5px;" if is_user else + "background-color: #d0d0d0; border-radius: 10px; margin: 5px;" + ) + + layout = QVBoxLayout(self) + + # Add a label for the sender + sender = QLabel("You:" if is_user else "System:") + sender.setStyleSheet("font-weight: bold; color: #333;") + layout.addWidget(sender) + + # Add the message text + message = QLabel(text) + message.setWordWrap(True) + message.setTextInteractionFlags(Qt.TextSelectableByMouse) + layout.addWidget(message) + + self.setLayout(layout) + + + +class SOCChatWindow(QMainWindow): + """Main chat window for the Self-Operating Computer interface""" + + def __init__(self): + super().__init__() + + self.setWindowTitle("Self-Operating Computer") + self.setMinimumSize(1000, 700) + + # Initialize mic to None + self.mic = None + self.operate_thread = None + + self.init_ui() + + # Try to initialize whisper_mic if available + try: + from whisper_mic import WhisperMic + self.mic = WhisperMic() + self.record_button.setEnabled(True) + except ImportError: + self.record_button.setEnabled(False) + self.record_button.setToolTip("Install whisper_mic module to use voice") + + def init_ui(self): + """Initialize the user interface""" + # Create the central widget and main layout + central_widget = QWidget() + main_layout = QVBoxLayout(central_widget) + + # Settings bar at the top + settings_layout = QHBoxLayout() + + # Model selection dropdown + model_label = QLabel("Model:") + self.model_combo = QComboBox() + self.model_combo.addItems(AVAILABLE_MODELS) + self.model_combo.setCurrentIndex( + AVAILABLE_MODELS.index("gpt-4-with-ocr") if "gpt-4-with-ocr" in AVAILABLE_MODELS else 0) + + # Verbose mode checkbox + self.verbose_checkbox = QCheckBox("Verbose Logs") + + # Add widgets to settings layout + settings_layout.addWidget(model_label) + settings_layout.addWidget(self.model_combo) + settings_layout.addWidget(self.verbose_checkbox) + settings_layout.addStretch(1) + + # Add settings to main layout + main_layout.addLayout(settings_layout) + + # Create a horizontal splitter for screenshot and chat views + h_splitter = QSplitter(Qt.Horizontal) + + # Left panel - Screenshot view + screenshot_container = QWidget() + screenshot_layout = QVBoxLayout(screenshot_container) + + # Screenshot label + screenshot_label = QLabel("Screen Preview:") + 
screenshot_layout.addWidget(screenshot_label) + + # Screenshot display + self.screenshot_display = ScreenshotDisplay() + screenshot_layout.addWidget(self.screenshot_display) + + h_splitter.addWidget(screenshot_container) + + # Right panel - Chat view and log + chat_log_splitter = QSplitter(Qt.Vertical) + + # Chat view area (top part of right panel) + chat_container = QWidget() + chat_layout = QVBoxLayout(chat_container) + + # Create the scrollable chat view + self.chat_scroll_area = QScrollArea() + self.chat_scroll_area.setWidgetResizable(True) + self.chat_content = QWidget() + self.chat_content_layout = QVBoxLayout(self.chat_content) + self.chat_content_layout.addStretch(1) # Push messages to the top + + self.chat_scroll_area.setWidget(self.chat_content) + chat_layout.addWidget(self.chat_scroll_area) + + # Input area + input_layout = QHBoxLayout() + + # Text input field + self.text_input = QLineEdit() + self.text_input.setPlaceholderText("Type your request here...") + self.text_input.returnPressed.connect(self.send_message) + + # Record button + self.record_button = RecordButton() + self.record_button.pressed.connect(self.start_recording) + self.record_button.released.connect(self.stop_recording) + + # Send button + self.send_button = QPushButton("Send") + self.send_button.clicked.connect(self.send_message) + + # **New Stop button** + self.stop_button = QPushButton("Stop") + self.stop_button.clicked.connect(self.stop_task) + + # Add widgets to input layout + input_layout.addWidget(self.text_input) + input_layout.addWidget(self.record_button) + input_layout.addWidget(self.send_button) + input_layout.addWidget(self.stop_button) # Add the Stop button + + # Add input area to chat layout + chat_layout.addLayout(input_layout) + + # Log view (bottom part of right panel) + self.log_view = QTextEdit() + self.log_view.setReadOnly(True) + self.log_view.setStyleSheet("font-family: Consolas, monospace; background-color: #222; color: #ddd;") + + # Add chat view and log view to the chat_log_splitter + chat_log_splitter.addWidget(chat_container) + chat_log_splitter.addWidget(self.log_view) + chat_log_splitter.setStretchFactor(0, 3) # Give chat view more space + chat_log_splitter.setStretchFactor(1, 2) + + # Add chat_log_splitter to the right side of h_splitter + h_splitter.addWidget(chat_log_splitter) + h_splitter.setStretchFactor(0, 1) # Screenshot area + h_splitter.setStretchFactor(1, 2) # Chat + log area + + # Add h_splitter to main layout + main_layout.addWidget(h_splitter) + + # Add progress indicator at the bottom (hidden by default) + self.progress_bar = QProgressBar() + self.progress_bar.setRange(0, 0) # Indeterminate mode + self.progress_bar.setVisible(False) + main_layout.addWidget(self.progress_bar) + + # Set the central widget + self.setCentralWidget(central_widget) + + # Redirect stdout to the log view + self.log_redirector = LogRedirector(self.log_view) + sys.stdout = self.log_redirector + sys.stderr = self.log_redirector + + # Add a welcome message to the chat + self.add_message("Welcome to Self-Operating Computer! 
What would you like done?", is_user=False) + + # Set focus to the text input + self.text_input.setFocus() + + # Check for screenshots directory and display the latest screenshot if available + screenshots_dir = "screenshots" + if os.path.exists(screenshots_dir): + screenshot_files = [f for f in os.listdir(screenshots_dir) if f.endswith('.png')] + if screenshot_files: + latest_screenshot = os.path.join(screenshots_dir, sorted(screenshot_files)[-1]) + self.screenshot_display.update_screenshot(latest_screenshot) + + def add_message(self, text, is_user=True): + """Add a message to the chat view""" + message_widget = MessageWidget(text, is_user) + self.chat_content_layout.insertWidget(self.chat_content_layout.count() - 1, message_widget) + + # Scroll to the bottom to show the new message + self.chat_scroll_area.verticalScrollBar().setValue( + self.chat_scroll_area.verticalScrollBar().maximum() + ) + + def send_message(self): + """Send a message and start processing the task""" + text = self.text_input.text().strip() + if not text: + return + + # Add the message to the chat view + self.add_message(text, is_user=True) + self.text_input.clear() + + # Start processing in a separate thread + self.process_task(text) + + def process_task(self, objective): + """Process a task in a separate thread""" + # Disable input while processing + self.text_input.setEnabled(False) + self.send_button.setEnabled(False) + self.record_button.setEnabled(False) + self.model_combo.setEnabled(False) + self.verbose_checkbox.setEnabled(False) + + # Show progress indicator + self.progress_bar.setVisible(True) + + # Get selected model and verbose setting + model = self.model_combo.currentText() + verbose = self.verbose_checkbox.isChecked() + + # Create and start the thread + self.operate_thread = OperateThread(model, objective, False, verbose) + self.operate_thread.update_signal.connect(self.update_log) + self.operate_thread.completed_signal.connect(self.task_completed) + self.operate_thread.error_signal.connect(self.handle_error) + self.operate_thread.screenshot_signal.connect(self.update_screenshot) + self.operate_thread.start() + + @pyqtSlot() + def stop_task(self): + if self.operate_thread is not None and self.operate_thread.isRunning(): + self.operate_thread.stop() # Signal the thread to stop + self.operate_thread.wait() # Wait for it to finish + self.add_message("Task stopped by the user.", is_user=False) + + # Re-enable input and hide progress indicator + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.record_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + self.progress_bar.setVisible(False) + self.text_input.setFocus() + + @pyqtSlot(str) + def update_log(self, text): + """Update the log view with new text""" + print(text) + + @pyqtSlot(str) + def update_screenshot(self, filename): + """Update the screenshot display with the latest screenshot""" + self.screenshot_display.update_screenshot(filename) + + @pyqtSlot() + def task_completed(self): + """Handle task completion""" + # Add completion message to chat + self.add_message("Task completed! 
What would you like to do next?", is_user=False) + + # Re-enable input + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + if self.mic: + self.record_button.setEnabled(True) + + # Hide progress indicator + self.progress_bar.setVisible(False) + + # Set focus back to text input + self.text_input.setFocus() + + @pyqtSlot(str) + def handle_error(self, error_text): + """Handle errors from the operate thread""" + print(f"ERROR: {error_text}") + self.add_message(f"An error occurred: {error_text}", is_user=False) + + # Re-enable input + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + if self.mic: + self.record_button.setEnabled(True) + + # Hide progress indicator + self.progress_bar.setVisible(False) + + # Set focus back to text input + self.text_input.setFocus() + + def start_recording(self): + """Start voice recording""" + if not self.mic: + return + + self.record_thread = VoiceRecordingThread(self.mic) + self.record_thread.finished_signal.connect(self.process_voice_result) + self.record_thread.start() + + def stop_recording(self): + """Stop voice recording gracefully.""" + if hasattr(self, 'record_thread') and self.record_thread.isRunning(): + self.record_thread.stop() # signal the thread to stop + self.record_thread.wait(2000) # wait up to 2 seconds for the thread to finish + + @pyqtSlot(str) + def process_voice_result(self, result): + """Process the result from voice recognition""" + if result.startswith("Error:"): + QMessageBox.warning(self, "Voice Recognition Error", result) + return + + # Set the recognized text to the input field and send it + self.text_input.setText(result) + self.send_message() + + def closeEvent(self, event): + """Handle window close event""" + # Stop any running thread + if self.operate_thread and self.operate_thread.isRunning(): + self.operate_thread.stop() + self.operate_thread.wait() + + # Restore stdout and stderr + sys.stdout = self.log_redirector.original_stdout + sys.stderr = self.log_redirector.original_stderr + + event.accept() \ No newline at end of file diff --git a/gui_main.py b/gui_main.py new file mode 100644 index 00000000..f0154db6 --- /dev/null +++ b/gui_main.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +""" +Self-Operating Computer GUI +""" +import sys +import os +import argparse +from PyQt5.QtWidgets import QApplication + +# Add the root directory to the system path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Import after setting path +from operate.config import Config +from operate.utils.style import ANSI_BRIGHT_MAGENTA +from gui import SOCChatWindow + + +def main_entry(): + """ + Main entry point for the Self-Operating Computer GUI + """ + parser = argparse.ArgumentParser( + description="Run the Self-Operating Computer GUI with a specified model." 
+ ) + parser.add_argument( + "-m", + "--model", + help="Specify the default model to use", + required=False, + default="gpt-4-with-ocr", + ) + + # Add a flag for verbose mode + parser.add_argument( + "--verbose", + help="Run with verbose logging", + action="store_true", + ) + + # Allow for dark or light mode + parser.add_argument( + "--light", + help="Use light mode instead of dark mode", + action="store_true", + ) + + try: + args = parser.parse_args() + + # Create Qt application + app = QApplication(sys.argv) + app.setStyle("Fusion") + + # Apply dark mode palette unless light mode is requested + if not args.light: + from PyQt5.QtGui import QPalette, QColor + from PyQt5.QtCore import Qt + + palette = QPalette() + palette.setColor(QPalette.Window, QColor(53, 53, 53)) + palette.setColor(QPalette.WindowText, Qt.white) + palette.setColor(QPalette.Base, QColor(25, 25, 25)) + palette.setColor(QPalette.AlternateBase, QColor(53, 53, 53)) + palette.setColor(QPalette.ToolTipBase, Qt.white) + palette.setColor(QPalette.ToolTipText, Qt.white) + palette.setColor(QPalette.Text, Qt.white) + palette.setColor(QPalette.Button, QColor(53, 53, 53)) + palette.setColor(QPalette.ButtonText, Qt.white) + palette.setColor(QPalette.BrightText, Qt.red) + palette.setColor(QPalette.Link, QColor(42, 130, 218)) + palette.setColor(QPalette.Highlight, QColor(42, 130, 218)) + palette.setColor(QPalette.HighlightedText, Qt.black) + app.setPalette(palette) + + # Initialize configuration + config = Config() + config.verbose = args.verbose + + # Create and show the main window + window = SOCChatWindow() + + # Set the default model based on command-line argument + model_index = window.model_combo.findText(args.model) + if model_index >= 0: + window.model_combo.setCurrentIndex(model_index) + + # Set verbose checkbox based on command-line argument + window.verbose_checkbox.setChecked(args.verbose) + + # Show the window + window.show() + + # Run the application + sys.exit(app.exec_()) + + except KeyboardInterrupt: + print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...") + except Exception as e: + print(f"Error starting GUI: {str(e)}") + sys.exit(1) + + +if __name__ == "__main__": + main_entry() \ No newline at end of file diff --git a/operate/config.py b/operate/config.py index b97b20ac..6d72cb13 100644 --- a/operate/config.py +++ b/operate/config.py @@ -44,6 +44,10 @@ def __init__(self): None # instance variables are backups in case saving to a `.env` fails ) + self.qwen_api_key = ( + None # instance variables are backups in case saving to a `.env` fails + ) + def initialize_openai(self): if self.verbose: print("[Config][initialize_openai]") @@ -66,6 +70,29 @@ def initialize_openai(self): client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) return client + def initialize_qwen(self): + if self.verbose: + print("[Config][initialize_qwen]") + + if self.qwen_api_key: + if self.verbose: + print("[Config][initialize_qwen] using cached qwen_api_key") + api_key = self.qwen_api_key + else: + if self.verbose: + print( + "[Config][initialize_qwen] no cached qwen_api_key, try to get from env." 
+ ) + api_key = os.getenv("QWEN_API_KEY") + + client = OpenAI( + api_key=api_key, + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + ) + client.api_key = api_key + client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" + return client + def initialize_google(self): if self.google_api_key: if self.verbose: @@ -97,9 +124,16 @@ def initialize_ollama(self): def initialize_anthropic(self): if self.anthropic_api_key: + if self.verbose: + print("[Config][initialize_anthropic] using cached anthropic_api_key") api_key = self.anthropic_api_key else: + if self.verbose: + print( + "[Config][initialize_anthropic] no cached anthropic_api_key, try to get from env." + ) api_key = os.getenv("ANTHROPIC_API_KEY") + return anthropic.Anthropic(api_key=api_key) def validation(self, model, voice_mode): @@ -119,8 +153,10 @@ def validation(self, model, voice_mode): "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision" ) self.require_api_key( - "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" + "ANTHROPIC_API_KEY", "Anthropic API key", + model == "claude-3" or model == "claude-3.7" ) + self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") def require_api_key(self, key_name, key_description, is_required): key_exists = bool(os.environ.get(key_name)) @@ -147,6 +183,8 @@ def prompt_and_save_api_key(self, key_name, key_description): self.google_api_key = key_value elif key_name == "ANTHROPIC_API_KEY": self.anthropic_api_key = key_value + elif key_name == "QWEN_API_KEY": + self.qwen_api_key = key_value self.save_api_key_to_env(key_name, key_value) load_dotenv() # Reload environment variables # Update the instance attribute with the new key diff --git a/operate/models/apis.py b/operate/models/apis.py index d0ccb0c4..7db63ac1 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -25,6 +25,7 @@ ) from operate.utils.ocr import get_text_coordinates, get_text_element from operate.utils.screenshot import capture_screen_with_cursor +from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET # Load configuration @@ -37,6 +38,11 @@ async def get_next_action(model, messages, objective, session_id): print("[Self-Operating Computer][get_next_action] model", model) if model == "gpt-4": return call_gpt_4o(messages), None + if model == "claude-3.7": + return call_claude_37(messages), None + if model == "qwen-vl": + operation = await call_qwen_vl_with_ocr(messages, objective, model) + return operation, None if model == "gpt-4-with-som": operation = await call_gpt_4o_labeled(messages, objective, model) return operation, None @@ -136,6 +142,731 @@ def call_gpt_4o(messages): return call_gpt_4o(messages) +def extract_target_from_text(text): + """ + Extract target file/folder names from text with intelligent priority. 
+ + Args: + text (str): Text to analyze (thought or operation text) + + Returns: + str: The extracted target description + """ + import re + + # Priority 1: Look for quoted text which often indicates file/folder names + quoted_pattern = re.compile(r"['\"]([^'\"]+)['\"]") + quoted_matches = quoted_pattern.findall(text) + if quoted_matches: + return quoted_matches[0] + + # Priority 2: Look for file/folder patterns (word-word or words with extensions) + file_pattern = re.compile(r"(\w+[-\.]\w+[-\.]\w+|\w+[-\.]\w+)") + file_matches = file_pattern.findall(text) + for match in file_matches: + # Filter out things that don't look like folder/file names + if any(x in match.lower() for x in ['-main', 'folder', 'file', 'image', 'doc', '.', 'sbc']): + return match + + # Priority 3: Look for phrases after "click on X" or "open X" + click_phrases = ["click on ", "click the ", "clicking on ", "clicking the ", "open ", "opening "] + for phrase in click_phrases: + if phrase in text.lower(): + parts = text.lower().split(phrase, 1) + if len(parts) > 1: + # Extract up to a period, comma, or space + target = parts[1].split(".")[0].split(",")[0].strip() + # Only return if it's not too long (likely not a file name if very long) + if 2 <= len(target.split()) <= 5: + return target + + # Priority 4: Look for capitalized words which might be file/folder names + cap_word_pattern = re.compile(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b') + cap_matches = cap_word_pattern.findall(text) + if cap_matches: + # Filter to likely file/folder names + likely_matches = [m for m in cap_matches if len(m) > 3] + if likely_matches: + return likely_matches[0] + + # Default: just return the original text if nothing better found + return text + + +def find_ui_element_by_text_and_vision(target_description, screenshot_filename): + """ + Finds UI elements using multiple methods: text OCR, template matching, and shape detection. + Specialized for finding desktop icons, folders, and common UI elements. 
+ + Args: + target_description (str): Description of what we're trying to find (e.g., "sbc-images-main") + screenshot_filename (str): Path to screenshot file + + Returns: + tuple: (x_percent, y_percent) coordinates as percentages of screen width/height, or None if not found + """ + import cv2 + import numpy as np + from PIL import Image + import easyocr + import os + import re + + # Clean up the target description for better matching + target_words = target_description.lower().split() + # Remove common words that don't help with identification + stop_words = ['the', 'a', 'an', 'to', 'on', 'in', 'by', 'it', 'this', 'that', 'for', 'with', 'click', 'double'] + target_words = [word for word in target_words if word not in stop_words] + clean_target = ' '.join(target_words) + + print(f"[Target Finder] Looking for: '{clean_target}'") + + # Load the screenshot + screenshot = Image.open(screenshot_filename) + screenshot_np = np.array(screenshot) + screenshot_rgb = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR) + + # Create a debug image to visualize findings + debug_img = screenshot_rgb.copy() + + # Results will store all potential matches with their confidence scores + results = [] + + # APPROACH 1: Template matching with saved templates + icon_folder = "icon_templates" + if os.path.exists(icon_folder) and any(os.listdir(icon_folder)): + for filename in os.listdir(icon_folder): + if filename.endswith(('.png', '.jpg')): + # Extract the template name for matching + template_name = filename.replace('_', ' ').replace('.png', '').replace('.jpg', '') + + # Check if template name matches any part of the target + if any(word in template_name.lower() for word in target_words) or \ + any(word in clean_target for word in template_name.lower().split()): + + template_path = os.path.join(icon_folder, filename) + template = cv2.imread(template_path) + + if template is None: + continue + + # Apply template matching + res = cv2.matchTemplate(screenshot_rgb, template, cv2.TM_CCOEFF_NORMED) + min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res) + + if max_val > 0.7: # Good match + template_h, template_w = template.shape[:2] + top_left = max_loc + bottom_right = (top_left[0] + template_w, top_left[1] + template_h) + center_x = top_left[0] + template_w // 2 + center_y = top_left[1] + template_h // 2 + + # Add to results with high confidence since it's a template match + match_score = max_val * 1.5 # Boost template matches + results.append({ + "type": "template", + "confidence": match_score, + "center": (center_x, center_y), + "bbox": (top_left[0], top_left[1], bottom_right[0], bottom_right[1]) + }) + + # Draw on debug image + cv2.rectangle(debug_img, top_left, bottom_right, (0, 255, 0), 2) + cv2.putText(debug_img, f"Template: {template_name} ({match_score:.2f})", + (top_left[0], top_left[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) + + # APPROACH 2: OCR text detection + try: + # Initialize EasyOCR Reader + reader = easyocr.Reader(["en"]) + + # Read the screenshot + ocr_results = reader.readtext(screenshot_filename) + + for idx, (bbox, text, conf) in enumerate(ocr_results): + text_lower = text.lower() + + # Check for any word match + word_match = False + for word in target_words: + if len(word) > 2 and word in text_lower: # Avoid matching very short words + word_match = True + break + + # Calculate match score based on text similarity + if word_match or clean_target in text_lower or text_lower in clean_target: + # Calculate match score + from difflib import SequenceMatcher + similarity = 
SequenceMatcher(None, clean_target, text_lower).ratio() + match_score = similarity * conf + + # Especially boost exact matches or strong partial matches + if similarity > 0.8: + match_score *= 1.5 + + # Get center of text bounding box + bbox_points = np.array(bbox).astype(int) + center_x = np.mean([p[0] for p in bbox_points]) + center_y = np.mean([p[1] for p in bbox_points]) + + # Calculate bounding box rectangle + x_points = [p[0] for p in bbox_points] + y_points = [p[1] for p in bbox_points] + bbox_rect = (min(x_points), min(y_points), max(x_points), max(y_points)) + + # Add to results + results.append({ + "type": "text", + "text": text, + "confidence": match_score, + "center": (center_x, center_y), + "bbox": bbox_rect + }) + + # Draw on debug image + top_left = (int(bbox_rect[0]), int(bbox_rect[1])) + bottom_right = (int(bbox_rect[2]), int(bbox_rect[3])) + cv2.rectangle(debug_img, top_left, bottom_right, (0, 0, 255), 2) + cv2.putText(debug_img, f"OCR: {text} ({match_score:.2f})", + (top_left[0], top_left[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2) + + # For text results, look for potential UI elements above (desktop icon case) + # If this looks like a desktop icon label, the actual icon is likely above it + if any(word in text_lower for word in ['folder', 'file', 'image', 'doc']) or \ + re.search(r'\w+[-\.]\w+', text_lower) or \ + "sbc" in text_lower: + # Define a region above the text to look for the icon + icon_area_width = bbox_rect[2] - bbox_rect[0] + icon_area_height = icon_area_width # Make it square + icon_area_top = max(0, bbox_rect[1] - icon_area_height - 10) # Above text with a small gap + icon_area_left = bbox_rect[0] + + icon_center_x = icon_area_left + icon_area_width // 2 + icon_center_y = icon_area_top + icon_area_height // 2 + + # Add this as a potential icon location with boosted confidence + icon_match_score = match_score * 1.2 # Boost confidence for icon targets + results.append({ + "type": "icon", + "confidence": icon_match_score, + "center": (icon_center_x, icon_center_y), + "bbox": (icon_area_left, icon_area_top, + icon_area_left + icon_area_width, icon_area_top + icon_area_height) + }) + + # Draw the potential icon area + cv2.rectangle(debug_img, + (int(icon_area_left), int(icon_area_top)), + (int(icon_area_left + icon_area_width), int(icon_area_top + icon_area_height)), + (255, 0, 0), 2) + cv2.putText(debug_img, f"Icon target ({icon_match_score:.2f})", + (int(icon_area_left), int(icon_area_top) - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) + + except Exception as e: + print(f"[Target Finder] OCR detection error: {e}") + + # APPROACH 3: Folder icon detection (color/shape based) + if "folder" in clean_target or "file" in clean_target or "sbc" in clean_target: + try: + # Convert to HSV for better color segmentation + hsv = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2HSV) + + # Define color ranges for common folder icons (yellow folders in Windows) + lower_yellow = np.array([20, 100, 100]) + upper_yellow = np.array([40, 255, 255]) + + # Create mask for yellow color + mask = cv2.inRange(hsv, lower_yellow, upper_yellow) + + # Find contours in the mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Filter contours by size (folder icons are usually of similar size) + min_area = 100 + max_area = 5000 + + for contour in contours: + area = cv2.contourArea(contour) + if min_area < area < max_area: + # Get center of contour + M = cv2.moments(contour) + if M["m00"] > 0: + center_x = int(M["m10"] / M["m00"]) + 
center_y = int(M["m01"] / M["m00"]) + + # Get bounding box + x, y, w, h = cv2.boundingRect(contour) + + # Add to results with lower confidence for shape-based detection + match_score = 0.5 # Base confidence for shape detection + results.append({ + "type": "shape", + "confidence": match_score, + "center": (center_x, center_y), + "bbox": (x, y, x + w, y + h) + }) + + # Draw on debug image + cv2.rectangle(debug_img, (x, y), (x + w, y + h), (255, 255, 0), 2) + cv2.putText(debug_img, f"Shape ({match_score:.2f})", + (x, y - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2) + except Exception as e: + print(f"[Target Finder] Shape detection error: {e}") + + # Save the debug image + cv2.imwrite("debug_target_detection.jpg", debug_img) + + if results: + # Sort by confidence + results.sort(key=lambda x: x.get("confidence", 0), reverse=True) + best_match = results[0] + + # Print debug info + print(f"[Target Finder] Best match: {best_match['type']} with confidence {best_match['confidence']:.2f}") + + # Get the center point + center_x, center_y = best_match["center"] + + # Convert to percentage of screen size + screen_width, screen_height = screenshot.size + x_percent = center_x / screen_width + y_percent = center_y / screen_height + + # Mark the final target on the debug image + result_img = cv2.circle(debug_img, (int(center_x), int(center_y)), 10, (0, 255, 255), -1) + cv2.imwrite("debug_final_target.jpg", result_img) + + return (x_percent, y_percent) + + print(f"[Target Finder] No match found for '{clean_target}'") + return None + + +def verify_success(screenshot_before, task_type="open_folder"): + """ + Verifies if an operation was successful by comparing before/after screenshots. + + Args: + screenshot_before: Screenshot taken before the operation + task_type: Type of task we're verifying (open_folder, click_button, etc.) 
+ + Returns: + bool: True if operation appears successful, False otherwise + """ + import cv2 + import numpy as np + import pyautogui + + # Take a screenshot after the operation + screenshot_after = pyautogui.screenshot() + + # Convert to numpy arrays for comparison + before_np = np.array(screenshot_before) + after_np = np.array(screenshot_after) + + # Resize if dimensions don't match + if before_np.shape != after_np.shape: + after_np = cv2.resize(after_np, (before_np.shape[1], before_np.shape[0])) + + # For opening a folder, check for significant window change + if task_type == "open_folder": + # Calculate difference between images + diff = cv2.absdiff(before_np, after_np) + gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY) + _, thresholded = cv2.threshold(gray_diff, 30, 255, cv2.THRESH_BINARY) + + # Calculate percentage of changed pixels + changed_pixels = np.count_nonzero(thresholded) + total_pixels = thresholded.size + change_percentage = (changed_pixels / total_pixels) * 100 + + # Save debug images + cv2.imwrite("debug_before.jpg", cv2.cvtColor(before_np, cv2.COLOR_RGB2BGR)) + cv2.imwrite("debug_after.jpg", cv2.cvtColor(after_np, cv2.COLOR_RGB2BGR)) + cv2.imwrite("debug_diff.jpg", thresholded) + + print(f"[Verification] Screen change: {change_percentage:.2f}%") + + # If significant portion of screen changed, likely a new window opened + return change_percentage > 15 + + return False + + +def call_claude_37(messages): + if config.verbose: + print("[call_claude_37]") + time.sleep(1) + + # Import all required modules + import anthropic + import cv2 + import numpy as np + import re + import pyautogui + from PIL import Image + + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + # Convert PNG to JPEG format to ensure compatibility + img = Image.open(screenshot_filename) + if img.mode in ('RGBA', 'LA'): + # Remove alpha channel for JPEG compatibility + background = Image.new("RGB", img.size, (255, 255, 255)) + background.paste(img, mask=img.split()[3]) # 3 is the alpha channel + img = background + + # Save as JPEG + jpeg_filename = os.path.join(screenshots_dir, "screenshot.jpg") + img.save(jpeg_filename, "JPEG", quality=95) + + with open(jpeg_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + # Determine which prompt to use + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() + + if config.verbose: + print("[call_claude_37] user_prompt", user_prompt) + + # Initialize Anthropic client directly with the environment variable + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + api_key = config.anthropic_api_key # Fallback to instance variable + + if config.verbose: + print("[call_claude_37] Using Anthropic API key (masked):", "*" * len(api_key) if api_key else "None") + + client = anthropic.Anthropic(api_key=api_key) + + # Extract system message + system_content = None + if messages and messages[0]["role"] == "system": + system_content = messages[0]["content"] + user_messages = messages[1:-1] if len(messages) > 1 else [] # Skip system message and last message + else: + user_messages = messages[:-1] if messages else [] # No system message, include all but last + + # Convert previous messages to Anthropic format + anthropic_messages 
= [] + for msg in user_messages: + if msg["role"] in ["user", "assistant"]: # Only include user and assistant messages + anthropic_messages.append({ + "role": msg["role"], + "content": msg["content"] + }) + + # Create vision message for Claude + vision_message = { + "role": "user", + "content": [ + {"type": "text", "text": user_prompt}, + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": img_base64 + } + } + ] + } + + # Add the vision message + anthropic_messages.append(vision_message) + + if config.verbose: + print("[call_claude_37] System content length:", len(system_content) if system_content else 0) + print("[call_claude_37] Number of messages:", len(anthropic_messages)) + + # Create the message request + response = client.messages.create( + model="claude-3-7-sonnet-20250219", + messages=anthropic_messages, + system=system_content, + max_tokens=2048, + ) + + # Extract the content from the response + content = response.content[0].text + + # Check if Claude added text before the JSON + if content.strip().startswith("[") or content.strip().startswith("{"): + # Content is already in JSON format, just clean it + content = clean_json(content) + else: + # Claude might have added a message before the JSON + # Try to find JSON in the content + json_match = re.search(r'(\[.*\]|\{.*\})', content, re.DOTALL) + if json_match: + # Extract the JSON part + content = clean_json(json_match.group(1)) + else: + # If no JSON found, try to create a done operation + if "done" in content.lower() or "complete" in content.lower(): + content = '[{"thought": "Task complete", "operation": "done"}]' + else: + # Create a fallback operation + content = '[{"thought": "Continuing task", "operation": "wait", "duration": 1}]' + + # Log the cleaned content + if config.verbose: + print("[call_claude_37] cleaned content", content) + + # Create assistant message with the original response + assistant_message = {"role": "assistant", "content": response.content[0].text} + + try: + # Try to parse as JSON + parsed_content = json.loads(content) + if config.verbose: + print("[call_claude_37] Successfully parsed content as JSON") + except json.JSONDecodeError as e: + # If JSON parsing fails, create a simple operation + print(f"[call_claude_37] JSON parsing failed: {e}. 
Creating fallback operation.") + parsed_content = [{"thought": "Continuing with task", "operation": "wait", "duration": 1}] + + # Process the operations with enhanced handling + processed_content = [] + + # Check if Claude is trying to do a double-click + need_double_click = False + for operation in parsed_content: + if operation.get("double_click", False): + need_double_click = True + break + if "thought" in operation: + if "double" in operation["thought"].lower() and "click" in operation["thought"].lower(): + need_double_click = True + break + + for i, operation in enumerate(parsed_content): + if operation.get("operation") == "click": + # Extract target description + target_description = "" + if "text" in operation: + target_description = operation.get("text") + elif "thought" in operation: + # Try to extract what we're clicking on from the thought + thought = operation.get("thought", "") + + # Look for quoted text first + quoted_match = re.search(r'[\'"]([^\'\"]+)[\'"]', thought) + if quoted_match: + target_description = quoted_match.group(1) + else: + # Look for instances of "sbc-images-main" or similar patterns + pattern_match = re.search(r'(\b\w+-\w+-\w+\b|\bsbc[- ]\w+\b)', thought, re.IGNORECASE) + if pattern_match: + target_description = pattern_match.group(1) + else: + # Fall back to looking for phrases after click indicators + click_indicators = ["click on", "click the", "clicking on", "clicking the"] + for indicator in click_indicators: + if indicator in thought.lower(): + parts = thought.lower().split(indicator, 1) + if len(parts) > 1: + target_description = parts[1].split(".")[0].split(",")[0].strip() + break + + if not target_description: + target_description = f"target at position ({operation['x']}, {operation['y']})" + + if config.verbose: + print(f"[call_claude_37] Target description: {target_description}") + + # Handle double-clicking if detected + if need_double_click and i == 0: # Only process the first click for double-click + # Extract coordinates + try: + x = operation["x"] + y = operation["y"] + + # Add a special marker to signal double-click + operation["double_click"] = True + + # Log the double-click intention + print( + f"[call_claude_37] Detected double-click operation on '{target_description}' at ({x}, {y})") + except Exception as e: + print(f"[call_claude_37] Error processing double-click: {e}") + + # For double-click operations, we only need to add the first click + # Skip adding second clicks to avoid duplicate operations + if need_double_click and i > 0: + if config.verbose: + print("[call_claude_37] Skipping duplicate click for double-click operation") + continue + + # Add the operation + if config.verbose: + print(f"[call_claude_37] Adding operation: {operation}") + + processed_content.append(operation) + else: + # For non-click operations, just append as is + processed_content.append(operation) + + # Add the assistant message to the history + messages.append(assistant_message) + + # Return the processed content + return processed_content if processed_content else [{"operation": "wait", "duration": 1}] + + except Exception as e: + error_msg = str(e) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. 
Trying again {ANSI_RESET}", + error_msg, + ) + + # Define content_str before using it to avoid the "referenced before assignment" error + content_str = "No content received" + if 'content' in locals(): + content_str = content + + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", + content_str, + ) + + if config.verbose: + traceback.print_exc() + + # If an exception occurs, return a simple operation to keep things moving + return [{"thought": "Continuing task after error", "operation": "wait", "duration": 1}] +async def call_qwen_vl_with_ocr(messages, objective, model): + if config.verbose: + print("[call_qwen_vl_with_ocr]") + + # Construct the path to the file within the package + try: + time.sleep(1) + client = config.initialize_qwen() + + confirm_system_prompt(messages, objective, model) + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + # Call the function to capture the screen with the cursor + raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png") + capture_screen_with_cursor(raw_screenshot_filename) + + # Compress screenshot image to make size be smaller + screenshot_filename = os.path.join(screenshots_dir, "screenshot.jpeg") + compress_screenshot(raw_screenshot_filename, screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() + + vision_message = { + "role": "user", + "content": [ + {"type": "text", + "text": f"{user_prompt}**REMEMBER** Only output json format, do not append any other text."}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + } + messages.append(vision_message) + + response = client.chat.completions.create( + model="qwen2.5-vl-72b-instruct", + messages=messages, + ) + + content = response.choices[0].message.content + + content = clean_json(content) + + # used later for the messages + content_str = content + + content = json.loads(content) + + processed_content = [] + + for operation in content: + if operation.get("operation") == "click": + text_to_click = operation.get("text") + if config.verbose: + print( + "[call_qwen_vl_with_ocr][click] text_to_click", + text_to_click, + ) + # Initialize EasyOCR Reader + reader = easyocr.Reader(["en"]) + + # Read the screenshot + result = reader.readtext(screenshot_filename) + + text_element_index = get_text_element( + result, text_to_click, screenshot_filename + ) + coordinates = get_text_coordinates( + result, text_element_index, screenshot_filename + ) + + # add `coordinates`` to `content` + operation["x"] = coordinates["x"] + operation["y"] = coordinates["y"] + + if config.verbose: + print( + "[call_qwen_vl_with_ocr][click] text_element_index", + text_element_index, + ) + print( + "[call_qwen_vl_with_ocr][click] coordinates", + coordinates, + ) + print( + "[call_qwen_vl_with_ocr][click] final operation", + operation, + ) + processed_content.append(operation) + + else: + processed_content.append(operation) + + # wait to append the assistant message so that if the `processed_content` step fails we don't append a message and mess up message history + assistant_message = {"role": "assistant", "content": content_str} + messages.append(assistant_message) + + return processed_content + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating 
Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" + ) + if config.verbose: + print("[Self-Operating Computer][Operate] error", e) + traceback.print_exc() + return gpt_4_fallback(messages, objective, model) + + def call_gemini_pro_vision(messages, objective): """ Get the next action for Self-Operating Computer using Gemini Pro Vision diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 0a7e0ad1..c3b4e2ad 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -13,28 +13,48 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click ``` [{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ``` -2. write - Write with your keyboard +2. doubleclick - Move mouse and double click +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +``` + +3. rightclick - Move mouse and right click +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed +7. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` +8. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] +``` + Return the actions in array format `[]`. You can take just one action or multiple actions. 
Here a helpful example: @@ -48,7 +68,28 @@ ] ``` -Example 2: Focuses on the address bar in a browser before typing a website +Example 2: Double-clicking to open a file or application +``` +[ + {{ "thought": "I want to open a file or application by double-clicking", "operation": "doubleclick", "x": "0.50", "y": "0.60" }} +] +``` + +Example 3: Right-clicking to open a context menu +``` +[ + {{ "thought": "I want to open the context menu to see available options", "operation": "rightclick", "x": "0.50", "y": "0.60" }} +] +``` + +Example 4: Scrolling down a webpage +``` +[ + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} +] +``` + +Example 5: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -57,6 +98,14 @@ ] ``` +Example 6: Waits for the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }} +] +``` + A few important notes: - Go to Google Docs and Google Sheets by typing in the Chrome Address bar @@ -71,25 +120,48 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` ``` [{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format ``` -2. write - Write with your keyboard + +2. doubleclick - Move mouse and double click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "label": "~x" }}] +``` + +3. rightclick - Move mouse and right click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "label": "~x" }}] +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer + +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed +7. 
done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` + +8. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] +``` + Return the actions in array format `[]`. You can take just one action or multiple actions. Here a helpful example: @@ -102,7 +174,28 @@ ] ``` -Example 2: Focuses on the address bar in a browser before typing a website +Example 2: Double-clicking to open a file or application with a labeled element +``` +[ + {{ "thought": "I want to open a file or application by double-clicking on its labeled element", "operation": "doubleclick", "label": "~42" }} +] +``` + +Example 3: Right-clicking to open a context menu with a labeled element +``` +[ + {{ "thought": "I want to open the context menu for this element to see available options", "operation": "rightclick", "label": "~42" }} +] +``` + +Example 4: Scrolling down a webpage +``` +[ + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "up|down|left|right", "amount": "5" }} +] +``` + +Example 5: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -111,7 +204,7 @@ ] ``` -Example 3: Send a "Hello World" message in the chat +Example 6: Send a "Hello World" message in the chat ``` [ {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, @@ -119,6 +212,14 @@ ] ``` +Example 7: Waits to the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }}] +] +``` + A few important notes: - Go to Google Docs and Google Sheets by typing in the Chrome Address bar @@ -128,31 +229,53 @@ """ -# TODO: Add an example or instruction about `Action: press ['pagedown']` to scroll SYSTEM_PROMPT_OCR = """ You are operating a {operating_system} computer, using the same operating system as a human. From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - Look for text to click. Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. ``` [{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] ``` -2. write - Write with your keyboard + +2. doubleclick - Move mouse and double click - Look for text to double click +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "text": "The text in the item to double click" }}] +``` + +3. 
rightclick - Move mouse and right click - Look for text to right click +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "text": "The text in the item to right click" }}] +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer + +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed + +7. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` +8. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] +``` + Return the actions in array format `[]`. You can take just one action or multiple actions. Here a helpful example: @@ -175,7 +298,28 @@ ] ``` -Example 3: Search for someone on Linkedin when already on linkedin.com +Example 3: Double-clicking to open a file +``` +[ + {{ "thought": "I want to open a file by finding its text label and double-clicking", "operation": "doubleclick", "text": "my_document.txt" }} +] +``` + +Example 4: Right-clicking to open a context menu +``` +[ + {{ "thought": "I want to open the context menu to see available options for this item", "operation": "rightclick", "text": "my_document.txt" }} +] +``` + +Example 5: Scrolling through content +``` +[ + {{ "thought": "I need to scroll down to see more content on the page", "operation": "scroll", "direction": "up|down|left|right", "amount": "5" }} +] +``` + +Example 6: Search for someone on Linkedin when already on linkedin.com ``` [ {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }}, @@ -184,6 +328,14 @@ ] ``` +Example 7: Waits to the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }}] +] +``` + A few important notes: - Default to Google Chrome as the browser @@ -196,17 +348,16 @@ """ OPERATE_FIRST_MESSAGE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 8 operations available: click, doubleclick, rightclick, scroll, write, press, done, wait You just started so you are in the terminal app and your code is running in this terminal tab. To leave the terminal, search for a new program on the OS. Action:""" OPERATE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. 
Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 8 operations available: click, doubleclick, rightclick, scroll, write, press, done, wait Action:""" - def get_system_prompt(model, objective): """ Format the vision prompt more efficiently and print the name of the prompt used @@ -232,7 +383,7 @@ def get_system_prompt(model, objective): os_search_str=os_search_str, operating_system=operating_system, ) - elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3": + elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl": prompt = SYSTEM_PROMPT_OCR.format( objective=objective, diff --git a/operate/operate.py b/operate/operate.py index c63d9851..fb84c3e1 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -2,33 +2,12 @@ import os import time import asyncio +import pyautogui from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt -from operate.exceptions import ModelNotRecognizedException import platform # from operate.models.prompts import USER_QUESTION, get_system_prompt -from operate.models.prompts import ( - USER_QUESTION, - get_system_prompt, -) -from operate.config import Config -from operate.utils.style import ( - ANSI_GREEN, - ANSI_RESET, - ANSI_YELLOW, - ANSI_RED, - ANSI_BRIGHT_MAGENTA, - ANSI_BLUE, - style, -) -from operate.utils.operating_system import OperatingSystem -from operate.models.apis import get_next_action - -# Load configuration -config = Config() -operating_system = OperatingSystem() - def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): """ @@ -42,146 +21,567 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): Returns: None """ + from operate.config import Config + from operate.exceptions import ModelNotRecognizedException + + from operate.utils.style import ( + ANSI_GREEN, + ANSI_RESET, + ANSI_YELLOW, + ANSI_RED, + ANSI_BRIGHT_MAGENTA, + ANSI_BLUE, + style, + ) - mic = None - # Initialize `WhisperMic`, if `voice_mode` is True + from operate.utils.operating_system import OperatingSystem + from operate.models.prompts import ( + USER_QUESTION, + get_system_prompt, + ) - config.verbose = verbose_mode - config.validation(model, voice_mode) + # Load configuration + config = Config() + operating_system = OperatingSystem() + + from operate.models.apis import get_next_action - if voice_mode: - try: - from whisper_mic import WhisperMic + while True: # Add outer loop to enable restarting after completion + mic = None + # Initialize `WhisperMic`, if `voice_mode` is True + + config.verbose = verbose_mode + config.validation(model, voice_mode) + + if voice_mode: + try: + from whisper_mic import WhisperMic + + # Initialize WhisperMic if import is successful + mic = WhisperMic() + except ImportError: + print( + "Voice mode requires the 'whisper_mic' module. 
Please install it using 'pip install -r requirements-audio.txt'" + ) + sys.exit(1) - # Initialize WhisperMic if import is successful - mic = WhisperMic() - except ImportError: + # Skip message dialog if prompt was given directly + if not terminal_prompt: + message_dialog( + title="Self-Operating Computer", + text="An experimental framework to enable multimodal models to operate computers", + style=style, + ).run() + + else: + print("Running direct prompt...") + + # # Clear the console + if platform.system() == "Windows": + os.system("cls") + else: + print("\033c", end="") + + if terminal_prompt and not hasattr(main, 'first_run_complete'): + # Only use the terminal prompt on the first iteration + objective = terminal_prompt + main.first_run_complete = True + elif voice_mode: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)" + ) + try: + objective = mic.listen() + except Exception as e: + print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") + return # Exit if voice input fails + else: print( - "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'" + f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}" ) - sys.exit(1) - - # Skip message dialog if prompt was given directly - if not terminal_prompt: - message_dialog( - title="Self-Operating Computer", - text="An experimental framework to enable multimodal models to operate computers", - style=style, - ).run() - - else: - print("Running direct prompt...") - - # # Clear the console - if platform.system() == "Windows": - os.system("cls") - else: - print("\033c", end="") - - if terminal_prompt: # Skip objective prompt if it was given as an argument - objective = terminal_prompt - elif voice_mode: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)" - ) + print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") + objective = prompt(style=style) + + system_prompt = get_system_prompt(model, objective) + system_message = {"role": "system", "content": system_prompt} + messages = [system_message] + + loop_count = 0 + + session_id = None + + task_completed = False # Flag to indicate if the task was completed + while not task_completed: + if config.verbose: + print("[Self Operating Computer] loop_count", loop_count) + try: + operations, session_id = asyncio.run( + get_next_action(model, messages, objective, session_id) + ) + + # Instead of breaking out of the whole program, we set a flag if "done" is reached + task_completed = operate(operations, session_id, model) + + loop_count += 1 + if loop_count > 10: + task_completed = True # Force completion if loop count exceeds 10 + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Max loop count reached. Moving to next task.{ANSI_RESET}") + except ModelNotRecognizedException as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" + ) + task_completed = True # Exit inner loop and start over + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" + ) + task_completed = True # Exit inner loop and start over + + print(f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Task completed. 
Ready for a new task.") + if terminal_prompt: + # If the session was started with a terminal prompt, we need to clear it after the first use + terminal_prompt = None + + +# def verify_click_target(x_percent, y_percent, target_description, client): +# import pyautogui +# import base64 +# import io +# from PIL import Image, ImageDraw +# +# screen_width, screen_height = pyautogui.size() +# x = int(float(x_percent) * screen_width) +# y = int(float(y_percent) * screen_height) +# +# region_size = 100 +# region_left = max(0, x - region_size) +# region_top = max(0, y - region_size) +# region_width = min(region_size * 2, screen_width - region_left) +# region_height = min(region_size * 2, screen_height - region_top) +# +# region_screenshot = pyautogui.screenshot(region=(region_left, region_top, region_width, region_height)) +# +# draw = ImageDraw.Draw(region_screenshot) +# center_x = x - region_left +# center_y = y - region_top +# line_length = 20 +# draw.line((center_x - line_length, center_y, center_x + line_length, center_y), fill='red', width=2) +# draw.line((center_x, center_y - line_length, center_x, center_y + line_length), fill='red', width=2) +# +# buffer = io.BytesIO() +# region_screenshot.save(buffer, format="JPEG") +# img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8') +# +# try: +# verification_prompt = f""" +# I'm about to click at the position marked with the red crosshair. +# I'm trying to click on: "{target_description}" +# +# Does the crosshair appear to be positioned correctly on or very near the target? +# Respond ONLY with "YES" if it's correct or "NO" if it's wrong. +# """ +# +# response = client.messages.create( +# model="claude-3-7-sonnet-20250219", +# messages=[{ +# "role": "user", +# "content": [ +# {"type": "text", "text": verification_prompt}, +# { +# "type": "image", +# "source": { +# "type": "base64", +# "media_type": "image/jpeg", +# "data": img_base64 +# } +# } +# ] +# }], +# max_tokens=50, +# ) +# +# verification_result = response.content[0].text.strip().upper() +# +# print(f"[Click Verification] Target: {target_description}") +# print(f"[Click Verification] Claude's response: {verification_result}") +# +# region_screenshot.save("debug_last_click_verification.jpg") +# +# return "YES" in verification_result +# +# except Exception as e: +# print(f"[Click Verification] Error during verification: {e}") +# return False + + +import cv2 +import numpy as np +import pyautogui +import os +import io +from PIL import Image, ImageDraw + + +def find_icon_on_screen(target_description): + """ + Uses computer vision to find an icon or UI element that matches the target description. + + Args: + target_description (str): Description of what we're trying to find (e.g., "sbc-images-main folder") + + Returns: + tuple: (x_percent, y_percent) coordinates as percentages of screen width/height, or None if not found + """ + # Take a screenshot of the entire screen + screenshot = pyautogui.screenshot() + screenshot_np = np.array(screenshot) + screenshot_rgb = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR) + + # Save the screenshot for debugging + cv2.imwrite("debug_full_screen.jpg", screenshot_rgb) + + # Initialize results + results = [] + + # 1. 
Text detection for folder/file names (optional, requires pytesseract) + try: + import pytesseract + gray = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2GRAY) + + # Extract text from the screenshot + text_data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT) + + # Look for the target text in detected text + target_words = target_description.lower().split() + + for i, text in enumerate(text_data['text']): + if text and any(word in text.lower() for word in target_words): + # Get coordinates for this text + x = text_data['left'][i] + text_data['width'][i] // 2 + y = text_data['top'][i] + text_data['height'][i] // 2 + + # Add to results with high confidence + results.append((x, y, 0.9)) # 0.9 is confidence score + + # Draw a rectangle around the text for debugging + x1, y1 = text_data['left'][i], text_data['top'][i] + x2 = x1 + text_data['width'][i] + y2 = y1 + text_data['height'][i] + cv2.rectangle(screenshot_rgb, (x1, y1), (x2, y2), (0, 255, 0), 2) + except (ImportError, Exception) as e: + print(f"Text detection not available: {e}") + + # 2. Template matching for common desktop icons + icon_folder = "icon_templates" + if os.path.exists(icon_folder): + for filename in os.listdir(icon_folder): + if filename.endswith(('.png', '.jpg')): + template_path = os.path.join(icon_folder, filename) + template = cv2.imread(template_path) + + if template is None: + continue + + # Apply template matching + template_h, template_w = template.shape[:2] + res = cv2.matchTemplate(screenshot_rgb, template, cv2.TM_CCOEFF_NORMED) + + # Get locations where the match exceeds threshold + threshold = 0.7 + loc = np.where(res >= threshold) + + for pt in zip(*loc[::-1]): + # Get center point of the match + x = pt[0] + template_w // 2 + y = pt[1] + template_h // 2 + confidence = res[pt[1], pt[0]] + + # Add to results + results.append((x, y, confidence)) + + # Draw for debugging + cv2.rectangle(screenshot_rgb, pt, (pt[0] + template_w, pt[1] + template_h), (0, 0, 255), 2) + + # 3. 
Folder icon detection using color and shape (backup method) + if not results: + # Convert to HSV for better color segmentation + hsv = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2HSV) + + # Define color ranges for common folder icons (yellow folders in Windows) + lower_yellow = np.array([20, 100, 100]) + upper_yellow = np.array([40, 255, 255]) + + # Create mask for yellow color + mask = cv2.inRange(hsv, lower_yellow, upper_yellow) + + # Find contours in the mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Filter contours by size (folder icons are usually of similar size) + min_area = 100 + max_area = 5000 + + for contour in contours: + area = cv2.contourArea(contour) + if min_area < area < max_area: + # Get center of contour + M = cv2.moments(contour) + if M["m00"] > 0: + x = int(M["m10"] / M["m00"]) + y = int(M["m01"] / M["m00"]) + + # Add to results with lower confidence + results.append((x, y, 0.5)) + + # Draw for debugging + cv2.drawContours(screenshot_rgb, [contour], -1, (255, 0, 0), 2) + + # Save the annotated screenshot for debugging + cv2.imwrite("debug_target_detection.jpg", screenshot_rgb) + + if results: + # Sort by confidence + results.sort(key=lambda x: x[2], reverse=True) + best_match = results[0] + + # Convert to percentage of screen size + screen_width, screen_height = screenshot.size + x_percent = best_match[0] / screen_width + y_percent = best_match[1] / screen_height + + return (x_percent, y_percent) + + return None + + +# def enhanced_click(target_description, model=None): +# """ +# Enhanced clicking function that uses computer vision to find and click on targets. +# +# Args: +# target_description (str): Description of what to click on +# model (str, optional): Model name for verification +# +# Returns: +# bool: True if click was successful, False otherwise +# """ +# # Try to find the target using computer vision +# coords = find_icon_on_screen(target_description) +# +# if coords: +# x_percent, y_percent = coords +# print(f"[Visual Target Finder] Found target '{target_description}' at ({x_percent:.3f}, {y_percent:.3f})") +# +# # Convert percentages to actual screen coordinates +# screen_width, screen_height = pyautogui.size() +# x_coord = int(x_percent * screen_width) +# y_coord = int(y_percent * screen_height) +# +# # Click on the found location +# pyautogui.click(x_coord, y_coord) +# return True +# else: +# print(f"[Visual Target Finder] Could not find target '{target_description}' on screen") +# return False + + +import pyautogui +import platform +import ctypes +import subprocess + + +def get_scaling_factor(): + """ + Detect the current DPI scaling factor based on the operating system. + Returns: + scaling_factor (float): A multiplier to adjust coordinates. 
+ """ + os_name = platform.system() + scaling_factor = 1.0 + + if os_name == "Windows": try: - objective = mic.listen() + user32 = ctypes.windll.user32 + user32.SetProcessDPIAware() + dc = user32.GetDC(0) + logical_width = user32.GetDeviceCaps(dc, 8) # HORZRES (logical width) + physical_width = user32.GetDeviceCaps(dc, 118) # DESKTOPHORZRES (physical width) + scaling_factor = physical_width / logical_width + user32.ReleaseDC(0, dc) except Exception as e: - print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") - return # Exit if voice input fails - else: - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}" - ) - print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") - objective = prompt(style=style) - - system_prompt = get_system_prompt(model, objective) - system_message = {"role": "system", "content": system_prompt} - messages = [system_message] - - loop_count = 0 - - session_id = None - - while True: - if config.verbose: - print("[Self Operating Computer] loop_count", loop_count) + print("Windows scaling detection error:", e) + scaling_factor = 1.0 + elif os_name == "Darwin": # macOS try: - operations, session_id = asyncio.run( - get_next_action(model, messages, objective, session_id) + output = subprocess.check_output(["system_profiler", "SPDisplaysDataType"]) + output = output.decode("utf-8") + if "Retina" in output: + scaling_factor = 2.0 + else: + scaling_factor = 1.0 + except Exception as e: + print("macOS scaling detection error:", e) + scaling_factor = 1.0 + elif os_name == "Linux": + try: + output = subprocess.check_output( + ["gsettings", "get", "org.gnome.desktop.interface", "scaling-factor"] ) + scaling_factor = float(output.decode("utf-8").strip()) + except Exception as e: + print("Linux scaling detection error:", e) + scaling_factor = 1.0 - stop = operate(operations, model) - if stop: - break + return scaling_factor - loop_count += 1 - if loop_count > 10: - break - except ModelNotRecognizedException as e: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" - ) - break - except Exception as e: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" - ) - break - - -def operate(operations, model): - if config.verbose: - print("[Self Operating Computer][operate]") - for operation in operations: - if config.verbose: - print("[Self Operating Computer][operate] operation", operation) - # wait one second - time.sleep(1) - operate_type = operation.get("operation").lower() - operate_thought = operation.get("thought") - operate_detail = "" - if config.verbose: - print("[Self Operating Computer][operate] operate_type", operate_type) - - if operate_type == "press" or operate_type == "hotkey": - keys = operation.get("keys") - operate_detail = keys - operating_system.press(keys) - elif operate_type == "write": - content = operation.get("content") - operate_detail = content - operating_system.write(content) - elif operate_type == "click": - x = operation.get("x") - y = operation.get("y") - click_detail = {"x": x, "y": y} - operate_detail = click_detail - - operating_system.mouse(click_detail) - elif operate_type == "done": - summary = operation.get("summary") - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" - ) - print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") - return True +def click_relative(x_percent, y_percent, x_divisor=1.50, y_multiplier=1.25): + """ + Converts relative 
coordinates to absolute screen coordinates, applies DPI scaling, + then divides the x-coordinate by x_divisor and multiplies the y-coordinate by y_multiplier before clicking. - else: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" - ) - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response {ANSI_RESET}{operation}" - ) - return True + Args: + x_percent (float): Relative x-coordinate (e.g., 0.10 for 10% across). + y_percent (float): Relative y-coordinate (e.g., 0.20 for 20% down). + x_divisor (float): Value to divide the computed x-coordinate by (default 1.50). + y_multiplier (float): Value to multiply the computed y-coordinate by (default 1.25). + """ + screen_width, screen_height = pyautogui.size() + scaling_factor = get_scaling_factor() + + # Compute the base absolute coordinates. + base_x = x_percent * screen_width * scaling_factor + base_y = y_percent * screen_height * scaling_factor + + # Adjust: divide x-coordinate and multiply y-coordinate. + adjusted_x = int(base_x / x_divisor) + adjusted_y = int(base_y * y_multiplier) + + print( + f"Clicking at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen with scaling factor {scaling_factor}") + pyautogui.click(adjusted_x, adjusted_y) + + +def operate(operations, session_id, model=None): + """ + Processes a list of operations and executes them. + Supports click, doubleclick, rightclick, scroll, write, press, wait, and done operations. + For click/doubleclick/rightclick operations, it uses the adjusted coordinate conversion: + - x-coordinate divided by 1.50. + - y-coordinate multiplied by 1.25. + + Returns: + bool: True if "done" operation was encountered (task completed), otherwise False + """ + import time + + for op in operations: + if op.get("operation") in ["click", "doubleclick", "rightclick"]: + try: + x_percent = float(op.get("x", 0)) + y_percent = float(op.get("y", 0)) + screen_width, screen_height = pyautogui.size() + scaling_factor = get_scaling_factor() + + # Compute the base absolute coordinates. + base_x = x_percent * screen_width * scaling_factor + base_y = y_percent * screen_height * scaling_factor + + # Adjust: divide x-coordinate and multiply y-coordinate. 
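+                # NOTE: the 1.50 divisor and 1.25 multiplier mirror the defaults of
+                # click_relative() above; they appear to be empirical corrections for a
+                # particular display/DPI setup and may need tuning on other machines.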
+ adjusted_x = int(base_x / 1.50) + adjusted_y = int(base_y * 1.25) + + operation_type = op.get("operation") + operation_name = { + "click": "Clicking", + "doubleclick": "Double-clicking", + "rightclick": "Right-clicking" + }.get(operation_type, operation_type) + + print( + f"{operation_name} at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen " + f"with scaling factor {scaling_factor}" + ) + + if operation_type == "doubleclick": + pyautogui.doubleClick(adjusted_x, adjusted_y) + elif operation_type == "rightclick": + pyautogui.rightClick(adjusted_x, adjusted_y) + else: + pyautogui.click(adjusted_x, adjusted_y) + except Exception as e: + print(f"Error performing {op.get('operation')} operation:", e) + + + elif op.get("operation") == "scroll": + + try: + + direction = op.get("direction", "") + + amount = int(op.get("amount", 0)) + + # For vertical scrolling: positive for up, negative for down + + if direction == "up": + + clicks = amount * 150 + + elif direction == "down": + + clicks = -amount * 150 + + # For horizontal scrolling: negative for left, positive for right + + elif direction == "left": + + clicks = -amount * 150 + + elif direction == "right": + + clicks = amount * 150 + + else: + + print(f"Invalid scroll direction: {direction}") + + clicks = 0 + + # Execute scroll based on direction type + + if direction in ["up", "down"]: + + print(f"Scrolling {direction} by {amount} clicks") + + pyautogui.scroll(clicks) + + elif direction in ["left", "right"]: + + print(f"Scrolling {direction} by {amount} clicks") + + pyautogui.hscroll(clicks) + + except Exception as e: + + print("Error performing scroll operation:", e) + + elif op.get("operation") == "write": + content = op.get("content", "") + pyautogui.write(content) + + elif op.get("operation") == "press": + keys = op.get("keys", []) + for key in keys: + pyautogui.press(key) + + elif op.get("operation") == "wait": + duration = float(op.get("duration", 1)) + time.sleep(duration) - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" - ) - print(f"{operate_thought}") - print(f"{ANSI_BLUE}Action: {ANSI_RESET}{operate_type} {operate_detail}\n") + elif op.get("operation") == "done": + print("Operation completed:", op.get("summary", "")) + return True # Signal that the task is completed - return False + return False # Continue processing this task diff --git a/operate/setup_icon_templates.py b/operate/setup_icon_templates.py new file mode 100644 index 00000000..d9ad620b --- /dev/null +++ b/operate/setup_icon_templates.py @@ -0,0 +1,117 @@ +import os +import pyautogui +import time +import tkinter as tk +from tkinter import simpledialog, messagebox + + +def setup_icon_templates(): + """ + Simplified helper script to set up icon templates for visual target finding. + Uses simple coordinate input rather than visual selection. 
+ """ + # Create templates directory if it doesn't exist + template_dir = "icon_templates" + if not os.path.exists(template_dir): + os.makedirs(template_dir) + print(f"Created directory: {template_dir}") + + # Create a simple GUI for capturing templates + root = tk.Tk() + root.title("Icon Template Capture Tool") + root.geometry("400x200") + + # Function to capture icon at cursor position + def capture_at_cursor(): + icon_name = simpledialog.askstring("Icon Name", "Enter name for this icon/folder:", parent=root) + if not icon_name: + return + + # Give user time to position cursor + messagebox.showinfo("Capture Icon", + "Position your mouse cursor over the center of the icon you want to capture, then click OK.") + + # Get cursor position + time.sleep(0.5) # Small delay after dialog closes + x, y = pyautogui.position() + + # Capture region around cursor (100x100 pixels) + region_size = 50 # pixels in each direction from center + region = (x - region_size, y - region_size, region_size * 2, region_size * 2) + + try: + # Capture the region + screenshot = pyautogui.screenshot(region=region) + + # Save the template + filename = f"{icon_name.replace(' ', '_').lower()}.png" + filepath = os.path.join(template_dir, filename) + screenshot.save(filepath) + + messagebox.showinfo("Success", f"Saved template as {filename}") + print(f"Saved template as {filepath}") + except Exception as e: + messagebox.showerror("Error", f"Failed to capture: {str(e)}") + print(f"Error: {e}") + + # Function to capture custom region + def capture_custom_region(): + icon_name = simpledialog.askstring("Icon Name", "Enter name for this icon/folder:", parent=root) + if not icon_name: + return + + # Ask for region coordinates + try: + x = simpledialog.askinteger("X Coordinate", "Enter X coordinate (left edge):", parent=root) + if x is None: return + + y = simpledialog.askinteger("Y Coordinate", "Enter Y coordinate (top edge):", parent=root) + if y is None: return + + width = simpledialog.askinteger("Width", "Enter width in pixels:", parent=root, minvalue=10, maxvalue=500) + if width is None: return + + height = simpledialog.askinteger("Height", "Enter height in pixels:", parent=root, minvalue=10, + maxvalue=500) + if height is None: return + + # Capture the specified region + region = (x, y, width, height) + screenshot = pyautogui.screenshot(region=region) + + # Save the template + filename = f"{icon_name.replace(' ', '_').lower()}.png" + filepath = os.path.join(template_dir, filename) + screenshot.save(filepath) + + messagebox.showinfo("Success", f"Saved template as {filename}") + print(f"Saved template as {filepath}") + except Exception as e: + messagebox.showerror("Error", f"Failed to capture: {str(e)}") + print(f"Error: {e}") + + # Create and place buttons + label = tk.Label(root, text="Icon Template Capture Tool", font=("Arial", 14)) + label.pack(pady=10) + + instructions = tk.Label(root, text="Choose a capture method:") + instructions.pack(pady=5) + + button_frame = tk.Frame(root) + button_frame.pack(pady=10) + + cursor_btn = tk.Button(button_frame, text="Capture at Cursor", command=capture_at_cursor, width=20) + cursor_btn.grid(row=0, column=0, padx=10, pady=5) + + region_btn = tk.Button(button_frame, text="Specify Region Manually", command=capture_custom_region, width=20) + region_btn.grid(row=0, column=1, padx=10, pady=5) + + close_btn = tk.Button(root, text="Close", command=root.destroy, width=10) + close_btn.pack(pady=10) + + # Start the GUI + root.mainloop() + + +if __name__ == "__main__": + setup_icon_templates() \ No 
newline at end of file diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py index 597911ad..23d492f1 100644 --- a/operate/utils/screenshot.py +++ b/operate/utils/screenshot.py @@ -25,3 +25,18 @@ def capture_screen_with_cursor(file_path): subprocess.run(["screencapture", "-C", file_path]) else: print(f"The platform you're using ({user_platform}) is not currently supported") + + +def compress_screenshot(raw_screenshot_filename, screenshot_filename): + with Image.open(raw_screenshot_filename) as img: + # Check if the image has an alpha channel (transparency) + if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info): + # Create a white background image + background = Image.new('RGB', img.size, (255, 255, 255)) + # Paste the image onto the background, using the alpha channel as mask + background.paste(img, mask=img.split()[3]) # 3 is the alpha channel + # Save the result as JPEG + background.save(screenshot_filename, 'JPEG', quality=85) # Adjust quality as needed + else: + # If no alpha channel, simply convert and save + img.convert('RGB').save(screenshot_filename, 'JPEG', quality=85) diff --git a/setup.py b/setup.py index dbb2cf18..5c9e7013 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="self-operating-computer", - version="1.5.7", + version="1.5.9", packages=find_packages(), install_requires=required, # Add dependencies here entry_points={
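
For reference, a rough usage sketch of the new screenshot helper added in `operate/utils/screenshot.py` above. The call sequence and file paths are assumptions for illustration, not code from the patch:

```python
# Capture the screen, then flatten/compress it before sending it to a vision model.
# Paths are placeholders; both functions are defined in operate/utils/screenshot.py.
from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot

raw_path = "screenshots/raw_screenshot.png"
jpeg_path = "screenshots/screenshot.jpeg"

capture_screen_with_cursor(raw_path)       # platform-specific full-screen capture
compress_screenshot(raw_path, jpeg_path)   # pastes any alpha channel onto white, saves as JPEG (quality 85)
```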