Commit: Update

samholt committed Apr 17, 2024
1 parent 1b1672f commit 22c04d9
Showing 12 changed files with 216 additions and 104 deletions.
2 changes: 1 addition & 1 deletion examples/generate_codebase_simple_blackjack.py
@@ -1,5 +1,5 @@
 from l2mac import generate_codebase
 
-codebase: dict = generate_codebase("Create a cli blackjack game")
+codebase: dict = generate_codebase("Create a cli blackjack game", steps=2)
 
 print(codebase) # it will print the codebase (repo) complete with all the files as a dictionary
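Note: the new steps argument caps how many plan steps the agent may generate (default 10; see the prompt change in l2mac/l2mac.py below), so a small value keeps this demo short. A minimal sketch of inspecting the returned repo, assuming the dict-of-files shape described in the comment above:

    from l2mac import generate_codebase

    codebase: dict = generate_codebase("Create a cli blackjack game", steps=2)
    for path, contents in codebase.items():  # file path -> file contents
        print(path, len(contents))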
15 changes: 11 additions & 4 deletions l2mac/core.py
@@ -67,13 +67,20 @@ def l2mac_internal(prompt_task: str, domain: Domain, run_tests: bool, project_na
     env.set_seed(seed=config.setup.seed)
     env.reset()
     l2mac = L2MAC(prompt_task=prompt_task, env=env, config=config, logger=logger, rate_limiter=rate_limiter)
-    output_file_store = l2mac.run()
+    output_file_store = l2mac.run(steps=steps)
     return output_file_store
 
 
-def generate_codebase(*args, **kwargs):
-    kwargs['domain'] = Domain.codebase
-    return run_l2mac(*args, **kwargs)
+def generate_codebase(prompt_task: str,
+                      run_tests: bool = False,
+                      project_name: Optional[str] = None,
+                      steps: int = 10,
+                      prompt_program: Optional[str] = None,
+                      prompts_file_path: Optional[str] = None,
+                      tools_enabled: Optional[str] = None,
+                      debugging_level: DebuggingLevel = DebuggingLevel.info,
+                      init_config: bool = False):
+    return run_l2mac(prompt_task=prompt_task, domain=Domain.codebase, run_tests=run_tests, project_name=project_name, steps=steps, prompt_program=prompt_program, prompts_file_path=prompts_file_path, tools_enabled=tools_enabled, debugging_level=debugging_level, init_config=init_config)
 
 def generate_book(*args, **kwargs):
     kwargs['domain'] = Domain.book
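Note: generate_codebase now spells out its options instead of forwarding *args/**kwargs, so the parameters are visible to IDEs and type checkers, and domain is always Domain.codebase rather than caller-overridable. An illustrative call using only parameters from the new signature:

    from l2mac import generate_codebase

    codebase = generate_codebase(
        "Create a cli snake game",  # prompt_task
        steps=4,                    # cap the plan at 4 steps
        run_tests=False,
    )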
63 changes: 30 additions & 33 deletions l2mac/l2mac.py
@@ -8,7 +8,7 @@
 import openai
 from l2mac.llm_providers.general import get_llm_config, get_model_max_tokens, chat_completion_rl
 from l2mac.utils.l2mac import hash_messages, clean_string, detect_cycles
-from l2mac.tools.write import write_files_from_dict
+from l2mac.tools.utils import write_files_from_dict
 from l2mac.tools.core import function_definition_list_factory, process_functions_into_function_names, available_functions_factory
 from l2mac.llm_providers.openai import num_tokens_consumed_by_chat_request
 from l2mac.llm_providers.utils import pretty_print_chat_messages
@@ -22,6 +22,7 @@ def __init__(self, prompt_task, env, config, logger, rate_limiter):
         self.seed_value = None
         self.logger = logger
         self.rate_limiter = rate_limiter
+        self.name = 'L2MAC'
         self.reset()
 
     def seed(self, seed_value):
@@ -34,7 +35,6 @@ def get_llm_config(self):
 
 
     def reset(self):
-        self.name = 'L2MAC'
         self.load_from_checkpoint = ''
         self.replay_llm_responses_path = ''
         self.replay_llm_responses_path_index = 0
@@ -117,7 +117,7 @@ def save_agent_state(self, messages, beginning_step=''):
         with open(path, 'w') as f:
             json.dump(data_to_save, f)
 
-    def get_llm_response(self, messages, max_tokens=None):
+    def get_llm_response(self, messages, max_tokens=None, tool_choice='auto'):
         self.print_dialog(messages)
         self.save_agent_state(messages)
         llm_config = self.get_llm_config()
@@ -147,10 +147,11 @@ def get_llm_response(self, messages, max_tokens=None):
         # else:
         # self.message_hash_same_increase_temperature = 0
         # self.message_hash = message_hash
-        llm_config['functions'] = self.functions
-        if messages[-1].get('function_call'):
-            llm_config['function_call'] = messages[-1]['function_call']
-            del(messages[-1]['function_call'])
+        llm_config['tools'] = self.functions
+        if tool_choice is not None:
+            llm_config['tool_choice'] = {"type": "function", "function": {"name": tool_choice}}
+        else:
+            llm_config['tool_choice'] = 'none'
         if self.replay_llm_responses_path:
             with open(self.replay_llm_responses_path, 'r') as f:
                 responses = json.load(f)
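Note: the request get_llm_response now assembles follows the OpenAI tools API (openai-python v1 client). A standalone sketch, with an illustrative model name and a tool schema paraphrased rather than copied from this repo:

    from openai import OpenAI

    client = OpenAI()
    tools = [{
        "type": "function",
        "function": {
            "name": "provide_detailed_sub_task_steps_for_sub_agents",
            "description": "Return the step-by-step plan as a list of strings.",  # paraphrased
            "parameters": {
                "type": "object",
                "properties": {"steps": {"type": "array", "items": {"type": "string"}}},
                "required": ["steps"],
            },
        },
    }]
    response = client.chat.completions.create(
        model="gpt-4-0613",  # illustrative
        messages=[{"role": "user", "content": "Plan a cli blackjack game."}],
        tools=tools,
        # Mirrors llm_config['tool_choice'] above: force this specific tool.
        tool_choice={"type": "function", "function": {"name": "provide_detailed_sub_task_steps_for_sub_agents"}},
    )
    print(response.choices[0].message.tool_calls[0].function.arguments)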
@@ -175,8 +176,8 @@ def get_llm_response(self, messages, max_tokens=None):
             self.logger.info("Error:", e.__dict__)  # or use a logging framework
             raise e
         message_response = response["choices"][0]["message"]
-        if not message_response.get('content'):
-            message_response['content'] = None
+        # if not message_response.get('content'):
+        #     message_response['content'] = None
         self.print_dialog([message_response], response_msg=True)
         return message_response
 
@@ -185,10 +186,10 @@ def _run(self, state=''):
         return ', '.join([f'`{fn}`' for fn in fns])
 
 
-    def run(self, state=''):
-        return self._run(state)
+    def run(self, steps: int = 10):
+        return self._run(steps=steps)
 
-    def _run(self, state=''):
+    def _run(self, steps: int = 10):
         self.reset()
         if not self.load_from_checkpoint:
             self.meta_messages = [self.system_message]
@@ -225,37 +226,38 @@ def _run(self, state=''):
 {task_description}
 ```
-Understand the problem, by creating an extremely detailed step-by-step plan, where each step is long (multiple sentences) and in total includes every single feature requirement specified above, feel free to copy directly from it. Use no more than 10 steps in the plan. Create additional tests, checks and evaluation at each step when applicable to help make an excellent code implementation, where all the code is fully functional. Use best software design practices, and you can output large amounts of code at once. Please include a last sentence to create and run tests when implementing or writing code in that same step. You will receive no human input at any stage, so you cannot use a human to test. Only create a detailed plan to begin with, which includes designing and running tests to check that they all pass. Please be sure to include all of the specified feature requirements in the following plan.
+Understand the problem, by creating an extremely detailed step-by-step plan, where each step is long (multiple sentences) and in total includes every single feature requirement specified above, feel free to copy directly from it. Use no more than {steps} steps in the plan. Create additional tests, checks and evaluation at each step when applicable to help make an excellent code implementation, where all the code is fully functional. Use best software design practices, and you can output large amounts of code at once. Please include a last sentence to create and run tests when implementing or writing code in that same step. You will receive no human input at any stage, so you cannot use a human to test. Only create a detailed plan to begin with, which includes designing and running tests to check that they all pass. Please be sure to include all of the specified feature requirements in the following plan.
 """
-        self.meta_messages.append({"role": "user", "content": first_message, "function_call": {"name": "provide_detailed_sub_task_steps_for_sub_agents"}})
+        self.meta_messages.append({"role": "user", "content": first_message})
         steps = []
         # Loop until we get a multi-step plan, as sometimes the first plan is not multi-step, and only a single step.
         max_reflections = 1
         current_reflection = 0
         current_dialog = deepcopy(self.meta_messages)
         while len(steps) <= 50 and current_reflection < max_reflections:
             current_reflection += 1
-            initial_response_message = self.get_llm_response(current_dialog)
+            initial_response_message = self.get_llm_response(current_dialog, tool_choice='provide_detailed_sub_task_steps_for_sub_agents')
             current_dialog.append(initial_response_message)
             current_dialog.append({"role": "user", "content": f"""
 Please reflect on the plan, and increase the number of generated steps to that of 100 or so very detailed steps that include all the feature requirements.
 """})
             # Could reflect and improve plan etc a few times here.
-            function_name = initial_response_message["function_call"]["name"]
+            function_response = initial_response_message['tool_calls'][0]["function"]
+            function_name = function_response["name"]
             try:
-                function_args = json.loads(initial_response_message["function_call"]["arguments"])
+                function_args = json.loads(function_response["arguments"])
             except json.decoder.JSONDecodeError:
                 try:
-                    function_args = json.loads(initial_response_message["function_call"]["arguments"].replace('\n', ''))
+                    function_args = json.loads(function_response["arguments"].replace('\n', ''))
                 except json.decoder.JSONDecodeError:
                     try:
-                        function_args = json.loads(initial_response_message["function_call"]["arguments"] + '"]}')
+                        function_args = json.loads(function_response["arguments"] + '"]}')
                     except json.decoder.JSONDecodeError:
                         try:
-                            function_args = json.loads(initial_response_message["function_call"]["arguments"] + '"]}')
+                            function_args = json.loads(function_response["arguments"] + '"]}')
                         except json.decoder.JSONDecodeError:
                             try:
-                                function_args = json.loads(initial_response_message["function_call"]["arguments"] + ']}')
+                                function_args = json.loads(function_response["arguments"] + ']}')
                             except Exception as e:
                                 print(e)
             fuction_to_call = available_functions_factory()[function_name]
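Note: the nested try/except above progressively repairs truncated JSON in the tool-call arguments (the third and fourth attempts append the same suffix, so one of them is redundant). The same idea as a condensed sketch, not part of the commit:

    import json

    def parse_tool_arguments(raw: str) -> dict:
        """Best-effort parse of a possibly truncated JSON arguments string."""
        for candidate in (raw, raw.replace('\n', ''), raw + '"]}', raw + ']}'):
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue
        raise ValueError("could not repair tool-call arguments")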
@@ -264,7 +266,7 @@ def _run(self, state=''):
         # self.base_dialog = deepcopy(current_dialog)
         self.base_dialog = deepcopy([self.system_message, {"role": "user", "content": first_message}])
         # Remove provide_detailed_sub_task_steps_for_sub_agents function from functions list
-        self.functions = [function for function in self.functions if function['name'] != 'provide_detailed_sub_task_steps_for_sub_agents']
+        self.functions = [tool for tool in self.functions if tool['function']['name'] != 'provide_detailed_sub_task_steps_for_sub_agents']
         previous_step_output_summary = ""
         # errors_df_l = []
         # count_change_times = 0
@@ -295,10 +297,10 @@ def _run(self, state=''):
                     # self.sub_messages = self.sub_messages[:-1]
                     self.sub_messages.pop(3)
                     self.sub_messages.append({"role": "user", "content": f"""
-You have exhausted your context window. Reflect on your progress. Provide a short concise response, of two sentences maximum, this will be used to restart this step from the beginning without the previous messages.""", "function_call": 'none'})
+You have exhausted your context window. Reflect on your progress. Provide a short concise response, of two sentences maximum, this will be used to restart this step from the beginning without the previous messages."""})
                     # self.sub_messages.append({"role": "user", "content": f"""
                     # You have exhausted your context window. Please state only which files are necessary to view to complete this task, i.e. those files which the newly written files import from. Also reflect on your progress. Provide a short concise response, of two sentences maximum, this will be used to restart this step from the beginning without the previous messages.""", "function_call": 'none'})
-                    response_message = self.get_llm_response(self.sub_messages)
+                    response_message = self.get_llm_response(self.sub_messages, tool_choice=None)
                     summary_step_message = response_message['content']
                     # if 'maximum context' in e.args[0]:
                     self.re_tries += 1
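Note: this overflow path relies on the tool_choice=None branch added to get_llm_response, which sets llm_config['tool_choice'] = 'none' so the model must answer in plain text instead of emitting another tool call; only that short summary is carried into the restarted step. A sketch of the flow as a hypothetical standalone helper (not repo API):

    def recover_from_context_overflow(agent, sub_messages: list) -> str:
        sub_messages.append({
            "role": "user",
            "content": "You have exhausted your context window. Reflect on "
                       "your progress in at most two sentences.",
        })
        # tool_choice=None disables tools, forcing a plain-text summary.
        return agent.get_llm_response(sub_messages, tool_choice=None)["content"]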
@@ -321,7 +323,7 @@ def _run(self, state=''):
                 if 'status' in json.loads(function_return_message['content']) and json.loads(function_return_message['content'])['status'] == 'TASK_STEP_COMPLETE':
                     task_step_complete = True
                     self.sub_messages.append({"role": "user", "content": f"""
-Please provide a one or two sentence summary of the output of this step, which is useful for the next step. Your response will be used when starting the next step without any of the previous messages.""", "function_call": 'none'})
+Please provide a one or two sentence summary of the output of this step, which is useful for the next step. Your response will be used when starting the next step without any of the previous messages."""})
                     continue
                 self.sub_messages.append(function_return_message)
                 if 'name' in function_return_message and function_return_message['name'] == 'sub_task_step_complete' and json.loads(function_return_message['content'])['status'] == 'error':
@@ -346,11 +348,6 @@ def _run(self, state=''):
         self.logger.info('[STEP COMPLETE] sub step completed')
         self.logger.info('[TASK COMPLETE SUCCESSFULLY!!] All steps complete')
         self.logger.info('')
-        if self.env.env_task_id == 'HumanEval':
-            benchmark_task_id = state['task_id'].split('/')[1]
-            write_files_from_dict(self.file_dict, base_dir=f'{self.folder_path}{self.name}/{benchmark_task_id}')
-            return self.file_dict
-        else:
-            write_files_from_dict(self.file_dict, base_dir=f'{self.folder_path}{self.name}')
-            self.save_agent_state(self.sub_messages)
-            return f'{self.folder_path}/{self.name}'
+        write_files_from_dict(self.file_dict, base_dir=f'{self.folder_path}{self.name}')
+        self.save_agent_state(self.sub_messages)
+        return f'{self.folder_path}/{self.name}'
3 changes: 3 additions & 0 deletions l2mac/llm_providers/general.py
@@ -231,15 +231,18 @@ def chat_completion_rl_inner(**kwargs):
     kwargs.pop('_rate_limiter', None)
     kwargs.pop('_rate_limiter', None)
     kwargs.pop('stream', None)
+
     t0 = perf_counter()
     # if logger:
     #     logger.info(f"[{name}][OpenAI API Request] {kwargs}")
     # pretty_print_chat_messages(kwargs['messages'])
 
     if rate_limiter:
         rate_limiter.consume(**kwargs)
+        print(kwargs)
         response = client.chat.completions.create(**kwargs)
     else:
+        print(kwargs)
         response = client.chat.completions.create(**kwargs)
     # if logger:
     #     logger.info(f"[{name}][OpenAI API Returned] Elapsed request time: {perf_counter() - t0}s | response: {response}")
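Note: the added print(kwargs) lines echo the full request payload to stdout before each API call, presumably for debugging. The surrounding gating pattern in isolation, as a sketch assuming a rate_limiter object with a blocking consume() method as used above:

    def create_with_rate_limit(client, rate_limiter, **kwargs):
        # Block until the limiter grants capacity, then issue the request.
        if rate_limiter:
            rate_limiter.consume(**kwargs)
        return client.chat.completions.create(**kwargs)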
12 changes: 6 additions & 6 deletions l2mac/llm_providers/openai.py
@@ -451,15 +451,15 @@ def num_tokens_from_messages(messages, model="gpt-4-0613"):
     return num_tokens
 
 
-def num_tokens_from_functions(functions, model="gpt-3.5-turbo-0613"):
+def num_tokens_from_functions(tools, model="gpt-3.5-turbo-0613"):
     """Return the number of tokens used by a list of functions."""
     num_tokens = 0
-    for function in functions:
-        function_tokens = len(CL100K_ENCODER.encode(function['name']))
-        function_tokens += len(CL100K_ENCODER.encode(function['description']))
+    for tool in tools:
+        function_tokens = len(CL100K_ENCODER.encode(tool['function']['name']))
+        function_tokens += len(CL100K_ENCODER.encode(tool['function']['description']))
 
-        if 'parameters' in function:
-            parameters = function['parameters']
+        if 'parameters' in tool['function']:
+            parameters = tool['function']['parameters']
             if 'properties' in parameters:
                 for propertiesKey in parameters['properties']:
                     function_tokens += len(CL100K_ENCODER.encode(propertiesKey))
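Note: callers must now pass the wrapped tools list rather than bare function schemas. An illustrative call, assuming the helper is importable from this module and using a made-up tool definition:

    from l2mac.llm_providers.openai import num_tokens_from_functions

    tools = [{
        "type": "function",
        "function": {
            "name": "list_files",
            "description": "List all files in the current repository.",  # illustrative
            "parameters": {"type": "object", "properties": {}},
        },
    }]
    print(num_tokens_from_functions(tools, model="gpt-3.5-turbo-0613"))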
9 changes: 5 additions & 4 deletions l2mac/llm_providers/utils.py
@@ -37,11 +37,12 @@ def pretty_print_chat_messages(messages, num_tokens=None, max_tokens=None, logge
         color = COLORS.get(role, COLORS["system"])  # Default to system color if role not found
         formatted_role = role.capitalize()
         content = msg['content']
-        if role == "assistant" and 'function_call' in msg:
+        if role == "assistant" and msg['tool_calls']:
             formatted_role = "Function Call"
-            print(f"{color}[{formatted_role}] [{msg['function_call']['name']}] {msg['function_call']['arguments']}\033[0m")  # Reset color at the end
-            if logger:
-                logger.info(f"[{formatted_role}] [{msg['function_call']['name']}] {msg['function_call']['arguments']}")
+            for tool_call in msg['tool_calls']:
+                print(f"{color}[{formatted_role}] [{tool_call['function']['name']}] {tool_call['function']['arguments']}\033[0m")  # Reset color at the end
+                if logger:
+                    logger.info(f"[{formatted_role}] [{tool_call['function']['name']}] {tool_call['function']['arguments']}")
         else:
             print(f"{color}[{formatted_role}] {content}\033[0m")  # Reset color at the end
             if logger:
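Note: an assistant message in the new tool_calls shape that the branch above iterates over looks roughly like this (id and arguments are illustrative); also note the new condition reads msg['tool_calls'] directly, so it assumes every assistant message carries that key, possibly set to None:

    msg = {
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": "call_abc123",  # illustrative
            "type": "function",
            "function": {"name": "write_files", "arguments": "{}"},
        }],
    }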
8 changes: 7 additions & 1 deletion l2mac/tools/run_code.py → l2mac/tools/code_analysis.py
@@ -7,7 +7,8 @@
 from timeout_decorator import timeout
 import xml.etree.ElementTree as ET
 from pathlib import Path
-from l2mac.tools.write import write_files_from_dict
+import re
+from l2mac.tools.utils import write_files_from_dict
 from l2mac.tools.read import load_code_files_into_dict
 
 def check_syntax_with_timeout(file_dict):
@@ -116,6 +117,11 @@ def pytest_code_base(file_dict, files_to_test=None):
     print('')
     return captured_output
 
+def count_errors_in_syntax(syntax_output: str):
+    pattern = r".+:\d+:\d+: [E]\d+: .+"
+    errors = re.findall(pattern, syntax_output)
+    return len(errors)
+
 
 def parse_and_print_junit_xml(file_path):
     tree = ET.parse(file_path)
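Note: the new count_errors_in_syntax helper counts pylint-style diagnostic lines of the form path:line:col: Exxxx: message, ignoring warning codes. For example:

    from l2mac.tools.code_analysis import count_errors_in_syntax

    report = ("main.py:10:5: E0001: invalid syntax (<unknown>, line 10)\n"
              "main.py:12:1: W0611: unused import os")
    print(count_errors_in_syntax(report))  # -> 1 (only E-codes match)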
2 changes: 1 addition & 1 deletion l2mac/tools/control_unit.py
@@ -1,6 +1,6 @@
 from typing import List
 import json
-from l2mac.tools.run_code import check_syntax_with_timeout, check_pytest_with_timeout, count_errors_in_syntax
+from l2mac.tools.code_analysis import check_syntax_with_timeout, check_pytest_with_timeout, count_errors_in_syntax
 
 def provide_detailed_sub_task_steps_for_sub_agents(steps: List[str] = []):
     return steps
24 changes: 17 additions & 7 deletions l2mac/tools/core.py
@@ -4,7 +4,7 @@
 from l2mac.tools.control_unit import provide_detailed_sub_task_steps_for_sub_agents, check_sub_task_step_complete
 from l2mac.tools.read import view_files, list_files
 from l2mac.tools.write import write_files, delete_files
-from l2mac.tools.run_code import run_python_file, pytest_files
+from l2mac.tools.code_analysis import run_python_file, pytest_files
 
 
 
@@ -23,6 +23,16 @@ def available_functions_factory():
     return available_functions
 
 def function_definition_list_factory():
+    # Following OpenAI's updated format for function definitions
+    functions = function_definition_list_factory_internal()
+    tools = []
+    for function in functions:
+        tools.append({"type": "function",
+                      "function": function})
+    return tools
+
+
+def function_definition_list_factory_internal():
     functions = [
         {
             "name": "provide_detailed_sub_task_steps_for_sub_agents",
@@ -163,17 +173,17 @@ def function_definition_list_factory():
     },]
     return functions
 
-def process_functions_into_function_names(functions: List[dict] = []):
+def process_functions_into_function_names(tools: List[dict] = []):
     function_names = []
-    for function in functions:
-        function_names.append(function['name'])
+    for tool in tools:
+        function_names.append(tool['function']['name'])
     return function_names
 
 
-def process_function_call_and_return_message(message_function_call: dict, file_dict: dict, logger=None, functions=[], enable_tests=True):
+def process_function_call_and_return_message(message_function_call: dict, file_dict: dict, logger=None, tools=[], enable_tests=True):
     function_name = ''
-    if len(functions) >= 1:
-        functions_available_keys = process_functions_into_function_names(functions)
+    if len(tools) >= 1:
+        functions_available_keys = process_functions_into_function_names(tools)
     else:
         functions_available_keys = list(available_functions_factory().keys())
     try:
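Note: the factory change in brief: each legacy function schema is wrapped in the tools envelope the chat completions API now expects. An equivalent sketch, assuming the internal factory is importable from l2mac.tools.core:

    from l2mac.tools.core import function_definition_list_factory_internal

    functions = function_definition_list_factory_internal()
    tools = [{"type": "function", "function": fn} for fn in functions]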
(Diffs for the remaining 3 changed files did not load and are not shown.)
