Feature/iterate on leetcode research (#489)

* Clean up agent research
* Check in latest iterations
* Updating payload, cleaning up
* Add study search, paper, and type cleanup
emrgnt-cmplxty · Aug 13, 2023 · 8cfbf55 · 8cfbf55
1 parent 3db1bf1
commit 8cfbf55
Show file tree

Hide file tree

Showing 28 changed files with 7,128 additions and 380 deletions.
diff --git a/automata-embedding-data b/automata-embedding-data
diff --git a/automata/agent/openai_agent.py b/automata/agent/openai_agent.py
@@ -266,7 +266,13 @@ def _get_next_user_response(
                     content=f"{OpenAIAutomataAgent.OBSERVATION_MESSAGE}{result}\n{function_iteration_message}",
                 )
             except Exception as e:
-                logger.info(f"Tool execution failed: {e}")
+                failure_message = f"Tool execution failed: {e}"
+                logger.info(failure_message)
+                return OpenAIChatMessage(
+                    role="user",
+                    content=failure_message,
+                )
+
         return OpenAIChatMessage(
             role="user",
             content=f"{OpenAIAutomataAgent.CONTINUE_PREFIX}\n{self._get_iteration_status()}",

diff --git a/automata/cli/scripts/run_tool_eval.py b/automata/cli/scripts/run_tool_eval.py
@@ -5,6 +5,8 @@
 import logging
 from typing import List, Optional
 
+from evalplus.data import write_jsonl
+
 from automata.cli.cli_utils import initialize_py_module_loader
 from automata.eval import (
     SymbolSearchAction,
@@ -64,34 +66,52 @@ def run_eval_harness(
         eval_loader.expected_actions,
         tool_execution,
     )
-    for result in output.results:
+    outputs = []
+    for counter, result in enumerate(output.results):
         if isinstance(result, SymbolSearchEvalResult):
             expected_action = result.expected_action
             if not isinstance(expected_action, SymbolSearchAction):
                 raise ValueError(
                     "Expected action must be a SymbolSearchAction."
                 )
 
-            logger.debug(f"Search Query: {expected_action.query}")
-            logger.debug(
-                f"Truth Top Match: {expected_action.search_results[0]}\n"
-            )
-
-            logger.debug("- Observed Results - \n")
             if observed_action := result.observed_action:
                 if not isinstance(observed_action, SymbolSearchAction):
                     raise ValueError(
                         "Observed action must be a SymbolSearchAction."
                     )
 
+            if not result.is_partial_match:
+                logger.debug("- Observed Results - \n")
+
+                logger.debug(f"Search Query: {expected_action.query}")
                 logger.debug(
-                    f"Top {TOP_K_MATCHES} Search Results: {observed_action.search_results[:TOP_K_MATCHES]}\n"
+                    f"Truth Top Match: {expected_action.search_results[0]}\n"  # type: ignore
                 )
-            logger.debug(
-                f"Full Match: {result.is_full_match}\nPartial Match: {result.is_partial_match}"
+
+                logger.debug(
+                    f"Top {TOP_K_MATCHES} Search Results: {observed_action.search_results[:TOP_K_MATCHES]}\n"  # type: ignore
+                )
+
+                logger.debug(
+                    f"Full Match: {result.is_full_match}\nPartial Match: {result.is_partial_match}"
+                )
+
+                logger.debug("=" * 150)
+            outputs.append(
+                {
+                    "task_id": f"ContextCodeRetrieval/{counter}",
+                    "query": expected_action.query,
+                    "truth_top_match": expected_action.search_results[0],  # type: ignore
+                    "top_k_matches": observed_action.search_results[  # type: ignore
+                        :TOP_K_MATCHES
+                    ],
+                    "k": TOP_K_MATCHES,
+                }
             )
 
-            logger.debug("=" * 150)
+    # TODO - Put output_filepath in commands.py upstream
+    write_jsonl(kwargs.get("output_filepath", "eval_results.jsonl"), outputs)
     logger.debug(output)
     logger.debug("=" * 150)
 

diff --git a/automata/config/eval/de_duped_cleaned_single_target_search_payload.json b/automata/config/eval/de_duped_cleaned_single_target_search_payload.json
diff --git a/automata/config/eval/demo_single_target_search_payload.json b/automata/config/eval/demo_single_target_search_payload.json
@@ -3,7 +3,7 @@
         "template": {
             "input_function": {
                 "type": "FunctionCall",
-                "name": "llm-facilitated-search",
+                "name": "symbol-similarity-search",
                 "arguments": {
                     "query": "{query}"
                 }
@@ -18,7 +18,47 @@
             {
                 "query": "Which class is an abstract base class for building for agents?",
                 "result": "automata.agent.agent.Agent"
-            }        
+            },
+            {
+                "query": "Which class is an abstract base class for building agent tools?",
+                "result": "automata.agent.agent.AgentToolkitBuilder"
+            },
+            {
+                "query": "Which class enumerates the available agent tools?",
+                "result": "automata.agent.agent.AgentToolkitNames"
+            },
+            {
+                "query": "Which class represents a general agent error?",
+                "result": "automata.agent.error.AgentGeneralError"
+            },
+            {
+                "query": "Which class builds tools for an OpenAI agent?",
+                "result": "automata.agent.openai_agent.OpenAIAgentToolkitBuilder"
+            },
+            {
+                "query": "Which class manages an OpenAI agent lifecycle?",
+                "result": "automata.agent.openai_agent.OpenAIAutomataAgent"
+            },
+            {
+                "query": "Which method of the OpenAI agent is responsible for running the agent?",
+                "result": "automata.agent.openai_agent.OpenAIAutomataAgent.run"
+            },
+            {
+                "query": "Which private method does the OpenAI agent call to perform setup?",
+                "result": "automata.agent.openai_agent.OpenAIAutomataAgent._setup"
+            },
+            {
+                "query": "Which class is responsible for executing instructions and reporting results back to the main system?",
+                "result": "automata.agent.openai_agent.OpenAIAutomataAgent"
+            },
+            {
+                "query": "Which method of the OpenAI agent is responsible for executing a single iteration of the task?",
+                "result": "automata.agent.openai_agent.OpenAIAutomataAgent.__next__"
+            },
+            {
+                "query": "Which method of the OpenAI agent is responsible for running the agent and iterating through the tasks until a result is produced?",
+                "result": "automata.agent.openai_agent.OpenAIAutomataAgent.run"
+            }
         ]
     }
 ]