diff --git a/adalflow/adalflow/__init__.py b/adalflow/adalflow/__init__.py index 4c9b45ba..9c1b095f 100644 --- a/adalflow/adalflow/__init__.py +++ b/adalflow/adalflow/__init__.py @@ -1,6 +1,6 @@ __version__ = "0.2.6" -from adalflow.core.component import Component, fun_to_component +from adalflow.core.component import Component from adalflow.core.container import Sequential, ComponentList from adalflow.core.base_data_class import DataClass, DataClassFormatType, required_field @@ -24,6 +24,9 @@ FloatParser, ListParser, BooleanParser, + Parser, + func_to_parser, + FuncParser, ) from adalflow.core.retriever import Retriever from adalflow.components.output_parsers import ( @@ -101,6 +104,9 @@ "FloatParser", "ListParser", "BooleanParser", + "Parser", + "func_to_parser", + "FuncParser", # Output Parsers with dataclass formatting "YamlOutputParser", "JsonOutputParser", diff --git a/adalflow/adalflow/components/agent/react.py b/adalflow/adalflow/components/agent/react.py index 92428e53..fe571051 100644 --- a/adalflow/adalflow/components/agent/react.py +++ b/adalflow/adalflow/components/agent/react.py @@ -1,14 +1,19 @@ """Implementation and optimization of React agent.""" from typing import List, Union, Callable, Optional, Any, Dict +from dataclasses import dataclass, field +from adalflow.core.base_data_class import DataClass from copy import deepcopy import logging +import traceback from adalflow.core.generator import Generator -from adalflow.core.component import Component +from adalflow.optim.grad_component import GradComponent2 +from adalflow.optim.parameter import Parameter, ParameterType from adalflow.core.func_tool import FunctionTool, AsyncCallable from adalflow.core.tool_manager import ToolManager +from adalflow.core.component import Component from adalflow.components.output_parsers import JsonOutputParser from adalflow.core.types import ( StepOutput, @@ -17,6 +22,7 @@ FunctionOutput, FunctionExpression, ) +from adalflow.optim.grad_component import fun_to_grad_component from adalflow.core.model_client import ModelClient from adalflow.utils.logger import printc @@ -25,42 +31,63 @@ __all__ = ["DEFAULT_REACT_AGENT_SYSTEM_PROMPT", "ReActAgent"] -# TODO: test react agent -DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r""" -{# role/task description #} -You are a helpful assistant. +react_agent_task_desc = r""" Answer the user's query using the tools provided below with minimal steps and maximum accuracy. -{# REACT instructions #} + Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action. + + +- For simple queries: Directly call the ``finish`` action and provide the answer. +- For complex queries: + - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery. + - Call one available tool at a time to solve each subquery/subquestion. \ + - At step 'finish', join all subqueries answers and finish the task. +Remember: +- Action must call one of the above tools with name. It can not be empty. +- You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message. + +""" + +# - In this case, you are working as a multi-hop retriever and your answer in finish MUST be verbatim short factoid responses from retrieved context. +# - Answer with only the exact answer phrase, not a full sentence or paragraph. 
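For orientation before the template below: the task description above is what this diff later wraps in a trainable prompt Parameter inside the planner's prompt_kwargs, so the text-gradient optimizer can rewrite the instruction per use case. A minimal sketch restating only that construction (names taken from the imports added at the top of this file):

from adalflow.optim.parameter import Parameter, ParameterType

task_desc_param = Parameter(
    name="react_agent_task_desc",
    data=react_agent_task_desc,
    role_desc="Task description for the ReAct agent planner.",
    param_type=ParameterType.PROMPT,
    requires_opt=True,  # let the optimizer rewrite this instruction during training
)
# handed to the planner Generator via prompt_kwargs={"react_agent_task_desc": task_desc_param, ...}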
+ +DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r""" +{{react_agent_task_desc}} + +- You have a maximum of {{max_steps}} steps to complete the task. Plan your steps carefully. + {# Tools #} {% if tools %} - + You available tools are: {% for tool in tools %} {{ loop.index }}. {{tool}} ------------------------ {% endfor %} - +RULES: +- When the function is a class method and when class_instance exists, use . to call instead (NOT the CLASS NAME) + +{% endif %} +{# Context Variables #} +{% if context_variables %} + +You have access to context_variables with the following keys: +{% for key, value in context_variables.items() %} +{{ key }} +------------------------ +{% endfor %} +You can either pass context_variables or context_variables['key'] to the tools depending on the tool's requirements. + {% endif %} {# output format and examples for output format #} - + {{output_format_str}} - - -{# Task specification to teach the agent how to think using 'divide and conquer' strategy #} -- For simple queries: Directly call the ``finish`` action and provide the answer. -- For complex queries: - - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery. - - Call one available tool at a time to solve each subquery/subquestion. \ - - At step 'finish', join all subqueries answers and finish the task. -Remember: -- Action must call one of the above tools with name. It can not be empty. -- You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message. - - + + ----------------- + User query: {{ input_str }} {# Step History #} @@ -69,14 +96,97 @@ Your previous steps: {% for history in step_history %} Step {{ loop.index }}. +{% if history.action %} "Thought": "{{history.action.thought}}", "Action": "{{history.action.action}}", +{% endif %} "Observation": "{{history.observation}}" + +Current Step/Max Step: {{step_history|length + 1}} / {{max_steps}} ------------------------ {% endfor %} {% endif %} -You:""" + +""" + + +# We have parameters react_agent_task_desc, tools, output_format_str, input_str, step_history +# react_agent_task_desc is trainable per use case +# step_history is a list to track the history, where each time it will be updated with the current step output +def map_step_history_to_prompt(x: Parameter) -> str: + output = [] + for i, step in enumerate(x.data): + step_str = f"Step {i + 1}.\n" + output.append(step_str + step.to_prompt_str()) + return "\n".join(output) + + +def map_step_history_list_to_prompt(x: Parameter) -> str: + output = [] + for i, step in enumerate(x.data.step_history): + step_str = f"Step {i + 1}.\n" + output.append(step_str + step.to_prompt_str()) + return "\n".join(output) + + +class AppendStepHistory(GradComponent2): + def __init__(self): + super().__init__(desc="Append the step_output to the step_history.") + + def call( + self, step_output: StepOutput, step_history: List[StepOutput] + ) -> List[StepOutput]: + """Append the step_output to the step_history.""" + if not step_history: + step_history = [] + step_history = deepcopy(step_history) + + step_history.append(step_output) + return step_history + + def forward(self, *args, **kwargs) -> Parameter: + """Customize how the data is shown in the prompt.""" + output = super().forward(*args, **kwargs) + output.data_in_prompt = map_step_history_to_prompt + return output + + +class FunctionOutputToStepOutput(GradComponent2): + def __init__(self): + super().__init__(desc="Convert the FunctionOutput to StepOutput") + + def call( + self, 
+ action_str: FunctionExpression, + step: int, + result: FunctionOutput, + func: Function, + id: Optional[str] = None, + ) -> StepOutput: + """Convert the action string to StepOutput.""" + step_output = StepOutput(step=step) + if not isinstance(action_str, FunctionExpression): + raise ValueError(f"Expected FunctionExpression, but got {type(action_str)}") + step_output.action = action_str + step_output.function = func + + step_output.observation = result.output + return step_output + + +@dataclass +class ReActOutput(DataClass): + r"""Similar to GeneratorOutput, but with additional step history and final answer.""" + + id: Optional[str] = field( + default=None, metadata={"desc": "The unique id of the output"} + ) + step_history: List[StepOutput] = field( + metadata={"desc": "The history of steps."}, default_factory=list + ) + + answer: Any = field(metadata={"desc": "The final answer."}, default=None) class ReActAgent(Component): @@ -135,12 +245,16 @@ def __init__( max_steps: int = 10, add_llm_as_fallback: bool = True, # TODO: the examples are just for specifying the output format, not end to end input-output examples, need further optimization - examples: List[FunctionExpression] = [], + # examples: List[FunctionExpression] = [], + examples: Union[List[FunctionExpression], List[str]] = [], *, # the following arguments are mainly for the planner model_client: ModelClient, model_kwargs: Dict = {}, + # template for the planner template: Optional[str] = None, # allow users to customize the template + context_variables: Optional[Dict] = None, # context variables + debug: bool = True, ): super().__init__() template = template or DEFAULT_REACT_AGENT_SYSTEM_PROMPT @@ -148,8 +262,14 @@ def __init__( self.max_steps = max_steps self.add_llm_as_fallback = add_llm_as_fallback + self.context_variables = context_variables + self.debug = debug - self._init_tools(tools, model_client, model_kwargs) + tools = self._init_tools(tools, model_client, model_kwargs) + self.tool_manager: ToolManager = ToolManager( + tools=tools, + additional_context={"context_variables": self.context_variables}, + ) ouput_data_class = FunctionExpression example = FunctionExpression.from_function( @@ -160,11 +280,22 @@ def __init__( self._examples = examples + [example] output_parser = JsonOutputParser( - data_class=ouput_data_class, examples=self._examples, return_data_class=True + data_class=ouput_data_class, + examples=self._examples, + return_data_class=True, ) prompt_kwargs = { "tools": self.tool_manager.yaml_definitions, "output_format_str": output_parser.format_instructions(), + "react_agent_task_desc": Parameter( + name="react_agent_task_desc", + data=react_agent_task_desc, + role_desc="Task description for the ReAct agent which functions as a planner using a Large Language Model.", + param_type=ParameterType.PROMPT, + requires_opt=True, + ), + "context_variables": self.context_variables, + "max_steps": self.max_steps, } self.planner = Generator( template=template, @@ -172,9 +303,11 @@ def __init__( output_processors=output_parser, model_client=model_client, model_kwargs=model_kwargs, + use_cache=True, ) - self.step_history: List[StepOutput] = [] + # added this component to the computation graph + self.append_step_history = AppendStepHistory() def _init_tools( self, @@ -182,15 +315,16 @@ def _init_tools( model_client: ModelClient, model_kwargs: Dict, ): - r"""Initialize the tools.""" - tools = deepcopy(tools) + r"""Initialize the tools. 
Using reference or else(copy or deepcopy) we can not set the training/eval mode for each tool.""" + + tools = tools _additional_llm_tool = ( Generator(model_client=model_client, model_kwargs=model_kwargs) if self.add_llm_as_fallback else None ) - def llm_tool(input: str) -> str: + def llm_tool(input: str, **kwargs) -> str: """I answer any input query with llm's world knowledge. Use me as a fallback tool or when the query is simple.""" try: output: GeneratorOutput = _additional_llm_tool( @@ -199,13 +333,13 @@ def llm_tool(input: str) -> str: response = output.data if output else None return response except Exception as e: - log.error(f"Error using the generator: {e}") - print(f"Error using the generator: {e}") + log.error(f"Error using the llm_tool: {e}") + print(f"Error using the llm_tool: {e}") return None - def finish(answer: str) -> str: - """Finish the task with answer.""" + def finish(answer: str, **kwargs) -> str: + """Finish the task with verbatim short factoid responses from retrieved context.""" return answer self._finish = finish @@ -213,93 +347,489 @@ def finish(answer: str) -> str: if self.add_llm_as_fallback: tools.append(llm_tool) tools.append(finish) - self.tool_manager: ToolManager = ToolManager(tools=tools) + return tools - def reset(self): - r"""Reset the agent to start a new query.""" - self.step_history = [] + def _execute_action( + self, + step_output: StepOutput, + response: Union[Parameter, GeneratorOutput], + id: Optional[str] = None, + ) -> Optional[StepOutput]: + """Parse the action string to a function call and execute it. Update the step_output with the result.""" - # TODO: add async execution - def _execute_action(self, action_step: StepOutput) -> Optional[StepOutput]: - """Parse the action string to a function call and execute it. Update the action_step with the result.""" - action = action_step.action - try: + def handle_error(response: Parameter, e: str): - fun: Function = self.tool_manager.parse_func_expr(action) - result: FunctionOutput = self.tool_manager.execute_func(fun) - # TODO: optimize the action_step - action_step.function = fun - action_step.observation = result.output - return action_step - except Exception as e: - log.error(f"Error executing {action}: {e}") - # pass the error as observation so that the agent can continue and correct the error in the next step - action_step.observation = f"Error executing {action}: {e}" - return action_step + @fun_to_grad_component + def set_step_output_with_error( + step_output: StepOutput, error: str, response: Any + ): + """Set the step_output with error.""" + step_output.observation = f"error: {error} at {response.data}" + return step_output - def _run_one_step(self, step: int, prompt_kwargs: Dict, model_kwargs: Dict) -> str: - """Run one step of the agent. 
Plan and execute the action for the step.""" - step_output: StepOutput = StepOutput(step=step) - prompt_kwargs["step_history"] = self.step_history + response.add_successor_map_fn( + successor=set_step_output_with_error, map_fn=lambda x: x.data + ) + return set_step_output_with_error.forward(step_output, e, response) - log.debug( - f"Running step {step} with prompt: {self.planner.prompt(**prompt_kwargs)}" - ) + step = step_output.step - response: GeneratorOutput = self.planner( - prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs - ) - if response.error: - error_msg = f"Error planning step {step}: {response.error}" + if isinstance(response, Parameter): + + try: + function_output_to_step_output = FunctionOutputToStepOutput() + # TO FunctionExpression + + func: Union[Function, Parameter] = self.tool_manager( + expr_or_fun=response, step="parse", map_fn=lambda x: x.data.data + ) + # add action to the step_output + step_output.action = response.data.data + # parse failed + if not isinstance(func, Parameter): + raise ValueError( + f"Expected Parameter, but got {type(func)}: {func}" + ) + if isinstance(func, str): + + @fun_to_grad_component + def set_step_output_with_error( + step_output: StepOutput, data: FunctionExpression, error: str + ): + """Set the step_output with error.""" + step_output.observation = f"Error in parsing the FunctionExperession to Function: {error}" + return step_output + + response.add_successor_map_fn( + successor=set_step_output_with_error, + map_fn=lambda x: x.data.data, + ) + step_output = set_step_output_with_error.forward( + step_output, response, error=func + ) + return step_output + + except Exception as e: + e = f"{e} at parsing error at functionexpression: {response.data}" + return handle_error(response, e) + + try: + # printc(f"func: {func}", color="yellow") + # replace the id + if isinstance(func, Parameter): + func.data.kwargs["id"] = id + + if self.debug: + printc(f"func: {func.data}", color="yellow") + + result: Parameter = self.tool_manager( + expr_or_fun=func, step="execute", map_fn=lambda x: x.data + ) + + if isinstance(result, str): + # create dummy step output + + @fun_to_grad_component + def set_step_output_with_error(step_output: StepOutput, data: str): + """Set the step_output with error.""" + step_output.observation = f"Error {data} in executing action." 
+ + return step_output + + response.add_successor_map_fn( + successor=set_step_output_with_error, + map_fn=lambda x: x.data.data, + ) + step_output = set_step_output_with_error.forward( + step_output, response + ) + + return step_output + + except Exception as e: + e = f"{e} Error executing function: {func}" + return handle_error(response, e) + + try: + # printc(f"result: {result}", color="red") + result.add_successor_map_fn( + successor=function_output_to_step_output, map_fn=lambda x: x.data + ) + response.add_successor_map_fn( + successor=function_output_to_step_output, + map_fn=lambda x: x.data.data, + ) + func.add_successor_map_fn( + successor=function_output_to_step_output, map_fn=lambda x: x.data + ) + step_output = function_output_to_step_output.forward( + action_str=response, + step=step, + result=result, + func=func, + ) + + return step_output + except Exception as e: + e = f"{e} Error converting function output to step output: {result.data}" + + return handle_error(response, e) + + else: + + return self._execute_action_eval_mode( + x=response, + step_output=step_output, + step=step, + id=id, + ) + + def _execute_action_eval_mode( + self, + x: GeneratorOutput, + step_output: StepOutput, + step: int, + id=None, + ) -> StepOutput: + """Execute the action and update the step_output.""" + if x.error or not x.data: + error_msg = f"Error planning step {step}: {x.error}" step_output.observation = error_msg + step_output.action = None log.error(error_msg) + return step_output else: try: - fun_expr: FunctionExpression = response.data + fun_expr: FunctionExpression = x.data + printc(f"Step {step}: {fun_expr}", color="blue") step_output.action = fun_expr log.debug(f"Step {step}: {fun_expr}") if step_output and step_output.action: - step_output = self._execute_action(step_output) - printc(f"Step {step}: \n{step_output}\n_______\n", color="blue") + + fun: Function = self.tool_manager( + expr_or_fun=fun_expr, step="parse" + ) + + step_output.function = fun + result: FunctionOutput = self.tool_manager( + expr_or_fun=fun, step="execute" + ) + step_output.observation = result.output + if self.debug: + printc(f"Step {step}: \n{step_output}\n_______\n", color="blue") + return step_output else: + if self.debug: + printc(f"Failed to parse response for step {step}", color="red") log.error(f"Failed to parse response for step {step}") + return step_output except Exception as e: error_msg = f"Error parsing response for step {step}: {e}" step_output.observation = error_msg log.error(error_msg) + if self.debug: + printc(error_msg, color="red") + return step_output - self.step_history.append(step_output) + def _run_one_step( + self, + step: int, + prompt_kwargs: Dict, + model_kwargs: Dict, + id: Optional[str] = None, + step_history: Union["Parameter", List[str]] = None, + ) -> Union[List[StepOutput], Parameter]: + """Run one step of the agent. Plan and execute the action for the step. + Need to deal with both train and eval mode on the self.planner. 
+ """ + if self.debug: + printc(f"step: {step}", color="yellow") + + prompt_kwargs["step_history"] = step_history + step_history_value = ( + step_history.data if isinstance(step_history, Parameter) else step_history + ) + for data in step_history_value: + if not data: + raise ValueError( + f"Expected StepOutput, but got {type(data)}, all steps: {step_history_value}" + ) + if not isinstance(data, StepOutput): + raise ValueError( + f"Expected StepOutput, but got {type(data)}, all steps: {step_history_value}" + ) - return response + log.debug( + f"Running step {step} with prompt: {self.planner.prompt(**prompt_kwargs)}" + ) + try: - def call( + response: Union[GeneratorOutput, Parameter] = self.planner( + prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs, id=id + ) + + except Exception as e: + error_msg = f"Error happened in planner response at step {step}: {e}.\n" + error_msg += ( + f"Prompt kwargs: {prompt_kwargs}\nModel kwargs: {model_kwargs}\n" + ) + error_msg += f"Traceback:\n{traceback.format_exc()}" + raise RuntimeError(error_msg) + + step_output: StepOutput = StepOutput(step=step) + + try: + + if self.training and isinstance(response, Parameter): + + if not isinstance(response.data, GeneratorOutput): + raise ValueError( + f"Expected GeneratorOutput, but got {type(response.data)}, value: {response.data}" + ) + # Detect planner parsing errors to FunctionExpression so that the prompt can be trained to self-correct + if not isinstance(response.data.data, FunctionExpression): + + @fun_to_grad_component + def set_step_output_with_error( + step_output: StepOutput, data: GeneratorOutput + ): + """Set the step_output with error.""" + step_output.observation = f"Error {data.error} in parsing response: {data.raw_response}, data type: {type(data.data)}" + return step_output + + response.add_successor_map_fn( + successor=set_step_output_with_error, + map_fn=lambda x: x.data, + ) + step_output = set_step_output_with_error.forward( + step_output, response + ) + + else: + + step_output: Parameter = self._execute_action( + step_output, response, id + ) + if self.debug: + printc(f"step_output: {step_output.data}", color="red") + if not isinstance(step_output, Parameter): + raise ValueError( + f"Ensure step_output to be Parameter at training mode. 
Got {type(step_output)}.\n\ + Please check the observation for error details: {step_output}" + ) + # combine the current step_output with the step_history + step_output.add_successor_map_fn( + successor=self.append_step_history, map_fn=lambda x: x.data + ) + step_history.add_successor_map_fn( + successor=self.append_step_history, map_fn=lambda x: x.data + ) + + step_history = self.append_step_history.forward( + step_output, step_history + ) + # connect step_history to the next planner + step_history.add_successor_map_fn( + successor=self.planner, map_fn=lambda x: x.data + ) + if self.debug: + printc( + f"step_history: {step_history.get_prompt_data()}", color="red" + ) + return step_history + + else: + + step_output: StepOutput = self._execute_action( + step_output=step_output, response=response, id=id + ) + if not step_output: + raise RuntimeError( + f"Error executing action at step {step}: {step_output}" + ) + + if self.debug: + printc(f"step_output: {step_output}", color="red") + step_history.append(step_output) + return step_history + except Exception as e: + error_msg = f"Error during execution at step {step}: {e}.\n" + error_msg += f"Step output: {step_output}\nResponse: {response}\n" + error_msg += f"Traceback:\n{traceback.format_exc()}" + raise RuntimeError(error_msg) + + def _check_last_step( + self, step_history: Union["Parameter", List[str]] = None + ) -> bool: + """Check if the last step is the finish step.""" + if not step_history: + return True + + last_step: StepOutput = None + if isinstance(step_history, Parameter): + # try: + step_history_data = step_history.data + last_step = step_history_data[-1] + else: + last_step = step_history[-1] + + if last_step and last_step.function and last_step.function.name == "finish": + return True + return False + + def _get_answer( + self, step_history: Union["Parameter", List[str]] = None + ) -> Union[str, "Parameter"]: + """Get the final answer from the step history. 
+ + When in training mode, we pass the whole step_history to the backward engine to find the feedback + """ + if not step_history: + return None + + last_step: StepOutput = None + if isinstance( + step_history, Parameter + ): # change the step history at the last step + try: + output = ReActOutput( + step_history=step_history.data, + answer=str(step_history.data[-1].observation), + ) + step_history.data = output + step_history.data_in_prompt = map_step_history_list_to_prompt + return step_history + + except Exception as e: + log.error(f"Error getting data from Parameter: {e}") + return None + else: + last_step = step_history[-1] + # printc(f"last_step: {last_step}", color="yellow") + + return str(last_step.observation) + + def call(self, *args, **kwargs) -> ReActOutput: + output = self.bicall(*args, **kwargs) + if not isinstance(output, ReActOutput) or not output: + raise ValueError(f"Expected ReActOutput, but got {type(output)}") + return output + + def forward(self, *args, **kwargs) -> Parameter: + return self.bicall(*args, **kwargs) + + def _is_step_output_last_step(self, step_output: StepOutput) -> bool: + """Check if the step output is the last step.""" + step_output_data = ( + step_output.data if isinstance(step_output, Parameter) else step_output + ) + if ( + step_output_data + and step_output_data.function + and step_output_data.function.name == "finish" + ): + return True + return False + + def bicall( self, input: str, promt_kwargs: Optional[Dict] = {}, model_kwargs: Optional[Dict] = {}, - ) -> Any: + id: Optional[str] = None, + ) -> Union["Parameter", ReActOutput]: r"""prompt_kwargs: additional prompt kwargs to either replace or add to the preset prompt kwargs.""" - prompt_kwargs = {**promt_kwargs, "input_str": input} + # initialize step_history in both training and eval mode + step_history = None + + if self.training: + step_history = Parameter( + data=[], + param_type=ParameterType.INPUT, + name="step_history", + requires_opt=True, + data_in_prompt=map_step_history_to_prompt, + ) + else: + step_history = [] + + # set up the prompts + prompt_kwargs = { + **promt_kwargs, + "input_str": input, + } + printc(f"input_query: {input}", color="red") for i in range(self.max_steps): step = i + 1 try: - self._run_one_step(step, prompt_kwargs, model_kwargs) - if ( - self.step_history[-1].function - and self.step_history[-1].function.name == "finish" - ): + step_history = self._run_one_step( + step, prompt_kwargs, model_kwargs, id, step_history + ) + if self._check_last_step(step_history): break except Exception as e: log.error(f"Error running step {step}: {e}") + printc(f"Error running step {step}: {e}", color="red") + raise e # the only place to raise the error for debugging. In normal cases, the agent should not raise an error. 
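# Note: in training mode `step_history` is a Parameter whose .data holds the list of
# StepOutput objects; `_get_answer` (defined above) repackages it into a ReActOutput and
# returns the Parameter so the backward engine can inspect the full trajectory, while in
# eval mode it simply returns the string observation of the last ('finish') step.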
+ + answer = self._get_answer(step_history) + if self.training: + return answer + # wrap the output + output = ReActOutput(step_history=step_history, id=id, answer=answer) + if self.debug: + printc(f"answer: {output}", color="yellow") - answer = self.step_history[-1].observation - printc(f"answer:\n {answer}", color="green") - log.info(f"step_history: {self.step_history}") - self.reset() - return answer + return output def _extra_repr(self) -> str: s = f"max_steps={self.max_steps}, add_llm_as_fallback={self.add_llm_as_fallback}, " return s + + +if __name__ == "__main__": + from adalflow.components.model_client import OpenAIClient + from adalflow.utils import setup_env + from adalflow.core.func_tool import FunctionTool + + setup_env() + + class App(Component): + def __init__(self): + super().__init__() + self.llm_tool = Generator( + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def llm_as_tool(input: str, id: Optional[str] = None) -> str: + """Used as a calculator tool.""" + printc(f"llm_as_tool: {input}", color="yellow") + + return self.llm_tool(prompt_kwargs={"input_str": input}, id=id) + + self.react_agent = ReActAgent( + tools=[FunctionTool(llm_as_tool, component=self.llm_tool)], + max_steps=2, + add_llm_as_fallback=False, + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def call(self, input: str, id: Optional[str] = None) -> Union[str, "Parameter"]: + return self.react_agent(input, id=id) + + def forward( + self, input: str, id: Optional[str] = None + ) -> Union[str, "Parameter"]: + return self.react_agent(input, id=id) + + # print(OutputParameter.__mro__) + + app = App() + app.eval() + output = app("I want to multiply 3 and 4.", id="123") + print(output) + # output.draw_graph() diff --git a/adalflow/adalflow/components/agent/react_v2.py b/adalflow/adalflow/components/agent/react_v2.py new file mode 100644 index 00000000..00027f51 --- /dev/null +++ b/adalflow/adalflow/components/agent/react_v2.py @@ -0,0 +1,568 @@ +"""Implementation and optimization of React agent.""" + +from typing import List, Union, Callable, Optional, Any, Dict +from dataclasses import dataclass, field +from adalflow.core.base_data_class import DataClass +from copy import deepcopy +import logging +import warnings + + +from adalflow.core.generator import Generator +from adalflow.optim.grad_component import GradComponent +from adalflow.optim.parameter import Parameter, ParameterType +from adalflow.core.func_tool import FunctionTool, AsyncCallable +from adalflow.core.tool_manager import ToolManager +from adalflow.components.output_parsers import JsonOutputParser +from adalflow.core.types import ( + StepOutput, + GeneratorOutput, + Function, + FunctionOutput, + FunctionExpression, +) +from adalflow.core.model_client import ModelClient +from adalflow.utils.logger import printc + + +log = logging.getLogger(__name__) + +__all__ = ["DEFAULT_REACT_AGENT_SYSTEM_PROMPT", "ReActAgent"] + + +react_agent_task_desc = r"""{# role/task description #} +You are a helpful assistant. +Answer the user's query using the tools provided below with minimal steps and maximum accuracy. +{# REACT instructions #} +Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action. + + +{# Task specification to teach the agent how to think using 'divide and conquer' strategy #} +- For simple queries: Directly call the ``finish`` action and provide the answer. 
+- For complex queries: + - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery. + - Call one available tool at a time to solve each subquery/subquestion. \ + - At step 'finish', join all subqueries answers and finish the task. +Remember: +- Action must call one of the above tools with name. It can not be empty. +- You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message. + +""" + +DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r""" +{{react_agent_task_desc}} +{# Tools #} +{% if tools %} + +You available tools are: +{% for tool in tools %} +{{ loop.index }}. +{{tool}} +------------------------ +{% endfor %} + +{% endif %} +{# Context Variables #} +{% if context_variables %} + +You have access to context_variables with the following keys: +{% for key, value in context_variables.items() %} +{{ key }} +------------------------ +{% endfor %} +You can either pass context_variables or context_variables['key'] to the tools depending on the tool's requirements. + +{% endif %} +{# output format and examples for output format #} + +{{output_format_str}} + + +----------------- + +User query: +{{ input_str }} +{# Step History #} +{% if step_history %} + +Your previous steps: +{% for history in step_history %} +Step {{ loop.index }}. +"Thought": "{{history.action.thought}}", +"Action": "{{history.action.action}}", +"Observation": "{{history.observation}}" +------------------------ +{% endfor %} + +{% endif %} + +""" + +# We have parameters react_agent_task_desc, tools, output_format_str, input_str, step_history +# react_agent_task_desc is trainable per use case +# step_history is a list to track the history, where each time it will be updated with the current step output + + +class AppendStepHistory(GradComponent): + def __init__(self): + super().__init__() + self.name = "AppendStepHistory" + self._component_desc = "Append the step_output to the step_history." + + def call( + self, step_output: StepOutput, step_history: List[StepOutput] + ) -> List[StepOutput]: + """Append the step_output to the step_history.""" + if not step_history: + step_history = [] + # make a copy step_history for better tracking + step_history = deepcopy(step_history) + + step_history.append(step_output) + # printc(f"step_history: {step_history}", color="yellow") + return step_history + + +class ExecuteAction(GradComponent): + def __init__(self): + super().__init__() + self.name = "ExecuteAction" + self._component_desc = "Execute the action and output the new step_output." + + def call( + self, + response: GeneratorOutput, + step_output: StepOutput, + execute_action: Callable, + id: Optional[str] = None, + ) -> StepOutput: + """Parse the action string to a function call and execute it. Update the action_step with the result.""" + step = step_output.step + output = execute_action_fn(response, step_output, step, execute_action, id) + if isinstance(output, Parameter): + output = output.full_response + return output + + +class FunctionOutputToStepOutput(GradComponent): + def __init__(self): + super().__init__() + self.name = "FunctionOutputToStepOutput" + self._component_desc = "Convert the FunctionOutput to StepOutput." 
+ + def call(self, output: FunctionOutput, step_output: StepOutput) -> StepOutput: + """Convert the FunctionOutput to StepOutput.""" + + temp_result = output.output + if isinstance(temp_result, Parameter): + step_output.observation = temp_result.data + else: + step_output.observation = temp_result + return step_output + # step_output = StepOutput(step=step) + # step_output.observation = output.output + # return step_output + + +# TODO: make execute_action_fn to a GradComponent to enable the training of the tools too. +def execute_action_fn( + x: GeneratorOutput, step_output: StepOutput, step: int, execute_action: Any, id=None +) -> StepOutput: + """Execute the action and update the step_output.""" + if x.error: + error_msg = f"Error planning step {step}: {x.error}" + step_output.observation = error_msg + log.error(error_msg) + else: + try: + fun_expr: FunctionExpression = x.data + step_output.action = fun_expr + log.debug(f"Step {step}: {fun_expr}") + + if step_output and step_output.action: + step_output = execute_action(step_output, id) + printc(f"Step {step}: \n{step_output}\n_______\n", color="blue") + return step_output + else: + printc(f"Failed to parse response for step {step}", color="red") + log.error(f"Failed to parse response for step {step}") + return step_output + except Exception as e: + error_msg = f"Error parsing response for step {step}: {e}" + step_output.observation = error_msg + log.error(error_msg) + printc(error_msg, color="red") + return step_output + + +@dataclass +class ReActOutput(DataClass): + r"""Similar to GeneratorOutput, but with additional step history and final answer.""" + + id: Optional[str] = field( + default=None, metadata={"desc": "The unique id of the output"} + ) + step_history: List[StepOutput] = field( + metadata={"desc": "The history of steps."}, default_factory=list + ) + + answer: Any = field(metadata={"desc": "The final answer."}, default=None) + + +class ReActAgent(GradComponent): + __doc__ = r"""ReActAgent uses generator as a planner that runs multiple and sequential functional call steps to generate the final response. + + Users need to set up: + - tools: a list of tools to use to complete the task. Each tool is a function or a function tool. + - max_steps: the maximum number of steps the agent can take to complete the task. + - use_llm_as_fallback: a boolean to decide whether to use an additional LLM model as a fallback tool to answer the query. + - model_client: the model client to use to generate the response. + - model_kwargs: the model kwargs to use to generate the response. + - template: the template to use to generate the prompt. Default is DEFAULT_REACT_AGENT_SYSTEM_PROMPT. + + For the generator, the default arguments are: + (1) default prompt: DEFAULT_REACT_AGENT_SYSTEM_PROMPT + (2) default output_processors: JsonParser + + There are `examples` which is optional, a list of string examples in the prompt. + + Example: + + .. 
code-block:: python + + from core.openai_client import OpenAIClient + from components.agent.react import ReActAgent + from core.func_tool import FunctionTool + # define the tools + def multiply(a: int, b: int) -> int: + '''Multiply two numbers.''' + return a * b + def add(a: int, b: int) -> int: + '''Add two numbers.''' + return a + b + agent = ReActAgent( + tools=[multiply, add], + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + # Using examples: + + call_multiply = FunctionExpression.from_function( + thought="I want to multiply 3 and 4.", + + + + Reference: + [1] https://arxiv.org/abs/2210.03629, published in Mar, 2023. + """ + + # TODO: allow users to pass in a few examples. Need to be a list of FunctionExpression instances. + def __init__( + self, + # added arguments specifc to React + tools: List[Union[Callable, AsyncCallable, FunctionTool]] = [], + max_steps: int = 10, + add_llm_as_fallback: bool = True, + # TODO: the examples are just for specifying the output format, not end to end input-output examples, need further optimization + # examples: List[FunctionExpression] = [], + examples: Union[List[FunctionExpression], List[str]] = [], + *, + # the following arguments are mainly for the planner + model_client: ModelClient, + model_kwargs: Dict = {}, + # template for the planner + template: Optional[str] = None, # allow users to customize the template + context_variables: Optional[Dict] = None, # context variables + ): + super().__init__() + template = template or DEFAULT_REACT_AGENT_SYSTEM_PROMPT + + self.max_steps = max_steps + + self.add_llm_as_fallback = add_llm_as_fallback + self.context_variables = context_variables + + self._init_tools(tools, model_client, model_kwargs) + + ouput_data_class = FunctionExpression + example = FunctionExpression.from_function( + thought="I have finished the task.", + func=self._finish, + answer="final answer: 'answer'", + ) + self._examples = examples + [example] + + output_parser = JsonOutputParser( + data_class=ouput_data_class, examples=self._examples, return_data_class=True + ) + prompt_kwargs = { + "tools": self.tool_manager.yaml_definitions, + "output_format_str": output_parser.format_instructions(), + "react_agent_task_desc": Parameter( + name="react_agent_task_desc", + data=react_agent_task_desc, + role_desc="Task description for the ReAct agent which functions as a planner using a Large Language Model.", + param_type=ParameterType.PROMPT, + requires_opt=True, + ), + "context_variables": self.context_variables, + } + self.planner = Generator( + template=template, + prompt_kwargs=prompt_kwargs, + output_processors=output_parser, + model_client=model_client, + model_kwargs=model_kwargs, + ) + + # added this component to the computation graph + self.append_step_history = AppendStepHistory() + self.execute_action = ExecuteAction() + self.function_output_to_step_output = FunctionOutputToStepOutput() + + def _init_tools( + self, + tools: List[Union[Callable, AsyncCallable, FunctionTool]], + model_client: ModelClient, + model_kwargs: Dict, + ): + r"""Initialize the tools.""" + tools = deepcopy(tools) + _additional_llm_tool = ( + Generator(model_client=model_client, model_kwargs=model_kwargs) + if self.add_llm_as_fallback + else None + ) + + def llm_tool(input: str, **kwargs) -> str: + """I answer any input query with llm's world knowledge. 
Use me as a fallback tool or when the query is simple.""" + try: + output: GeneratorOutput = _additional_llm_tool( + prompt_kwargs={"input_str": input} + ) + response = output.data if output else None + return response + except Exception as e: + log.error(f"Error using the generator: {e}") + print(f"Error using the generator: {e}") + + return None + + def finish(answer: str, **kwargs) -> str: + """Finish the task with answer.""" + return answer + + self._finish = finish + + if self.add_llm_as_fallback: + tools.append(llm_tool) + tools.append(finish) + self.tool_manager: ToolManager = ToolManager( + tools=tools, + additional_context={"context_variables": self.context_variables}, + ) + + # TODO: add async execution + def _execute_action( + self, action_step: StepOutput, id: Optional[str] = None + ) -> Optional[StepOutput]: + """Parse the action string to a function call and execute it. Update the action_step with the result.""" + action = action_step.action + try: + + fun: Function = self.tool_manager.parse_func_expr(action) + # replace the id + fun.kwargs["id"] = id + + result: Union[FunctionOutput, Parameter] = self.tool_manager(fun) + action_step.function = fun + if isinstance(result, Parameter): + result.add_successor_map_fn( + successor=self.function_output_to_step_output, + map_fn=lambda x: x.data, + ) + action_step: StepOutput = self.function_output_to_step_output( + output=result, step_output=action_step + ) + elif isinstance(result, FunctionOutput): + action_step.observation = result.output + else: + warnings.warn(f"Fails to parse the result: {result}") + action_step.observation = result + + return action_step + except Exception as e: + log.error(f"Error executing {action}: {e}") + # pass the error as observation so that the agent can continue and correct the error in the next step + action_step.observation = f"Error executing {action}: {e}" + return action_step + + def _run_one_step( + self, + step: int, + prompt_kwargs: Dict, + model_kwargs: Dict, + id: Optional[str] = None, + step_history: Union["Parameter", List[str]] = None, + ) -> Union[StepOutput, Parameter]: + """Run one step of the agent. Plan and execute the action for the step. + Need to deal with both train and eval mode on the self.planner. 
+ """ + + prompt_kwargs["step_history"] = step_history + + log.debug( + f"Running step {step} with prompt: {self.planner.prompt(**prompt_kwargs)}" + ) + + response: Union[GeneratorOutput, Parameter] = self.planner( + prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs, id=id + ) + + # create a new step output + step_output: StepOutput = StepOutput(step=step) + + # connecting two generators in the computation graph, it will set up self.step_history + if isinstance(response, Parameter): + # get the full response + def map_fn(x: Parameter) -> GeneratorOutput: + return x.full_response + + response.add_successor_map_fn(successor=self.execute_action, map_fn=map_fn) + + step_output: Parameter = self.execute_action.forward( + response, step_output, self._execute_action, id + ) + step_output.add_successor_map_fn( + successor=self.append_step_history, map_fn=lambda x: x.data + ) + + step_history = self.append_step_history.forward(step_output, step_history) + # connect step_history to the next planner + step_history.add_successor_map_fn( + successor=self.planner, map_fn=lambda x: x.data + ) + # convert step history back to data + printc(f"step_history: {step_history.data}", color="yellow") + return step_history + + else: + step_output = execute_action_fn( + response, step_output, step, self._execute_action, id + ) + step_history.append(step_output) + return step_history + + def _check_last_step( + self, step_history: Union["Parameter", List[str]] = None + ) -> bool: + """Check if the last step is the finish step.""" + if not step_history: + return True + + last_step: StepOutput = None + if isinstance(step_history, Parameter): + try: + step_history = step_history.data + last_step = step_history[-1] + + except Exception as e: + log.error(f"Error getting data from Parameter: {e}") + return False + else: + last_step = step_history[-1] + + if last_step and last_step.function and last_step.function.name == "finish": + return True + return False + + def _get_answer( + self, step_history: Union["Parameter", List[str]] = None + ) -> Union[str, "Parameter"]: + """Get the final answer from the step history.""" + if not step_history: + return None + + last_step: StepOutput = None + if isinstance(step_history, Parameter): + try: + return step_history + + except Exception as e: + log.error(f"Error getting data from Parameter: {e}") + return None + else: + last_step = step_history[-1] + + return last_step.observation + + def call(self, *args, **kwargs): + return self.bicall(*args, **kwargs) + + def forward(self, *args, **kwargs) -> Parameter: + return self.bicall(*args, **kwargs) + + def _is_step_output_last_step(self, step_output: StepOutput) -> bool: + """Check if the step output is the last step.""" + step_output_data = ( + step_output.data if isinstance(step_output, Parameter) else step_output + ) + if ( + step_output_data + and step_output_data.function + and step_output_data.function.name == "finish" + ): + return True + return False + + def bicall( + self, + input: str, + promt_kwargs: Optional[Dict] = {}, + model_kwargs: Optional[Dict] = {}, + id: Optional[str] = None, + ) -> Union["Parameter", ReActOutput]: + r"""prompt_kwargs: additional prompt kwargs to either replace or add to the preset prompt kwargs.""" + # initialize step_history + step_history = None + if self.training: + step_history = Parameter( + data=[], + param_type=ParameterType.INPUT, + name="step_history", + requires_opt=True, + ) + else: + step_history = [] + + # set up the prompts + prompt_kwargs = { + **promt_kwargs, + "input_str": 
input, + } + + printc(f"input_query: {input}", color="red") + for i in range(self.max_steps): + step = i + 1 + try: + step_history = self._run_one_step( + step, prompt_kwargs, model_kwargs, id, step_history + ) + + if self._check_last_step(step_history): + break + + except Exception as e: + log.error(f"Error running step {step}: {e}") + + answer = self._get_answer(step_history) + if self.training: + return answer + # wrap the output + output = ReActOutput(step_history=step_history, id=id, answer=answer) + return output + + def _extra_repr(self) -> str: + s = f"max_steps={self.max_steps}, add_llm_as_fallback={self.add_llm_as_fallback}, " + return s diff --git a/adalflow/adalflow/components/output_parsers/dataclass_parser.py b/adalflow/adalflow/components/output_parsers/dataclass_parser.py index 6d2e56dd..dc258686 100644 --- a/adalflow/adalflow/components/output_parsers/dataclass_parser.py +++ b/adalflow/adalflow/components/output_parsers/dataclass_parser.py @@ -4,12 +4,12 @@ from typing import Any, Literal, List, Optional import logging -from adalflow.core.component import Component from adalflow.core.prompt_builder import Prompt -from adalflow.core.string_parser import YamlParser, JsonParser +from adalflow.core.string_parser import YamlParser, JsonParser, Parser from adalflow.core.base_data_class import DataClass, DataClassFormatType from adalflow.core.base_data_class import ExcludeType, IncludeType + __all__ = ["DataClassParser"] log = logging.getLogger(__name__) @@ -42,7 +42,7 @@ """ -class DataClassParser(Component): +class DataClassParser(Parser): __doc__ = r"""Made the structured output even simpler compared with JsonOutputParser and YamlOutputParser. 1. Understands __input_fields__ and __output_fields__ from the DataClass (no need to use include/exclude to decide fields). @@ -166,6 +166,9 @@ def get_examples_str( examples_str = Prompt(template=EXAMPLES_FORMAT)(examples=str_examples) return examples_str + def __call__(self, *args, **kwargs): + return self.call(*args, **kwargs) + def call(self, input: str) -> Any: r"""Parse the output string to the desired format and return the parsed output.""" try: diff --git a/adalflow/adalflow/components/output_parsers/outputs.py b/adalflow/adalflow/components/output_parsers/outputs.py index 288cba67..82e5e2cc 100644 --- a/adalflow/adalflow/components/output_parsers/outputs.py +++ b/adalflow/adalflow/components/output_parsers/outputs.py @@ -11,9 +11,8 @@ from typing import Dict, Any, Optional, List import logging -from adalflow.core.component import Component from adalflow.core.prompt_builder import Prompt -from adalflow.core.string_parser import YamlParser, ListParser, JsonParser +from adalflow.core.string_parser import YamlParser, ListParser, JsonParser, Parser from adalflow.core.base_data_class import DataClass, DataClassFormatType from adalflow.core.base_data_class import ExcludeType, IncludeType @@ -69,15 +68,19 @@ YAML_OUTPUT_PARSER_OUTPUT_TYPE = Dict[str, Any] -class OutputParser(Component): +class OutputParser(Parser): __doc__ = r"""The abstract class for all output parsers. + On top of the basic string Parser, it handles structured data interaction: + 1. format_instructions: Return the formatted instructions to use in prompt for the output format. + 2. call: Parse the output string to the desired format and return the parsed output via yaml or json. + This interface helps users customize output parsers with consistent interfaces for the Generator. Even though you don't always need to subclass it. 
- AdalFlow uses two core components: + AdalFlow uses two core classes: 1. the Prompt to format output instruction - 2. A string parser component from core.string_parser for response parsing. + 2. A string parser from core.string_parser for response parsing. """ def __init__(self, *args, **kwargs) -> None: @@ -88,6 +91,9 @@ def format_instructions(self) -> str: r"""Return the formatted instructions to use in prompt for the output format.""" raise NotImplementedError("This is an abstract method.") + def __call__(self, *args: Any, **kwds: Any) -> Any: + return self.call(*args, **kwds) + def call(self, input: str) -> Any: r"""Parse the output string to the desired format and return the parsed output.""" raise NotImplementedError("This is an abstract method.") diff --git a/adalflow/adalflow/core/__init__.py b/adalflow/adalflow/core/__init__.py index a4a67c6a..928b3ce5 100644 --- a/adalflow/adalflow/core/__init__.py +++ b/adalflow/adalflow/core/__init__.py @@ -1,12 +1,23 @@ from .base_data_class import DataClass, required_field, DataClassFormatType -from .component import Component, FunComponent, fun_to_component +from .component import Component from .container import Sequential, ComponentList from .db import LocalDB from .default_prompt_template import DEFAULT_ADALFLOW_SYSTEM_PROMPT from .embedder import Embedder, BatchEmbedder from .generator import Generator, BackwardEngine from .model_client import ModelClient +from .string_parser import ( + Parser, + FuncParser, + func_to_parser, + YamlParser, + JsonParser, + IntParser, + FloatParser, + ListParser, + BooleanParser, +) # from .parameter import Parameter from .prompt_builder import Prompt @@ -51,8 +62,6 @@ "Component", "Sequential", "ComponentList", - "FunComponent", - "fun_to_component", "DataClass", "DataClassFormatType", "required_field", @@ -94,6 +103,16 @@ "DialogTurn", "Conversation", "Tokenizer", + # Parsers + "Parser", + "FuncParser", + "func_to_parser", + "YamlParser", + "JsonParser", + "IntParser", + "FloatParser", + "ListParser", + "BooleanParser", ] for name in __all__: diff --git a/adalflow/adalflow/core/base_data_class.py b/adalflow/adalflow/core/base_data_class.py index 1a379724..543a1090 100644 --- a/adalflow/adalflow/core/base_data_class.py +++ b/adalflow/adalflow/core/base_data_class.py @@ -292,7 +292,9 @@ class TrecDataList(DataClass): # {'data': [{'question': 'What is the capital of France?'}]} """ if not is_dataclass(self): - raise ValueError("to_dict() called on a class type, not an instance.") + raise ValueError( + f"to_dict() is not called on a dataclass instance: {self.__class__}. You might forget to use @dataclass decorator." + ) # convert all fields to its data if its parameter fields = self.__dataclass_fields__ from adalflow.optim.parameter import Parameter diff --git a/adalflow/adalflow/core/component.py b/adalflow/adalflow/core/component.py index d0dd6631..3324f33f 100644 --- a/adalflow/adalflow/core/component.py +++ b/adalflow/adalflow/core/component.py @@ -3,7 +3,6 @@ from collections import OrderedDict, namedtuple from typing import ( - Callable, Dict, Any, Optional, @@ -519,17 +518,85 @@ def named_parameters( # ) # plt.show() - # TODO: do we need to disable this format of calling instead use call and acall extensively? - def __call__(self, *args, **kwargs): - r"""In default, we use sync call.""" - output = self.call(*args, **kwargs) - return output + def forward(self, *args, **kwargs): + """ + User must override this for the training scenario + if bicall is not defined. 
+ """ + raise NotImplementedError("Subclasses must implement `forward` or `bicall`.") def call(self, *args, **kwargs): + """ + User must override this for the inference scenario + if bicall is not defined. + """ + if self._has_bicall(): + output = self.bicall(*args, **kwargs) + return output + raise NotImplementedError("Subclasses must implement `call` or `bicall`.") + + def bicall(self, *args, **kwargs): + """ + If the user provides a `bicall` method, then `__call__` will automatically + dispatch here for both training and inference scenarios. This can internally + decide how to handle training vs. inference, or just produce a single unified + output type. + """ + # Default fallback if not overridden raise NotImplementedError( - f"Component {type(self).__name__} is missing the required 'call' method." + "Optional method. Implement to handle both scenarios in one place." ) + def __call__(self, *args, **kwargs): + # 1. If `bicall` is defined by the user, use it + # and let the `bicall` implementation handle + # the difference between training vs. inference. + from adalflow.optim.parameter import Parameter + + if self._has_bicall(): + output = self.bicall(*args, **kwargs) + + # Validation checks based on training or inference + if self.training: + # Ensure output is a Parameter in training + if not isinstance(output, Parameter): + raise ValueError( + f"Output should be of type Parameter in training mode, but got {type(output)}" + ) + else: + # Ensure output is not a Parameter in inference + if isinstance(output, Parameter): + raise ValueError( + f"Output should not be of type Parameter in inference mode, but got {type(output)}" + ) + return output + + # 2. Otherwise, if `bicall` is not defined, fall back to forward / call + if self.training: + output = self.forward(*args, **kwargs) + # Validation for training + if not isinstance(output, Parameter): + raise ValueError( + f"Output should be of type Parameter in training mode, but got {type(output)}" + ) + return output + else: + output = self.call(*args, **kwargs) + # Validation for inference + if isinstance(output, Parameter): + raise ValueError( + f"Output should not be of type Parameter in inference mode, but got {type(output)}" + ) + return output + + def _has_bicall(self): + """ + Helper method to check if this subclass has overridden bicall. + """ + # The default `bicall` in this class raises NotImplementedError, + # so we can check if the method is still the same one as in `MyModule`. + return self.bicall.__func__ is not Component.bicall + async def acall(self, *args, **kwargs): r"""API call, file io.""" pass @@ -890,6 +957,18 @@ def _get_name(self): def __repr__(self): # We treat the extra repr like the sub-module, one item per line extra_lines = [] + # add training mode + status = "" + if self.training: + status = "training: True" + else: + status = "training: False" + # add teacher mode + if self.teacher_mode: + status += ", teacher_mode: True" + else: + status += ", teacher_mode: False" + extra_lines.append(status) extra_repr = self._extra_repr() # empty string will be split into list [''] if extra_repr: @@ -928,76 +1007,6 @@ def _get_init_args(self, *args, **kwargs) -> Dict[str, Any]: return init_args -# TODO: support async call -class FunComponent(Component): - r"""Component that wraps a function. - - Args: - fun (Callable): The function to be wrapped. 
- - Examples: - - function = lambda x: x + 1 - fun_component = FunComponent(function) - print(fun_component(1)) # 2 - """ - - def __init__(self, fun: Optional[Callable] = None, afun: Optional[Callable] = None): - super().__init__() - self.fun_name = fun.__name__ - EntityMapping.register(self.fun_name, fun) - - def call(self, *args, **kwargs): - fun = EntityMapping.get(self.fun_name) - return fun(*args, **kwargs) - - def _extra_repr(self) -> str: - return super()._extra_repr() + f"fun_name={self.fun_name}" - - -def fun_to_component(fun) -> FunComponent: - r"""Helper function to convert a function into a Component with - its own class name. - - Can be used as both a decorator and a function. - - Args: - fun (Callable): The function to be wrapped. - Returns: - FunComponent: The component that wraps the function. - - Examples: - 1. As a decorator: - >>> @fun_to_component - >>> def my_function(x): - >>> return x + 1 - >>> # is equivalent to - >>> class MyFunctionComponent(FunComponent): - >>> def __init__(self): - >>> super().__init__(my_function) - - 2. As a function: - >>> my_function_component = fun_to_component(my_function) - """ - - # Split the function name by underscores, capitalize each part, and join them back together - class_name = ( - "".join(part.capitalize() for part in fun.__name__.split("_")) + "Component" - ) - # register the function - EntityMapping.register(fun.__name__, fun) - # Define a new component class dynamically - component_class = type( - class_name, - (FunComponent,), - {"__init__": lambda self: FunComponent.__init__(self, fun)}, - ) - # register the component - EntityMapping.register(class_name, component_class) - - return component_class() - - # TODO: not used yet, will further investigate dict mode # class ComponentDict(Component): # r""" diff --git a/adalflow/adalflow/core/func_tool.py b/adalflow/adalflow/core/func_tool.py index 62d4f3fe..419ae846 100644 --- a/adalflow/adalflow/core/func_tool.py +++ b/adalflow/adalflow/core/func_tool.py @@ -4,7 +4,8 @@ """ from typing import Any, Optional, Callable, Awaitable, Union -from inspect import iscoroutinefunction +from inspect import iscoroutinefunction, ismethod, isfunction +import inspect import logging import asyncio import nest_asyncio @@ -16,6 +17,8 @@ Function, ) from adalflow.core import Component +from adalflow.optim.parameter import Parameter +from adalflow.optim.grad_component import GradComponent2 from adalflow.core.functional import ( get_fun_schema, ) @@ -37,11 +40,26 @@ def is_running_in_event_loop() -> bool: return False +def find_instance_name_from_self(instance): + """ + Attempt to find the variable name of the instance in the calling context. + + :param instance: The instance to find the name for. + :return: The variable name of the instance, if found; otherwise, None. + """ + # Inspect the calling stack frame + frame = inspect.stack()[2].frame + for var_name, var_obj in frame.f_locals.items(): + if var_obj is instance: + return var_name + return None + + FunctionType = Union[Callable[..., Any], Awaitable[Callable[..., Any]]] # TODO: improve the support for async functions, similarly a component might be used as a tool -class FunctionTool(Component): +class FunctionTool(GradComponent2): __doc__ = r"""Describing and executing a function via call with arguments. @@ -49,6 +67,32 @@ class FunctionTool(Component): Function be used by LLM as a tool to achieve a specific task. + What function can you pass as a tool? + 1. Any unbound function you wrote outside of a class. + 2. 
Any class method you wrote in your component. It can call `self` and other methods inside of your component. + 3. When the function is using a trainable component, and you can directly use the component's method as a tool or wrap it in a function. But you need to make sure to pass the component to the tool. + + Here are some examples: + + .. code-block:: python + + from adalflow.core.func_tool import FunctionTool + class AgenticRAG(GradComponent): + def __init__(self, ...): + super().__init__() + self.retriever = Retriever() + self.llm = Generator() + + def retriever_as_tool(input: str) -> str: + r"Used as a retriever tool." + return self.retriever(input) + + tools = [FunctionTool(retriever_as_tool, component=self.retriever), + FunctionTool(self.llm.__call__, component=self.llm)] + # if you have trainable component, this will ensure it can be trained together with your whole task pipeline + # if you dont want to train them and simply treating them as a tool, you can call like this + # tools = [FunctionTool(retriever_as_tool), FunctionTool(self.llm.__call__, component=self.llm)] + Features: - Supports both synchronous and asynchronous functions via ``call`` and ``acall``. - Creates a FunctionDefinition from the function using ``get_fun_schema``. @@ -63,18 +107,23 @@ class FunctionTool(Component): - via sandboxed execute directionly using ``sandbox_exec``. + A FunctionTool allows other GradComponent(as a tool) to pass through correctly. """ def __init__( self, fn: FunctionType, + component: Optional[Component] = None, definition: Optional[FunctionDefinition] = None, ): - super().__init__() + super().__init__( + name="FunctionTool", desc="A component calls and executes a function." + ) nest_asyncio.apply() assert fn is not None, "fn must be provided" self.fn = fn + self.component = component # pass it here to control the training mode self._is_async = iscoroutinefunction(fn) self.definition = definition or self._create_fn_definition() @@ -85,22 +134,99 @@ def __init__( def is_async(self) -> bool: return self._is_async + # def _create_fn_definition(self) -> FunctionDefinition: + # name = self.fn.__name__ + # docstring = self.fn.__doc__ + # description = f"{docstring}" + # description = f"{name}{signature(self.fn)}\n{docstring}" + # # description = f"{name}{signature(self.fn)}\n{docstring}" + # fn_parameters = get_fun_schema(name, self.fn) + # return FunctionDefinition( + # func_name=name, func_desc=description, func_parameters=fn_parameters + # ) + def _create_fn_definition(self) -> FunctionDefinition: name = self.fn.__name__ docstring = self.fn.__doc__ - description = f"{docstring}" - description = f"{name}{signature(self.fn)}\n{docstring}" - # description = f"{name}{signature(self.fn)}\n{docstring}" + signature_str = str(signature(self.fn)) + + # Get the class that owns the method, if applicable + cls_name = None + # cls_docstring = None + instance = None + if ismethod(self.fn): # Check if it’s a bound method + instance = self.fn.__self__ + instance = find_instance_name_from_self(instance) + if name == "__call__" and not instance: + raise ValueError( + "Please provide a name for the instance in the calling context" + ) + cls_name = self.fn.__self__.__class__.__name__ + # cls_docstring = getdoc(self.fn.__self__.__class__) + elif isfunction(self.fn): # Unbound method + cls_name = self.fn.__qualname__.split(".")[0] + + # Build the description + description = f"{name}{signature_str}\n" + if cls_name: + description += f"Belongs to class: {cls_name}\n" + if docstring: + description += f"Method 
docstring: {docstring}\n" + # if cls_docstring: + # description += f"Class docstring: {cls_docstring}\n" + + # Get function parameters schema fn_parameters = get_fun_schema(name, self.fn) + return FunctionDefinition( - func_name=name, func_desc=description, func_parameters=fn_parameters + func_name=name, + func_desc=description, + func_parameters=fn_parameters, + class_instance=instance, ) + def forward(self, *args, **kwargs) -> Parameter: + r"""Forward the function tool.""" + return self.bicall(*args, **kwargs) + def call(self, *args: Any, **kwargs: Any) -> FunctionOutput: r"""Execute the function synchronously. Example: + .. code-block:: python + + import time + def sync_function_1(): + time.sleep(1) + return "Function 1 completed" + + tool_1 = FunctionTool(sync_function_1) + output = tool_1.call() + """ + return self.bicall(*args, **kwargs) + # if self._is_async: + # raise ValueError("FunctionTool is asynchronous, use acall instead") + # output, error = None, None + # try: + # output = self.fn(*args, **kwargs) + # except Exception as e: + # log.error(f"Error at calling {self.fn}: {e}") + # # raise ValueError(f"Error: {e}") + # error = str(e) + # return FunctionOutput( + # name=self.definition.func_name, + # # raw_input={"args": args, "kwargs": kwargs}, + # input=Function(name=self.definition.func_name, args=args, kwargs=kwargs), + # output=output, + # error=error, + # ) + + def bicall(self, *args: Any, **kwargs: Any) -> Union[FunctionOutput, Parameter]: + r"""Execute the function synchronously. + + Example: + .. code-block:: python import time @@ -114,12 +240,31 @@ def sync_function_1(): if self._is_async: raise ValueError("FunctionTool is asynchronous, use acall instead") output, error = None, None + + # NOTE: special case: + # self.fn can have both train and eval mode or untrainable as a function. 
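        # If the wrapped fn is itself a GradComponent running in train mode, it may
        # return a Parameter instead of a plain value; that Parameter's data is
        # re-wrapped as a FunctionOutput below so the tool's result stays traceable
        # in the computation graph. Untrainable plain functions fall through to the
        # regular FunctionOutput return at the end.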
try: output = self.fn(*args, **kwargs) except Exception as e: log.error(f"Error at calling {self.fn}: {e}") # raise ValueError(f"Error: {e}") error = str(e) + + if isinstance(output, Parameter): + if not self.training: + raise ValueError( + f"FunctionTool {self.definition.func_name} is in eval mode, but the output is Parameter" + ) + output.data = FunctionOutput( + name=self.definition.func_name, + # raw_input={"args": args, "kwargs": kwargs}, + input=Function( + name=self.definition.func_name, args=args, kwargs=kwargs + ), + output=output.data, + error=error, + ) + return output return FunctionOutput( name=self.definition.func_name, # raw_input={"args": args, "kwargs": kwargs}, @@ -249,9 +394,9 @@ async def run_sync_and_async_mix(): return result - def __call__(self, *args, **kwargs) -> FunctionOutput: - r"""Execute the function synchronously or asynchronously based on the function type.""" - return self.execute(*args, **kwargs) + # def __call__(self, *args, **kwargs) -> FunctionOutput: + # r"""Execute the function synchronously or asynchronously based on the function type.""" + # return self.execute(*args, **kwargs) def _extra_repr(self) -> str: s = f"fn: {self.fn}, async: {self._is_async}, definition: {self.definition}" @@ -260,61 +405,92 @@ def _extra_repr(self) -> str: if __name__ == "__main__": - import asyncio - import time - - async def async_function_1(): - await asyncio.sleep(1) - return "Function 1 completed" - - def sync_function_1(): - time.sleep(1) - return "Function 1 completed" - - async def async_function_2(): - await asyncio.sleep(2) - return "Function 2 completed" - - def sync_function_2(): - time.sleep(2) - return "Function 2 completed" - - async_tool_1 = FunctionTool(async_function_1) - sync_tool_1 = FunctionTool(sync_function_2) - async_tool_2 = FunctionTool(async_function_2) - sync_tool_2 = FunctionTool(sync_function_2) - - def run_sync_and_async_mix_without_wait(): - # both sync and async tool can use execute - # sync tool can also use call - # takes 5 seconds (1+1+2) + overhead - start_time = time.time() - results = [ - async_tool_1.execute(), - sync_tool_1.execute(), - sync_tool_2.call(), - ] - print(results) - end_time = time.time() - print(f"run_sync_and_async_mix_without_wait time: {end_time - start_time}") - return results - - async def run_sync_and_async_mix(): - # both sync and async tool can use execute&to_thread - # async tool can also use acall without to_thread - # takes a bit over 2 seconds max(2) - start_time = time.time() - results = await asyncio.gather( - async_tool_1.execute(), - sync_tool_1.execute(), - async_tool_2.acall(), - ) - print(results) - end_time = time.time() - print(f"run_sync_and_async_mix time: {end_time - start_time}") - return results - - print(async_tool_1.execute()) - - run_sync_and_async_mix_without_wait() - asyncio.run(run_sync_and_async_mix()) + # import asyncio + # import time + + # async def async_function_1(): + # await asyncio.sleep(1) + # return "Function 1 completed" + + # def sync_function_1(): + # time.sleep(1) + # return "Function 1 completed" + + # async def async_function_2(): + # await asyncio.sleep(2) + # return "Function 2 completed" + + # def sync_function_2(): + # time.sleep(2) + # return "Function 2 completed" + + # async_tool_1 = FunctionTool(async_function_1) + # sync_tool_1 = FunctionTool(sync_function_2) + # async_tool_2 = FunctionTool(async_function_2) + # sync_tool_2 = FunctionTool(sync_function_2) + + # def run_sync_and_async_mix_without_wait(): + # # both sync and async tool can use execute + # # sync 
tool can also use call + # # takes 5 seconds (1+1+2) + overhead + # start_time = time.time() + # results = [ + # async_tool_1.execute(), + # sync_tool_1.execute(), + # sync_tool_2.call(), + # ] + # print(results) + # end_time = time.time() + # print(f"run_sync_and_async_mix_without_wait time: {end_time - start_time}") + # return results + + # async def run_sync_and_async_mix(): + # # both sync and async tool can use execute&to_thread + # # async tool can also use acall without to_thread + # # takes a bit over 2 seconds max(2) + # start_time = time.time() + # results = await asyncio.gather( + # async_tool_1.execute(), + # sync_tool_1.execute(), + # async_tool_2.acall(), + # ) + # print(results) + # end_time = time.time() + # print(f"run_sync_and_async_mix time: {end_time - start_time}") + # return results + + # print(async_tool_1.execute()) + + # run_sync_and_async_mix_without_wait() + # asyncio.run(run_sync_and_async_mix()) + + from adalflow.components.model_client import OpenAIClient + from adalflow.core.generator import Generator + from adalflow.optim.parameter import Parameter + from adalflow.core.types import GeneratorOutput + from adalflow.utils import setup_env, printc + + setup_env() + + llm = Generator( + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + # llm.train() + + def llm_as_tool(input: str, id: Optional[str] = None) -> str: + """Used as a calculator tool.""" + printc(f"llm_as_tool: {input}", color="yellow") + + return llm(prompt_kwargs={"input_str": input}, id=id) + + llm_tool = FunctionTool(llm_as_tool, component=llm) + llm_tool.train() + output: Parameter = llm_tool("What is 2+2?") + output.draw_graph() + print(output) + llm_tool.eval() + output: FunctionTool = llm_tool("What is 2+2?") + print(output) + assert isinstance(output, FunctionOutput) + assert isinstance(output.output, GeneratorOutput) diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index baedd8fb..89975ade 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -9,6 +9,7 @@ from typing import Any, Dict, Optional, Union, Callable, Tuple, List import logging +from dataclasses import dataclass, field from adalflow.core.types import ( @@ -21,7 +22,12 @@ from adalflow.core.base_data_class import DataClass -from adalflow.optim.parameter import Parameter, GradientContext +from adalflow.optim.parameter import ( + Parameter, + GradientContext, + Gradient, + OutputParameter, +) from adalflow.optim.types import ParameterType from adalflow.core.prompt_builder import Prompt @@ -32,16 +38,21 @@ from adalflow.utils.cache import CachedEngine from adalflow.tracing.callback_manager import CallbackManager from adalflow.utils.global_config import get_adalflow_default_root_path +from adalflow.core.string_parser import JsonParser, Parser + from adalflow.optim.text_grad.backend_engine_prompt import ( FEEDBACK_ENGINE_TEMPLATE, LLM_CONVERSATION_TEMPLATE, + ALL_PRED_INFO, + OUTPUT_INSTRUCTION, VARIABLE_AND_PEERS_INFO, # CONVERSATION_START_INSTRUCTION_BASE, CONVERSATION_START_INSTRUCTION_CHAIN, OBJECTIVE_INSTRUCTION_BASE, OBJECTIVE_INSTRUCTION_CHAIN, ) +from adalflow.utils.logger import printc __all__ = ["Generator", "BackwardEngine", "create_teacher_generator"] @@ -53,6 +64,20 @@ PromptArgType = Dict[str, Union[str, Parameter]] +@dataclass +class BackwardPassSetup(DataClass): + all_pred_at_once: bool = field( + default=False, metadata={"desc": "Backward all predecessors at once."} + ) + threshold_score_to_compute_grad_for_errors: 
float = field( + default=0.9, + metadata={"desc": "Threshold score to compute gradient for errors."}, + ) + compute_grad_for_errors_only: bool = field( + default=True, metadata={"desc": "Compute gradient for errors only."} + ) + + class Generator(GradComponent, CachedEngine, CallbackManager): __doc__ = """An user-facing orchestration component for LLM prediction. @@ -85,6 +110,10 @@ class Generator(GradComponent, CachedEngine, CallbackManager): {} ) # to create teacher generator from student TODO: might reaccess this + backward_pass_setup: BackwardPassSetup = ( + BackwardPassSetup() + ) # default setup for the backward pass + def __init__( self, *, @@ -95,7 +124,7 @@ def __init__( template: Optional[str] = None, prompt_kwargs: Optional[Dict] = {}, # args for the output processing - output_processors: Optional[Component] = None, + output_processors: Optional[Parser] = None, name: Optional[str] = None, # args for the cache cache_path: Optional[str] = None, @@ -142,6 +171,11 @@ def __init__( self.output_processors = output_processors + if output_processors and (not isinstance(output_processors, Parser)): + raise ValueError( + f"output_processors should be a Parser instance, got {type(output_processors)}" + ) + self.set_parameters(prompt_kwargs) # end of trainable parameters @@ -169,6 +203,9 @@ def __init__( {} ) # used by dynamic computation graph and backpropagation + def update_default_backward_pass_setup(self, setup: BackwardPassSetup): + self.backward_pass_setup = setup + def set_cache_path(self, cache_path: str, model_client: object, model: str): """Set the cache path for the generator.""" @@ -244,7 +281,9 @@ def set_parameters(self, prompt_kwargs: PromptArgType): peers = [ p for k, p in prompt_kwargs.items() - if isinstance(p, Parameter) and k != key + if isinstance(p, Parameter) + and k != key + and p.param_type == ParameterType.PROMPT ] p.set_peers(peers) setattr(self, key, p) @@ -301,7 +340,7 @@ def get_prompt(self, **kwargs) -> str: return self.prompt.call(**kwargs) def _extra_repr(self) -> str: - s = f"model_kwargs={self.model_kwargs}, model_type={self.model_type}" + s = f"model_kwargs={self.model_kwargs}, model_type={self.model_type}, prompt={self.prompt}" return s def _post_call(self, completion: Any) -> GeneratorOutput: @@ -338,6 +377,7 @@ def _pre_call(self, prompt_kwargs: Dict, model_kwargs: Dict) -> Dict[str, Any]: model_kwargs=composed_model_kwargs, model_type=self.model_type, ) + # printc(f"api_kwargs: {api_kwargs}", color="red") return api_kwargs def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any: @@ -454,7 +494,7 @@ def forward( prompt_kwargs[k] = Parameter( data=v, name=f"{self.name}_{k}", - requires_opt=True, + requires_opt=False, param_type=ParameterType.INPUT, data_id=id, ) @@ -506,7 +546,13 @@ def forward( self.model_kwargs, model_kwargs ), } + # printc(f"input_args: {input_args}", color="red") + output = self.call(**input_args, id=id) + if not isinstance(output, GeneratorOutput): + raise ValueError( + f"Output should be of type GeneratorOutput, got {type(output)}" + ) # 2. 
Generate a Parameter object from the output combined_prompt_kwargs = compose_model_kwargs(self.prompt_kwargs, prompt_kwargs) # if self.data_map_func is None: @@ -517,19 +563,32 @@ def forward( ] log.debug(f"Predecessors: {predecessors} for generator {self.name}") - param_data = ( - output.raw_response - if output and not output.error - else f"Error: {output.error}, raw_response: {output.raw_response}" - ) - response: Parameter = Parameter( + + def data_to_prompt_map_fn(data: Parameter) -> str: + data: GeneratorOutput = data.data + # if data.data is not None: + # return data.data + if data.error is not None: + return f"Response: {data.raw_response} parsed with error: {data.error}" + return f" {data.raw_response}" + + # TODO: all parameter should just wrap the whole output. + # this is for training. + param_data = output + response: Parameter = OutputParameter( data=param_data, name=self.name + "_output", role_desc=f"Output from (llm) {self.name}", param_type=ParameterType.GENERATOR_OUTPUT, + data_id=id, + full_response=output, # the data structure + data_in_prompt=data_to_prompt_map_fn, ) response.set_predecessors(predecessors) - response.trace_forward_pass(input_args=input_args, full_response=output) + response.trace_forward_pass( + input_args=input_args, full_response=output, id=self.id, name=self.name + ) + # setattr(response, "full_response", output) # *** special to the generator *** response.trace_api_kwargs(api_kwargs=self._trace_api_kwargs) # attach the demo to the demo parameter @@ -560,15 +619,13 @@ def forward( log.debug(f"Backward engine: {self.backward_engine}") # attach a funtion to compute gradient for predecessors + response.set_grad_fn( BackwardContext( backward_fn=self.backward, backward_engine=self.backward_engine, response=response, - prompt_kwargs={ - k: v.data if isinstance(v, Parameter) else v - for k, v in prompt_kwargs.items() - }, + prompt_kwargs=prompt_kwargs, template=self.template, prompt_str=self.get_prompt(**combined_prompt_kwargs), id=id, @@ -576,7 +633,6 @@ def forward( ) return response - # == pytorch custom autograd function == def backward( self, response: Parameter, # the output of the forward pass @@ -589,6 +645,14 @@ def backward( log.info(f"Generator: Backward: {response.name}") + backward_pass_setup = ( + backward_engine.backward_pass_setup if backward_engine else None + ) + printc( + f"backward pass setup: {backward_pass_setup}, name: {self.name}", + color="red", + ) + children_params = response.predecessors is_intermediate_node = True if response.get_gradient_and_context_text().strip() == "": @@ -597,41 +661,229 @@ def backward( # backward score to the demo parameter for pred in children_params: # if pred.requires_opt: - pred.set_score(response._score) + if response.score is not None: + pred.set_score(response.score) log.debug( - f"backpropagate the score {response._score} to {pred.name}, is_teacher: {self.teacher_mode}" + f"backpropagate the score {response.score} to {pred.name}, is_teacher: {self.teacher_mode}" ) if pred.param_type == ParameterType.DEMOS: # Accumulate the score to the demo pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode + trace_id=id, score=response.score, is_teacher=self.teacher_mode ) log.debug(f"Pred: {pred.name}, traces: {pred._traces}") # 1.backward for text-gradients if backward_engine: - log.debug( + + printc( f"Generator: Backward engine is set for the generator. 
{backward_engine}" ) - for pred in children_params: - if not pred.requires_opt or pred.param_type == ParameterType.DEMOS: - log.debug( - f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + if response.backward_engine_disabled: + for pred in children_params: + pred.backward_engine_disabled = True + return + + all_pred_at_once = backward_pass_setup.all_pred_at_once + + if not all_pred_at_once: + for pred in children_params: + if not pred.requires_opt or pred.param_type == ParameterType.DEMOS: + log.debug( + f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + ) + continue + + self._backward_through_one_predecessor( + pred=pred, + response=response, + prompt_kwargs=prompt_kwargs, + template=template, + backward_engine=backward_engine, + prompt_str=prompt_str, + backward_pass_setup=backward_pass_setup, + is_intermediate_node=is_intermediate_node, + ) + else: + backward = False + for pred in children_params: + if pred.requires_opt and pred.param_type in [ + ParameterType.PROMPT, + ParameterType.GENERATOR_OUTPUT, + ParameterType.RETRIEVER_OUTPUT, + ParameterType.OUTPUT, + ]: + backward = True + break + if backward: + # 2nd approach, backward all that need opt at once. + self._backward_through_all_predecessors( + children_params=children_params, + response=response, + prompt_kwargs=prompt_kwargs, + template=template, + backward_engine=backward_engine, + prompt_str=prompt_str, + backward_pass_setup=backward_pass_setup, + is_intermediate_node=is_intermediate_node, ) - continue - - self._backward_through_one_predecessor( - pred=pred, - response=response, - prompt_kwargs=prompt_kwargs, - template=template, - backward_engine=backward_engine, - prompt_str=prompt_str, - is_intermediate_node=is_intermediate_node, - ) else: log.debug("Backward engine is not set for the generator. No text gradient.") + @staticmethod + def _backward_through_all_predecessors( + children_params: List[Parameter], + response: Parameter, + prompt_kwargs: Dict[str, str], + template: str, + backward_engine: "BackwardEngine", + prompt_str: str, + backward_pass_setup: BackwardPassSetup, + is_intermediate_node: bool = False, + ): + parser = JsonParser() + # instruction and objective is the same for all the children + instruction_str, objective_str = None, None + + # 1. Generate the conversation input and output + input_prompt_kwargs = { + k: v.get_prompt_data() if isinstance(v, Parameter) else v + for k, v in prompt_kwargs.items() + } + + print(f"gt: {response.get_gt()}") + + conversation_prompt_kwargs = { + "input_value": input_prompt_kwargs, + "llm_output": response.get_prompt_data(), + # "gt": response.get_gt(), + } + + conversation_str = Prompt( + prompt_kwargs=conversation_prompt_kwargs, + template=LLM_CONVERSATION_TEMPLATE, + )() + + all_pred_info = Prompt( + prompt_kwargs={"variables": [p.get_param_info() for p in children_params]}, + template=ALL_PRED_INFO, + )() + + conv_ins_template = None # CONVERSATION_START_INSTRUCTION_BASE + obj_ins_template = OBJECTIVE_INSTRUCTION_BASE + if is_intermediate_node: # TODO: this will always be true + conv_ins_template = CONVERSATION_START_INSTRUCTION_CHAIN + obj_ins_template = OBJECTIVE_INSTRUCTION_CHAIN + response_gradient = response.get_gradients_str() + # response_gradient = response.get_gradients_component_schema( + # skip_correct_sample=False + # ) + if not response_gradient: + raise ValueError( + f"Generator: No gradient found for {response}. Please check the response." 
+ ) + + # replace variable and peers with all_pred_info + + instruction_str = Prompt( + template=conv_ins_template, + prompt_kwargs={ + "variable_and_peers_info": all_pred_info, + "conversation_str": conversation_str, + }, + )() + objective_str = Prompt( + template=obj_ins_template, + prompt_kwargs={ + "response_desc": response.role_desc, + "response_gradient": response_gradient, + "instruction_to_backward_engine": response.instruction_to_backward_engine, + }, + )() + + backward_engine_prompt_kwargs = { + "conversation_sec": instruction_str, + "objective_instruction_sec": objective_str, + "output_format_str": OUTPUT_INSTRUCTION, + } + + backward_engine_prompt_str = backward_engine.get_prompt( + **backward_engine_prompt_kwargs + ) + # print(f"Backward engine prompt: {backward_engine_prompt_str}") + + gradient_output: GeneratorOutput = None + response_gradient_list = [""] * len(children_params) + if ( + backward_pass_setup.compute_grad_for_errors_only + and response.score is not None + and float(response.score) + > backward_pass_setup.threshold_score_to_compute_grad_for_errors + ): + manual_response_1 = f"You get score: {response.score}. No noticable error." + response_gradient_list = [manual_response_1] * len(children_params) + raw_response = str(response_gradient_list) + gradient_output = GeneratorOutput( + data=response_gradient_list, raw_response=raw_response + ) + else: + + gradient_output: GeneratorOutput = backward_engine( + prompt_kwargs=backward_engine_prompt_kwargs + ) + if not isinstance(gradient_output, GeneratorOutput): + raise ValueError( + f"Generator: Backward Engine should return a GeneratorOutput. Got {gradient_output} instead." + ) + + # parse the list of gradients + + try: + response_gradient_list = parser.call(gradient_output.data) + except Exception as e: + log.error(f"Error parsing the response_gradient_list: {e}") + failure_message = backward_engine.failure_message_to_optimizer( + gradient_output + ) + if failure_message: + response_gradient_list = [failure_message] * len(children_params) + printc(f"failure_message: {failure_message}", color="red") + + print(f"gradient list: {response_gradient_list}") + + # generate the gradient for each child + for i, pred in enumerate(children_params): + if not pred.requires_opt or pred.param_type == ParameterType.DEMOS: + log.debug( + f"Generator: Skipping {pred} as it does not require optimization." + ) + continue + + gradient_data = ( + response_gradient_list[i] + if response_gradient_list and len(response_gradient_list) > i + else "Failed to get the gradient." + ) + + var_gradient = Gradient( + data=gradient_data, + data_id=response.data_id, + score=response.score, # add score to gradient + from_response=response, + to_pred=pred, + ) + var_gradient.add_context( + GradientContext( + input_output=conversation_str, + response_desc=response.role_desc, + variable_desc=pred.role_desc, # parameter_desc + ) + ) + var_gradient.add_prompt(backward_engine_prompt_str) + pred.add_gradient(var_gradient) + if response.score is not None: + pred.set_score(response.score) + @staticmethod def _backward_through_one_predecessor( pred: Parameter, @@ -640,6 +892,7 @@ def _backward_through_one_predecessor( template: str, backward_engine: "BackwardEngine", prompt_str: str, + backward_pass_setup: BackwardPassSetup, is_intermediate_node: bool = False, ): """Creating gradient/textual feedback for prompt type parameters.""" @@ -648,9 +901,6 @@ def _backward_through_one_predecessor( f"Generator: Skipping {pred} as it does not require optimization." 
) return - log.debug( - f"Generator: Backward through {pred}, is_intermediate_node: {is_intermediate_node}" - ) if pred.check_if_already_computed_gradient_respect_to(response.id): log.debug( @@ -669,10 +919,9 @@ def _backward_through_one_predecessor( } conversation_prompt_kwargs = { - # "variable_name": pred.name, - # "variable_desc": pred.role_desc, "input_value": input_prompt_kwargs, - "llm_output": response.data, + "llm_output": response.get_prompt_data(), + "gt": response.get_gt(), } conversation_str = Prompt( @@ -682,8 +931,11 @@ def _backward_through_one_predecessor( variable_dict = pred.get_param_info() + peers = [p.get_param_info() for p in pred.peers] + # peers = [] + variable_and_peers_info = Prompt( - prompt_kwargs={"variable": variable_dict, "peers": pred.peers}, + prompt_kwargs={"variable": variable_dict, "peers": peers}, template=VARIABLE_AND_PEERS_INFO, )() @@ -692,12 +944,22 @@ def _backward_through_one_predecessor( if is_intermediate_node: # TODO: this will always be true conv_ins_template = CONVERSATION_START_INSTRUCTION_CHAIN obj_ins_template = OBJECTIVE_INSTRUCTION_CHAIN - + response_gradient = response.get_gradients_str() + if not response_gradient: + raise ValueError( + f"Generator: No gradient found for {response}. Please check the response. pred: {pred}" + ) + predecessors = [ + pred.get_param_info() + for pred in response.predecessors + if pred not in pred.peers + ] instruction_str = Prompt( template=conv_ins_template, prompt_kwargs={ "variable_and_peers_info": variable_and_peers_info, "conversation_str": conversation_str, + "predecessors": predecessors, }, )() log.info(f"Conversation start instruction base str: {instruction_str}") @@ -705,9 +967,7 @@ def _backward_through_one_predecessor( template=obj_ins_template, prompt_kwargs={ "response_desc": response.role_desc, - "response_gradient": response.get_gradient_and_context_text( - skip_correct_sample=True - ), + "response_gradient": response_gradient, "instruction_to_backward_engine": pred.instruction_to_backward_engine, }, )() @@ -716,23 +976,34 @@ def _backward_through_one_predecessor( "conversation_sec": instruction_str, "objective_instruction_sec": objective_str, } + backward_engine_prompt_str = backward_engine.get_prompt( + **backward_engine_prompt_kwargs + ) + # print(f"Backward engine prompt: {backward_engine_prompt_str}") gradient_output: GeneratorOutput = None - if response._score is not None and float(response._score) > 0.9: + if ( + backward_pass_setup.compute_grad_for_errors_only + and response.score is not None + and float(response.score) + > backward_pass_setup.threshold_score_to_compute_grad_for_errors + ): log.debug(f"EvalFnToTextLoss: Skipping {pred} as the score is high enough.") # TODO: plus score descriptions - manual_response = f"You get score: {response._score}." + manual_response = f"You get score: {response.score}. No noticable error." gradient_output = GeneratorOutput( data=manual_response, raw_response=manual_response ) else: - # manual_response = f"You get score: {response._score}." - # gradient_output = GeneratorOutput( - # data=manual_response, raw_response=manual_response - # ) gradient_output: GeneratorOutput = backward_engine( prompt_kwargs=backward_engine_prompt_kwargs ) + if not isinstance(gradient_output, GeneratorOutput): + raise ValueError( + f"Generator: Backward Engine should return a GeneratorOutput. Got {gradient_output} instead." 
+ ) + printc(f"Backward engine gradient: {gradient_output}") + # USE this to trace each node's input and output, all nodes can be visualized log.info( f"Generator Backward Engine Prompt: {backward_engine.get_prompt( **backward_engine_prompt_kwargs)}" @@ -741,29 +1012,25 @@ def _backward_through_one_predecessor( gradient_output.data or backward_engine.failure_message_to_optimizer(gradient_output) ) - log.info( - f"Generator Gradient value: {gradient_value}, raw response: {gradient_output.raw_response}" - ) # TODO: make it a debug feature - # prompt_str = backward_engine.get_prompt(**backward_engine_prompt_kwargs) - var_gradient = Parameter( - name=f"{response.name}_to_{pred.name}_grad", - # gradient_prompt=prompt_str, # trace the prompt + var_gradient = Gradient( data=gradient_value, - requires_opt=True, - role_desc=f"feedback for {pred.name}", - score=response._score, # add score to gradient - param_type=ParameterType.GRADIENT, - from_response_id=response.id, + data_id=response.data_id, + score=response.score, # add score to gradient + from_response=response, + to_pred=pred, ) - pred.add_gradient(var_gradient) - pred.set_score(response._score) - - pred.gradients_context[var_gradient] = GradientContext( - context=conversation_str, - response_desc=response.role_desc, - variable_desc=pred.role_desc, # parameter_desc + var_gradient.add_context( + GradientContext( + input_output=conversation_str, + response_desc=response.role_desc, + variable_desc=pred.role_desc, # parameter_desc + ) ) + var_gradient.add_prompt(backward_engine_prompt_str) + pred.add_gradient(var_gradient) + if response.score is not None: + pred.set_score(response.score) def _run_callbacks( self, @@ -798,7 +1065,7 @@ def _run_callbacks( def call( self, - prompt_kwargs: Optional[Dict] = {}, # the input need to be passed to the prompt + prompt_kwargs: Optional[Dict] = {}, # supports both str and parameter value model_kwargs: Optional[Dict] = {}, use_cache: Optional[bool] = None, id: Optional[str] = None, @@ -917,6 +1184,7 @@ def _extra_repr(self) -> str: ] s += f"trainable_prompt_kwargs={prompt_kwargs_repr}" + s += f", prompt={self.prompt}" return s def to_dict(self) -> Dict[str, Any]: @@ -942,7 +1210,11 @@ class BackwardEngine(Generator): # it is a generator with defaule template __doc__ = """The backward engine is a Generator with a default template for the backward pass. - If you want to customize the template, you can create your own backward engine""" + If you want to customize the template, you can create your own backward engine. + + Yet, we will forever keep the training mode to False for the backward engine. + This is achieved by making forward the same as call. 
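    A minimal construction sketch (assumes an OpenAI-backed client; any supported
    model client can be passed the same way):

    .. code-block:: python

        from adalflow.core.generator import BackwardEngine
        from adalflow.components.model_client import OpenAIClient

        backward_engine = BackwardEngine(
            model_client=OpenAIClient(),
            model_kwargs={"model": "gpt-3.5-turbo"},
        )
        # forward() simply delegates to call(), so the engine never runs in training mode.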
+ """ def __init__(self, **kwargs): if kwargs is None: @@ -960,6 +1232,10 @@ def call(self, **kwargs) -> GeneratorOutputType: raise ValueError(f"Error in the backward engine: {output.error}") return output + def forward(self, **kwargs): + r"""Forward pass for the backward engine.""" + return self.call(**kwargs) + @staticmethod def failure_message_to_optimizer( gradient_response: GeneratorOutput, diff --git a/adalflow/adalflow/core/prompt_builder.py b/adalflow/adalflow/core/prompt_builder.py index 0d998b63..4197134e 100644 --- a/adalflow/adalflow/core/prompt_builder.py +++ b/adalflow/adalflow/core/prompt_builder.py @@ -7,9 +7,10 @@ from jinja2 import Template, Environment, StrictUndefined, meta -from adalflow.core.component import Component from adalflow.core.default_prompt_template import DEFAULT_ADALFLOW_SYSTEM_PROMPT from adalflow.optim.parameter import Parameter +from dataclasses import dataclass +from adalflow.core.base_data_class import DataClass logger = logging.getLogger(__name__) @@ -17,7 +18,8 @@ T = TypeVar("T") -class Prompt(Component): +@dataclass +class Prompt(DataClass): __doc__ = r"""Renders a text string(prompt) from a Jinja2 template string. In default, we use the :ref:`DEFAULT_ADALFLOW_SYSTEM_PROMPT` as the template. @@ -125,6 +127,9 @@ def print_prompt(self, **kwargs) -> str: except Exception as e: raise ValueError(f"Error rendering Jinja2 template: {e}") + def __call__(self, *args: Any, **kwds: Any) -> Any: + return self.call(*args, **kwds) + def call(self, **kwargs) -> str: """ Renders the prompt template with keyword arguments. Allow None values. @@ -147,6 +152,15 @@ def _extra_repr(self) -> str: s += f", prompt_variables: {self.prompt_variables}" return s + def __repr__(self) -> str: + s = f"template: {self.template}" + prompt_kwargs_str = _convert_prompt_kwargs_to_str(self.prompt_kwargs) + if prompt_kwargs_str: + s += f", prompt_kwargs: {prompt_kwargs_str}" + if self.prompt_variables: + s += f", prompt_variables: {self.prompt_variables}" + return s + @classmethod def from_dict(cls: type[T], data: Dict[str, Any]) -> T: obj = super().from_dict(data) diff --git a/adalflow/adalflow/core/retriever.py b/adalflow/adalflow/core/retriever.py index fb65a298..3778fdd8 100644 --- a/adalflow/adalflow/core/retriever.py +++ b/adalflow/adalflow/core/retriever.py @@ -13,8 +13,8 @@ from adalflow.optim.grad_component import GradComponent if TYPE_CHECKING: - from adalflow.core.generator import Generator -from adalflow.optim.parameter import Parameter + pass +from adalflow.optim.parameter import Parameter, OutputParameter from adalflow.optim.types import ParameterType log = logging.getLogger(__name__) @@ -123,41 +123,57 @@ def forward( top_k = Parameter( data=top_k or self.top_k, name="top_k", - requires_opt=True, + requires_opt=False, param_type=ParameterType.HYPERPARAM, ) if input is None: raise ValueError("Input cannot be empty") - response = super().forward(input, top_k=top_k, **kwargs) + response: OutputParameter = super().forward(input, top_k=top_k, id=id, **kwargs) + if not isinstance(response, OutputParameter): + raise ValueError( + f"Retriever forward: Expect OutputParameter, but got {type(response)}" + ) + response.trace_forward_pass( + input_args={"input": input, "top_k": top_k}, + full_response=response.data, + id=self.id, + name=self.name, + ) response.param_type = ( ParameterType.RETRIEVER_OUTPUT ) # be more specific about the type return response - def backward( - self, - response: Parameter, - id: Optional[str] = None, - backward_engine: Optional["Generator"] = None, - 
): - r"""Backward the response to pass the score to predecessors. - Function as a relay component""" - log.info(f"Retriever backward: {response.name}") - children_params = response.predecessors - - # is_chain = True - if response.get_gradient_and_context_text().strip() == "": - log.info(f"Generator: Backward: No gradient found for {response}.") - - for pred in children_params: - pred.set_score(response._score) - from adalflow.utils.logger import printc - - printc( - f"Retriever: Backward: {pred.name} set_score: {response._score}, {response.name}", - "blue", - ) - if pred.param_type == ParameterType.DEMOS: - pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode - ) + # def backward( + # self, + # response: Parameter, + # id: Optional[str] = None, + # backward_engine: Optional["Generator"] = None, + # ): + # r"""Backward the response to pass the score to predecessors. + # Function as a relay component""" + # log.info(f"Retriever backward: {response.name}") + # children_params = response.predecessors + + # # is_chain = True + # if response.get_gradient_and_context_text().strip() == "": + # log.info(f"Generator: Backward: No gradient found for {response}.") + + # for pred in children_params: + # pred.set_score(response._score) + # from adalflow.utils.logger import printc + + # printc( + # f"Retriever: Backward: {pred.name} set_score: {response._score}, {response.name}", + # "blue", + # ) + # if pred.param_type == ParameterType.DEMOS: + # pred.add_score_to_trace( + # trace_id=id, score=response._score, is_teacher=self.teacher_mode + # ) + + # # pass the gradients + # for grad in response.gradients: + # # make a copy of the gradient + # grad = deepcopy(grad) + # pred.add_gradient(grad) diff --git a/adalflow/adalflow/core/string_parser.py b/adalflow/adalflow/core/string_parser.py index 3001b512..b18f619d 100644 --- a/adalflow/adalflow/core/string_parser.py +++ b/adalflow/adalflow/core/string_parser.py @@ -2,10 +2,11 @@ From simple data types like boolean, integer, and float to more complex data types like JSON, YAML, and list strings.""" -from typing import Dict, List, Union +from typing import Dict, List, Union, Optional, Callable import logging +from adalflow.utils.registry import EntityMapping + -from adalflow.core.component import Component import adalflow.core.functional as F log = logging.getLogger(__name__) @@ -13,12 +14,15 @@ BOOLEAN_PARSER_OUTPUT_TYPE = bool -class Parser(Component): +class Parser: __doc__ = r"""Base class for all string parsers.""" def __init__(self): super().__init__() + def __call__(self, input: str) -> object: + return self.call(input) + def call(self, input: str) -> object: raise NotImplementedError( "Parser subclasses must implement the __call__ method" @@ -246,3 +250,72 @@ def call(self, input: str) -> YAML_PARSER_OUTPUT_TYPE: return yaml_obj except Exception as e: raise ValueError(f"Error: {e}") + + +class FuncParser(Parser): + r"""Component that wraps a function. + + Args: + fun (Callable): The function to be wrapped. 
+ + Examples: + + function = lambda x: x + 1 + fun_component = FunComponent(function) + print(fun_component(1)) # 2 + """ + + def __init__(self, fun: Optional[Callable] = None, afun: Optional[Callable] = None): + super().__init__() + self.fun_name = fun.__name__ + EntityMapping.register(self.fun_name, fun) + + def call(self, *args, **kwargs): + fun = EntityMapping.get(self.fun_name) + return fun(*args, **kwargs) + + def __repr__(self) -> str: + return super().__repr__() + f"fun_name={self.fun_name}" + + +def func_to_parser(fun) -> FuncParser: + r"""Helper function to convert a function into a Parser class. + its own class name. + + Can be used as both a decorator and a function. + + Args: + fun (Callable): The function to be wrapped. + Returns: + FuncParser: The component that wraps the function. + + Examples: + 1. As a decorator: + >>> @func_to_parser + >>> def my_function(x): + >>> return x + 1 + >>> # is equivalent to + >>> class MyFunctionParser(FuncParser): + >>> def __init__(self): + >>> super().__init__(my_function) + + 2. As a function: + >>> my_function_parser = func_to_parser(my_function) + """ + + # Split the function name by underscores, capitalize each part, and join them back together + class_name = ( + "".join(part.capitalize() for part in fun.__name__.split("_")) + "Component" + ) + # register the function + EntityMapping.register(fun.__name__, fun) + # Define a new component class dynamically + parser_class = type( + class_name, + (FuncParser,), + {"__init__": lambda self: FuncParser.__init__(self, fun)}, + ) + # register the component + EntityMapping.register(class_name, parser_class) + + return parser_class() diff --git a/adalflow/adalflow/core/tool_manager.py b/adalflow/adalflow/core/tool_manager.py index 3538762a..caf5137a 100644 --- a/adalflow/adalflow/core/tool_manager.py +++ b/adalflow/adalflow/core/tool_manager.py @@ -2,13 +2,27 @@ The ToolManager manages a list of tools, context, and all ways to execute functions. 
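A minimal usage sketch (the ``add`` tool below is illustrative, not part of the library):

.. code-block:: python

    from adalflow.core.func_tool import FunctionTool
    from adalflow.core.tool_manager import ToolManager
    from adalflow.core.types import FunctionExpression

    def add(a: int, b: int) -> int:
        # Add two integers.
        return a + b

    manager = ToolManager(tools=[FunctionTool(add)])
    expr = FunctionExpression(action="add(2, 3)")

    fn = manager.call(expr_or_fun=expr, step="parse")     # Function(name="add", args=[2, 3])
    out = manager.call(expr_or_fun=expr, step="execute")  # FunctionOutput with output == 5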
""" -from typing import List, Dict, Optional, Any, Callable, Awaitable, Union +from typing import ( + List, + Dict, + Optional, + Any, + Callable, + Awaitable, + Union, + overload, + Literal, +) import logging from copy import deepcopy import asyncio +from adalflow.optim.parameter import Parameter, ParameterType import nest_asyncio +import warnings -from adalflow.core import Component +from adalflow.core.container import ComponentList +from adalflow.optim.grad_component import GradComponent2 +from adalflow.core.component import Component from adalflow.core.func_tool import FunctionTool from adalflow.core.types import ( FunctionDefinition, @@ -16,6 +30,8 @@ Function, FunctionExpression, ) +from adalflow.utils import printc + from adalflow.core.functional import ( parse_function_call_expr, @@ -42,15 +58,90 @@ def run_async_in_new_loop(coro): asyncio.set_event_loop(None) +class CallFunctionTool(Component): + __doc__ = """Contains other unit gradcomponent such as calling + a FunctionTool""" + + def __init__(self): + super().__init__() + + def forward(self, func: Parameter, context: Dict[str, object]): + return self.bicall(func, context=context) + + def call(self, func: Function, context: Dict[str, object]) -> FunctionOutput: + return self.bicall(func, context=context) + + def bicall( + self, + func: Union[Function, Parameter], + context: Dict[str, object] = {}, + ): + if isinstance(func, Parameter): + # printc(f"context: {context}", color="yellow") + func_data: Function = func.map_to_successor(self) + if not isinstance(func_data, Function): + raise ValueError(f"Error parsing function expression: {func}") + tool: FunctionTool = context[func_data.name] + # print(f"tool training: {tool.training}") + output = tool.forward(*func_data.args, **func_data.kwargs) + + from adalflow.optim.grad_component import fun_to_grad_component + + # this will automatically create the outputparam, and connect output, func to the outputParam + @fun_to_grad_component + def dummy_pass_through_for_untrainable_fn(output, func): + return output + + # NOTE: special case: handle the function which is not a grad_component + # here we have to specifically converts it to a parameter and handles the predecessors + # there is no trainable parameters inside of the tool but the tool response itself can be optimized by response optimizer + if not isinstance(output, Parameter): + return dummy_pass_through_for_untrainable_fn.forward(output, func) + else: + # reconnect the predecessor for tracing as it is not done in tool.forward + output.predecessors.add(func) + return output + else: + tool: FunctionTool = context[func.name] + output = tool.call(*func.args, **func.kwargs) + return output + + +class FunctionExperssionToFunction(GradComponent2): + def __init__(self): + super().__init__(desc="Convert FunctionExpression to Function") + + def call(self, expr: FunctionExpression, context: Dict[str, object]) -> Function: + + assert isinstance( + expr, FunctionExpression + ), f"Expected FunctionExpression, got {type(expr)}" + + expr_str = expr.action + func_name, args, kwargs = parse_function_call_expr(expr_str, context) + # printc( + # f"func_name: {func_name}, args: {args}, kwargs: {kwargs}", color="yellow" + # ) + output = Function( + name=func_name, + args=args, + kwargs=kwargs, + thought=expr.thought, + ) + # printc(f"output: {output}", color="yellow") + return output + + # TODO: good to track all the failed function calls +# Tool manager is a task component class ToolManager(Component): __doc__ = r""""Manage a list of tools, context, 
and all ways to execute functions. - yaml and json definitions are for quick access to the definitions of the tools. - If you need more specification, such as using exclude field, you can use the function_definitions. - Args: + ToolManager is a task component that does not need its own backward function. + yaml and json definitions are for quick access to the definitions of the tools. + If you need more specification, such as using exclude field, you can use the function_definitions. """ def __init__( @@ -62,56 +153,214 @@ def __init__( ): super().__init__() nest_asyncio.apply() # Apply nest_asyncio to handle nested loops - # super(LocalDB, self).__init__() - self.tools = [ + tools = [ ( FunctionTool(fn=deepcopy(tool)) if not isinstance(tool, FunctionTool) - else deepcopy(tool) + else tool ) for tool in tools ] - self._context_map = {tool.definition.func_name: tool for tool in self.tools} + self.tools = ComponentList(tools) + self._context_map = self.create_context_map_from_tools(self.tools) self._additional_context = additional_context or {} self.context = {**self._context_map, **self._additional_context} log.info( f"Initialized ToolManager with {len(self.tools)} tools and additional context {self._additional_context}" ) + @staticmethod + def get_context_index(tool: FunctionTool) -> Dict[str, object]: + index = tool.definition.func_name + if tool.definition.class_instance: + index = f"{tool.definition.class_instance}.{index}" + output = {index: tool} + if tool.definition.func_name == "__call__": + # add another index of directly using the classinstance + output[f"{tool.definition.class_instance}"] = tool + return output + + @staticmethod + def create_context_map_from_tools(tools: List[FunctionTool]) -> Dict[str, object]: + output: Dict[str, object] = {} + for tool in tools: + tool_map = ToolManager.get_context_index(tool) + for k, v in tool_map.items(): + if k in output: + # raise ValueError(f"Duplicate key {k} in the context map.") + warnings.warn(f"Duplicate key {k} in the context map.") + continue + output[k] = v + return output + @property def yaml_definitions(self) -> List[str]: - return [tool.definition.to_yaml() for tool in self.tools] + output = [] + for tool in self.tools: + if not tool.definition.class_instance: + output.append(tool.definition.to_yaml(exclude=["class_instance"])) + else: + output.append(tool.definition.to_yaml()) + output.append(tool.definition.to_yaml(exclude=["class_instance"])) + return output @property def json_definitions(self) -> List[str]: - return [tool.definition.to_json() for tool in self.tools] + output = [] + for tool in self.tools: + if not tool.definition.class_instance: + output.append(tool.definition.to_json(exclude=["class_instance"])) + else: + output.append(tool.definition.to_json()) + output.append(tool.definition.to_json(exclude=["class_instance"])) + return output @property def function_definitions(self) -> List[FunctionDefinition]: return [tool.definition for tool in self.tools] - def parse_func_expr(self, expr: FunctionExpression) -> Function: + def parse_func_expr( + self, + expr: Union[FunctionExpression, Parameter], + map_fn: Callable = lambda x: x.data, + ) -> Union[Function, Parameter]: r"""Parse the function call expression.""" - try: - expr_str = expr.action - func_name, args, kwargs = parse_function_call_expr(expr_str, self.context) - return Function(name=func_name, args=args, kwargs=kwargs) - except Exception as e: - log.error(f"Error {e} parsing function call expression: {expr_str}") - raise ValueError(f"Error {e} parsing 
function call expression: {expr_str}") - def execute_func(self, func: Function) -> FunctionOutput: - r"""Execute the function. If the function is async, use asyncio.run to execute it.""" - try: - tool: FunctionTool = self.context[func.name] - if tool.is_async: - log.debug("Running async function in new loop") - return run_async_in_new_loop(tool.acall(*func.args, **func.kwargs)) + if isinstance(expr, Parameter): + # try: + + func = FunctionExperssionToFunction() + expr.add_successor_map_fn(func, map_fn=map_fn) + # print("FunctionExperssionToFunction") + output = func.forward(expr, context=self.context) + # print(f"output data: {output.data}") + return output + + # except Exception as e: + # error_msg = ( + # f"Error {e} parsing function call expression: {map_fn(expr)}" + # ) + # return error_msg + # else: + try: + expr_str = expr.action + func_name, args, kwargs = parse_function_call_expr( + expr_str, self.context + ) + return Function(name=func_name, args=args, kwargs=kwargs) + except Exception as e: + log.error(f"Error {e} parsing function call expression: {expr}") + raise ValueError(f"Error {e} parsing function call expression: {expr}") + + @overload + def call( + self, *, expr_or_fun: FunctionExpression, step: Literal["parse"] = "parse" + ) -> Function: ... + + @overload + def call( + self, *, expr_or_fun: FunctionExpression, step: Literal["execute"] = "execute" + ) -> FunctionOutput: ... + + @overload + def call( + self, *, expr_or_fun: Function, step: Literal["execute"] = "parse" + ) -> Function: ... + + @overload + def call( + self, *, expr_or_fun: Function, step: Literal["execute"] = "execute" + ) -> FunctionOutput: ... + + def call( + self, + *, + expr_or_fun: Union[FunctionExpression, Function], + step: Literal["execute"] = "execute", + ) -> Union[FunctionOutput, Function, Parameter]: + print(f"self.training: {self.training}, expr_or_fun: {expr_or_fun}") + if not isinstance(expr_or_fun, (Function, FunctionExpression)): + raise ValueError( + f"expr_or_fun should be either a Function or FunctionExpression. Got {expr_or_fun}" + ) + if step == "parse": + if isinstance(expr_or_fun, Function): + return expr_or_fun + return self.parse_func_expr(expr_or_fun) + else: + if isinstance(expr_or_fun, Function): + return self.execute_func(expr_or_fun) + return self.execute_func_expr(expr_or_fun) + + def forward( + self, + *, + expr_or_fun: Union[FunctionExpression, Function, Parameter], + step: Literal["parse", "execute"] = "execute", + map_fn: Callable = lambda x: x.data, # how to map the parameter to the needed data + ) -> Union[FunctionOutput, Function, Parameter]: + "Run a forward pass on the tool manager such as parsing function expression or executing function." + if isinstance(expr_or_fun, Parameter): + expr_or_fun_data = map_fn(expr_or_fun) + if step == "execute": + if isinstance(expr_or_fun_data, Function): + return self.execute_func(expr_or_fun, map_fn=map_fn) + else: + raise NotImplementedError( + "Only Function expressions are supported for now." + ) else: - return tool.call(*func.args, **func.kwargs) - except Exception as e: - log.error(f"Error {e} executing function: {func}") - raise ValueError(f"Error {e} executing function: {func}") + if isinstance(expr_or_fun_data, FunctionExpression): + output = self.parse_func_expr(expr_or_fun, map_fn=map_fn) + return output + else: + raise NotImplementedError( + f"Only function call expressions are supported for now. Got {expr_or_fun_data}" + ) + else: + raise ValueError(f"expr_or_fun should be a Parameter. 
Got {expr_or_fun}") + # return self.call(expr_or_fun=expr_or_fun, step=step) + + def execute_func( + self, func: Union[Function, Parameter], map_fn: Callable = lambda x: x.data + ) -> Union[FunctionOutput, Parameter]: + r"""Execute the function. If the function is async, use asyncio.run to execute it.""" + + if isinstance(func, Parameter): + try: + + call_func_tool = CallFunctionTool() + func.add_successor_map_fn(call_func_tool, map_fn=map_fn) + return call_func_tool.forward(func, context=self.context) + + except Exception as e: + log.error(f"Error {e} executing function: {func.data}") + error_msg = f"Error {e} executing function: {func.data}" + return error_msg + + else: + try: + tool: FunctionTool = self.context[func.name] + if tool.is_async: + return run_async_in_new_loop(tool.acall(*func.args, **func.kwargs)) + + else: + return tool.call(*func.args, **func.kwargs) + except Exception as e: + log.error(f"Error {e} executing function: {func}") + raise ValueError(f"Error {e} executing function: {func}") + + # try: + # tool: FunctionTool = self.context[func.name] + # if tool.is_async: + # log.debug("Running async function in new loop") + # return run_async_in_new_loop(tool.acall(*func.args, **func.kwargs)) + # else: + # # TODO ensure it is set to traing mode + # return tool.forward(*func.args, **func.kwargs) + # except Exception as e: + # log.error(f"Error {e} executing function: {func}") + # raise ValueError(f"Error {e} executing function: {func}") async def execute_func_async(self, func: Function) -> FunctionOutput: r"""Execute the function. If the function is sync, use await to execute it.""" @@ -125,16 +374,40 @@ async def execute_func_async(self, func: Function) -> FunctionOutput: log.error(f"Error {e} executing function: {func}") raise ValueError(f"Error {e} executing function: {func}") - def execute_func_expr(self, expr: FunctionExpression) -> FunctionOutput: + def execute_func_expr( + self, + expr: Union[FunctionExpression, Parameter], + map_fn: Callable = lambda x: x.data, + ) -> Union[FunctionOutput, Parameter]: r"""Execute the function expression. 
Support both sync and async functions.""" - func: Function = self.parse_func_expr(expr) - try: - return self.execute_func(func) - except Exception as e: - # NOTE: if the function expression is not a function call, try to execute it as a function expression - log.error(f"Error {e} executing function expression: {expr}") - raise ValueError(f"Error {e} executing function expression: {expr}") + if isinstance(expr, Parameter): + + func: Parameter = self.parse_func_expr(expr, map_fn=map_fn) + if not isinstance(func, Parameter): + raise ValueError(f"Error parsing function expression: {expr}") + + # execute the function + output: Parameter = self.execute_func(func) + if not isinstance(output, Parameter): + raise ValueError(f"Error executing function expression: {expr}") + output.predecessors.add(expr) + return output + else: + + try: + func: Function = self.parse_func_expr(expr) + if not isinstance(func, Function): + raise ValueError(f"Error parsing function expression: {expr}") + + return self.execute_func(func) + except Exception as e: + # NOTE: if the function expression is not a function call, try to execute it as a function expression + log.error(f"Error {e} executing function expression: {expr}") + # raise ValueError(f"Error {e} executing function expression: {expr}") + return FunctionOutput( + name=expr.action, input=expr, output=None, error=None + ) async def execute_func_expr_async(self, expr: FunctionExpression) -> FunctionOutput: r"""Execute the function expression. Support both sync and async functions.""" @@ -186,3 +459,54 @@ def execute_func_expr_via_eval(self, expr: FunctionExpression) -> FunctionOutput def _extra_repr(self) -> str: s = f"Tools: {self.tools}, Additional Context: {self._additional_context}" return s + + +if __name__ == "__main__": + # test tool manager + from adalflow.core.func_tool import FunctionTool + from adalflow.components.model_client import OpenAIClient + from adalflow.core.generator import Generator + from adalflow.optim.parameter import Parameter + from adalflow.utils import setup_env, printc + + setup_env() + + llm = Generator( + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + # llm.train() + + def llm_as_tool(input: str, id: Optional[str] = None) -> str: + """Used as a calculator tool.""" + printc(f"llm_as_tool: {input}", color="yellow") + + return llm(prompt_kwargs={"input_str": input}, id=id) + + llm_tool = FunctionTool(llm_as_tool, component=llm) + # llm_tool.train() + # output: Parameter = llm_tool("What is 2+2?") + # output.draw_graph() + # print(output) + + tool_manager = ToolManager(tools=[llm_tool]) + tool_manager.train() + expr_or_fun = Parameter( + name="expr_or_fun", + data=FunctionExpression(action="llm_as_tool('What is 2+2?')"), + eval_input="What is 2+2?", + param_type=ParameterType.INPUT, + ) + output: Parameter = tool_manager(expr_or_fun=expr_or_fun, step="parse") + print(output) + print(output.predecessors) + assert len(output.predecessors) == 1 + # output = tool_manager(output, step="execute") + # print(output) + # output.draw_graph() + + # expr_or_fun = FunctionExpression(action="llm_as_tool('What is 2+2?')") + + # tool_manager.eval() + # output = tool_manager(expr_or_fun=expr_or_fun, step="execute") + # print(output) diff --git a/adalflow/adalflow/core/types.py b/adalflow/adalflow/core/types.py index 18724510..0ef80390 100644 --- a/adalflow/adalflow/core/types.py +++ b/adalflow/adalflow/core/types.py @@ -13,7 +13,6 @@ Literal, Callable, Awaitable, - Type, ) from collections import OrderedDict from 
dataclasses import ( @@ -25,6 +24,7 @@ from datetime import datetime import uuid import logging +import json from adalflow.core.base_data_class import DataClass, required_field from adalflow.core.tokenizer import Tokenizer @@ -281,19 +281,25 @@ class RetrieverOutput(DataClass): It is up to the subclass of Retriever to specify the type of query and document. """ - doc_indices: List[int] = field(metadata={"desc": "List of document indices"}) - doc_scores: Optional[List[float]] = field( + id: str = field(default=None, metadata={"desc": "The unique id of the output"}) + + doc_indices: List[int] = field( + default=required_field, metadata={"desc": "List of document indices"} + ) + doc_scores: List[float] = field( default=None, metadata={"desc": "List of document scores"} ) - query: Optional[RetrieverQueryType] = field( + query: RetrieverQueryType = field( default=None, metadata={"desc": "The query used to retrieve the documents"} ) - documents: Optional[List[RetrieverDocumentType]] = field( + documents: List[RetrieverDocumentType] = field( default=None, metadata={"desc": "List of retrieved documents"} ) -RetrieverOutputType = List[RetrieverOutput] # so to support multiple queries at once +RetrieverOutputType = Union[ + List[RetrieverOutput], RetrieverOutput +] # so to support multiple queries at once ####################################################################################### @@ -305,8 +311,13 @@ class RetrieverOutput(DataClass): @dataclass class FunctionDefinition(DataClass): __doc__ = r"""The data modeling of a function definition, including the name, description, and parameters.""" - - func_name: str = field(metadata={"desc": "The name of the tool"}) + class_instance: Optional[Any] = field( + default=None, + metadata={"desc": "The instance of the class this function belongs to"}, + ) + func_name: str = field( + metadata={"desc": "The name of the tool"}, default=required_field + ) func_desc: Optional[str] = field( default=None, metadata={"desc": "The description of the tool"} ) @@ -406,9 +417,10 @@ def add(a, b): The benefits are less failed function calls. """ - thought: Optional[str] = field( - default=None, metadata={"desc": "Why the function is called"} - ) + # question: str = field( + # default=None, metadata={"desc": "The question to ask the LLM"} + # ) + thought: str = field(default=None, metadata={"desc": "Why the function is called"}) action: str = field( default_factory=required_field, metadata={"desc": _action_desc}, @@ -513,36 +525,19 @@ class StepOutput(DataClass, Generic[T]): default=None, metadata={"desc": "The execution result shown for this action"} ) - @classmethod - def with_action_type(cls, action_type: Type[T]) -> Type["StepOutput[T]"]: - """ - Create a new StepOutput class with the specified action type. - - Use this if you want to create schema for StepOutput with a specific action type. - - Args: - action_type (Type[T]): The type to set for the action attribute. - - Returns: - Type[StepOutput[T]]: A new subclass of StepOutput with the specified action type. - - Example: - - .. 
code-block:: python - - from adalflow.core.types import StepOutput, FunctionExpression - - StepOutputWithFunctionExpression = StepOutput.with_action_type(FunctionExpression) - """ - # Create a new type variable map - type_var_map = {T: action_type} - - # Create a new subclass with the updated type - new_cls = type(cls.__name__, (cls,), {"__type_var_map__": type_var_map}) - - # Update the __annotations__ to reflect the new type of action - new_cls.__annotations__["action"] = action_type - return new_cls + def to_prompt_str(self) -> str: + output: Dict[str, Any] = {} + if self.action and isinstance(self.action, FunctionExpression): + if self.action.thought: + output["thought"] = self.action.thought + output["action"] = self.action.action if self.action else None + if self.observation: + output["observation"] = ( + self.observation.to_dict() + if hasattr(self.observation, "to_dict") + else str(self.observation) + ) + return json.dumps(output) ####################################################################################### diff --git a/adalflow/adalflow/datasets/big_bench_hard.py b/adalflow/adalflow/datasets/big_bench_hard.py index f98f2517..3e628f74 100644 --- a/adalflow/adalflow/datasets/big_bench_hard.py +++ b/adalflow/adalflow/datasets/big_bench_hard.py @@ -24,7 +24,7 @@ class BigBenchHard(Dataset): Size for each split: - train: 50 examples - - val: 50 examples + - val: 100 examples - test: 100 examples Args: @@ -120,11 +120,11 @@ def _check_or_download_dataset(self, data_path: str = None, split: str = "train" ] val_examples = [ {"x": ex["input"], "y": ex["target"], "id": str(uuid.uuid4())} - for ex in examples[50:100] + for ex in examples[50:150] ] test_examples = [ {"x": ex["input"], "y": ex["target"], "id": str(uuid.uuid4())} - for ex in examples[150:250] + for ex in examples[150:] ] # ensure the @@ -150,7 +150,7 @@ def get_default_task_instruction(): if __name__ == "__main__": from adalflow.datasets.big_bench_hard import BigBenchHard - dataset = BigBenchHard(task_name="word_sorting", split="train") + dataset = BigBenchHard(task_name="object_counting", split="test") print(dataset[0:10]) print(len(dataset)) print(dataset.get_default_task_instruction()) diff --git a/adalflow/adalflow/datasets/hotpot_qa.py b/adalflow/adalflow/datasets/hotpot_qa.py index 528ae388..22919e77 100644 --- a/adalflow/adalflow/datasets/hotpot_qa.py +++ b/adalflow/adalflow/datasets/hotpot_qa.py @@ -1,13 +1,12 @@ import random import os -import csv -from typing import Literal +from typing import Literal, List from adalflow.utils.lazy_import import safe_import, OptionalPackages from adalflow.utils.data import Dataset -from adalflow.utils.file_io import save_csv +from adalflow.utils.file_io import save_csv, save_json, load_json from adalflow.datasets.utils import prepare_dataset_path from adalflow.core.base_data_class import DataClass from adalflow.datasets.types import HotPotQAData @@ -23,6 +22,16 @@ def __init__( size: int = None, **kwargs, ) -> None: + r""" + official_train: 15661 + sampled_trainset: 11745 + sampled_valset: 3916 + test: 7405 + + All answers are a phrase in the supporting context where we can choose supporting facts from the context. + + You can specify the size of the dataset to load by setting the size parameter. 
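+
+        A minimal usage sketch (illustrative, mirroring the ``__main__`` examples in this module):
+
+        >>> from adalflow.datasets import HotPotQA
+        >>> trainset = HotPotQA(split="train", size=20)
+        >>> trainset[0]  # a HotPotQAData instance
+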
+ """ if split not in ["train", "val", "test"]: raise ValueError("Split must be one of 'train', 'val', 'test'") @@ -36,7 +45,8 @@ def __init__( self.task_name = f"hotpot_qa_{keep_details}" data_path = prepare_dataset_path(self.root, self.task_name) # download and save - split_csv_path = os.path.join(data_path, f"{split}.csv") + split_csv_path = os.path.join(data_path, f"{split}.json") + print(f"split_csv_path: {split_csv_path}") self._check_or_download_dataset( split_csv_path, split, only_hard_examples, keep_details ) @@ -46,12 +56,20 @@ def __init__( # created_data_class = DynamicDataClassFactory.from_dict( # "HotPotQAData", {"id": "str", "question": "str", "answer": "str"} - with open(split_csv_path, newline="") as csvfile: - reader = csv.DictReader(csvfile) - for i, row in enumerate(reader): - if size is not None and i >= size: - break - self.data.append(HotPotQAData.from_dict(row)) + # with open(split_csv_path, newline="") as csvfile: + # reader = csv.DictReader(csvfile) + # for i, row in enumerate(reader): + # if size is not None and i >= size: + # break + # self.data.append(HotPotQAData.from_dict(row)) + + self.data = load_json(split_csv_path) + if size is not None: + # use random seed to make sure the same data is loaded + # random.Random(0).shuffle(self.data) + self.data = self.data[:size] + # convert to dataclass + self.data = [HotPotQAData.from_dict(d) for d in self.data] def _check_or_download_dataset( self, @@ -90,6 +108,24 @@ def _check_or_download_dataset( hf_official_dev = load_dataset( "hotpot_qa", "fullwiki", split="validation", trust_remote_code=True ) + data_path_dir = os.path.dirname(data_path) + # save all the original data + all_original_keys = hf_official_train[0].keys() + for split, examples in zip( + ["hf_official_train", "hf_official_dev"], + [hf_official_train, hf_official_dev], + ): + target_path = os.path.join(data_path_dir, f"{split}.csv") + save_csv(examples, f=target_path, fieldnames=all_original_keys) + # for example in examples: + # # is answer in the context + # print(f"example: {example}") + # context = str(json.dumps(example["context"])) + # if example["answer"] in context: + # print(f"answer in context") + # else: + # print(f"answer not in context") + print(f"saved {split} to {target_path}") keys = ["question", "answer"] if keep_details == "all": keys = [ @@ -101,33 +137,39 @@ def _check_or_download_dataset( "context", ] elif keep_details == "dev_titles": - keys = ["id", "question", "answer", "supporting_facts"] + keys = ["id", "question", "answer", "supporting_facts", "context"] - official_train = [] + official_train = [] # 15661 for raw_example in hf_official_train: if raw_example["level"] == "hard": example = {k: raw_example[k] for k in keys} if "supporting_facts" in example: example["gold_titles"] = set(example["supporting_facts"]["title"]) - del example["supporting_facts"] + # del example["supporting_facts"] official_train.append(example) + print(f"official_train: {len(official_train)}") rng = random.Random(0) rng.shuffle(official_train) - sampled_trainset = official_train[: len(official_train) * 75 // 100] + sampled_trainset = official_train[: len(official_train) * 70 // 100] # 11745 + print(f"sampled_trainset: {len(sampled_trainset)}") - sampled_valset = official_train[ - len(official_train) * 75 // 100 : + sampled_valset = official_train[ # 3916 + len(official_train) * 70 // 100 : ] # this is not the official dev set + print(f"sampled_valset: {len(sampled_valset)}") + # for example in self._train: # if keep_details == "dev_titles": # del 
example["gold_titles"] - test = [] + test = [] # 7405 + + print(f"raw_example: {hf_official_dev[0]}") for raw_example in hf_official_dev: assert raw_example["level"] == "hard" example = { @@ -136,24 +178,44 @@ def _check_or_download_dataset( } if "supporting_facts" in example: example["gold_titles"] = set(example["supporting_facts"]["title"]) - del example["supporting_facts"] + + # del example["supporting_facts"] test.append(example) - keys = ["id", "question", "answer", "gold_titles"] + keys = ["id", "question", "answer", "gold_titles", "context"] + + # split test into val and test + # random shuff the test + rng.shuffle(test) + test_split = test[: len(test) * 50 // 100] # 3702 + val_split = test[len(test) * 50 // 100 :] # 3703 + # save to csv for split, examples in zip( ["train", "val", "test"], - [sampled_trainset, sampled_valset, test], + [sampled_trainset, val_split, test_split], ): # target_path = prepare_dataset_path(self.root, task_name, split) - save_csv(examples, f=data_path, fieldnames=keys) + target_path = os.path.join(data_path_dir, f"{split}.json") + # filter the examples with only the keys + save_examples: List[HotPotQAData] = [] + for example in examples: + save_example = {k: example[k] for k in keys if k in example} + save_example = HotPotQAData.from_dict(save_example) + save_examples.append(save_example.to_dict()) + save_json(save_examples, f=target_path) + if split == "train": + print(f"train example: {examples[0]}") + print(f"saved {split} to {target_path}") if split == "train": return sampled_trainset elif split == "val": return sampled_valset - else: + elif split == "test": return test + else: + raise ValueError("Split must be one of 'train', 'val', 'test'") def __getitem__(self, index) -> DataClass: return self.data[index] @@ -172,3 +234,107 @@ def __len__(self): print(len(testdataset)) print(f"valdataset[0]: {valdataset[0]}") print(f"testdataset[0]: {testdataset[0]}") + # example = { + # "id": "5a8b57f25542995d1e6f1371", + # "question": "Were Scott Derrickson and Ed Wood of the same nationality?", + # "answer": "yes", + # "type": "comparison", + # "level": "hard", + # "supporting_facts": { + # "title": ["Scott Derrickson", "Ed Wood"], + # "sent_id": [0, 0], + # }, + # "context": { + # "title": [ + # "Adam Collis", + # "Ed Wood (film)", + # "Tyler Bates", + # "Doctor Strange (2016 film)", + # "Hellraiser: Inferno", + # "Sinister (film)", + # "Deliver Us from Evil (2014 film)", + # "Woodson, Arkansas", + # "Conrad Brooks", + # "The Exorcism of Emily Rose", + # ], + # "sentences": [ + # [ + # "Adam Collis is an American filmmaker and actor.", + # " He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.", + # " He also studied cinema at the University of Southern California from 1991 to 1997.", + # ' Collis first work was the assistant director for the Scott Derrickson\'s short "Love in the Ruins" (1995).', + # ' In 1998, he played "Crankshaft" in Eric Koyanagi\'s "Hundred Percent".', + # ], + # [ + # "Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.", + # " The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.", + # " Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.", + # ], + # [ + # "Tyler Bates (born June 5, 1965) is an American musician, 
music producer, and composer for films, television, and video games.", + # ' Much of his work is in the action and horror film genres, with films like "Dawn of the Dead, 300, Sucker Punch," and "John Wick."', + # " He has collaborated with directors like Zack Snyder, Rob Zombie, Neil Marshall, William Friedkin, Scott Derrickson, and James Gunn.", + # ' With Gunn, he has scored every one of the director\'s films; including "Guardians of the Galaxy", which became one of the highest grossing domestic movies of 2014, and its 2017 sequel.', + # ' In addition, he is also the lead guitarist of the American rock band Marilyn Manson, and produced its albums "The Pale Emperor" and "Heaven Upside Down".', + # ], + # [ + # "Doctor Strange is a 2016 American superhero film based on the Marvel Comics character of the same name, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures.", + # " It is the fourteenth film of the Marvel Cinematic Universe (MCU).", + # " The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.", + # ' In "Doctor Strange", surgeon Strange learns the mystic arts after a career-ending car accident.', + # ], + # [ + # "Hellraiser: Inferno (also known as Hellraiser V: Inferno) is a 2000 American horror film.", + # ' It is the fifth installment in the "Hellraiser" series and the first "Hellraiser" film to go straight-to-DVD.', + # " It was directed by Scott Derrickson and released on October 3, 2000.", + # " The film concerns a corrupt detective who discovers Lemarchand's box at a crime scene.", + # " The film's reviews were mixed.", + # ], + # [ + # "Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. 
Robert Cargill.", + # " It stars Ethan Hawke as fictional true-crime writer Ellison Oswalt who discovers a box of home movies in his attic that puts his family in danger.", + # ], + # [ + # "Deliver Us from Evil is a 2014 American supernatural horror film directed by Scott Derrickson and produced by Jerry Bruckheimer.", + # ' The film is officially based on a 2001 non-fiction book entitled "Beware the Night" by Ralph Sarchie and Lisa Collier Cool, and its marketing campaign highlighted that it was "inspired by actual accounts".', + # " The film stars Eric Bana, Édgar Ramírez, Sean Harris, Olivia Munn, and Joel McHale in the main roles and was released on July 2, 2014.", + # ], + # [ + # "Woodson is a census-designated place (CDP) in Pulaski County, Arkansas, in the United States.", + # " Its population was 403 at the 2010 census.", + # " It is part of the Little Rock–North Little Rock–Conway Metropolitan Statistical Area.", + # " Woodson and its accompanying Woodson Lake and Wood Hollow are the namesake for Ed Wood Sr., a prominent plantation owner, trader, and businessman at the turn of the 20th century.", + # " Woodson is adjacent to the Wood Plantation, the largest of the plantations own by Ed Wood Sr.", + # ], + # [ + # "Conrad Brooks (born Conrad Biedrzycki on January 3, 1931 in Baltimore, Maryland) is an American actor.", + # " He moved to Hollywood, California in 1948 to pursue a career in acting.", + # ' He got his start in movies appearing in Ed Wood films such as "Plan 9 from Outer Space", "Glen or Glenda", and "Jail Bait."', + # " He took a break from acting during the 1960s and 1970s but due to the ongoing interest in the films of Ed Wood, he reemerged in the 1980s and has become a prolific actor.", + # " He also has since gone on to write, produce and direct several films.", + # ], + # [ + # "The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.", + # " The film is loosely based on the story of Anneliese Michel and follows a self-proclaimed agnostic who acts as defense counsel (Linney) representing a parish priest (Wilkinson), accused by the state of negligent homicide after he performed an exorcism.", + # ], + # ], + # }, + # } + + # # save to csv + # keys = ["id", "question", "answer", "gold_titles", "context"] + # example["gold_titles"] = set(example["supporting_facts"]["title"]) + + # # test, save to hotpotQA + + # data = HotPotQAData.from_dict({k: example[k] for k in keys}) + # print(f"data: {data}") + + # # save to json + # save_json([data.to_dict()], f="test.json") + + # # load from json + # loaded_data = load_json("test.json") + # # convert to dataclass + # data = HotPotQAData.from_dict(loaded_data[0]) + # print(f"data: {data}") diff --git a/adalflow/adalflow/datasets/trec.py b/adalflow/adalflow/datasets/trec.py index 75267609..da20ad2d 100644 --- a/adalflow/adalflow/datasets/trec.py +++ b/adalflow/adalflow/datasets/trec.py @@ -43,8 +43,6 @@ def sample_subset_dataset(dataset, num_samples: int, sample_weights): def prepare_datasets(): from datasets import load_dataset - from datasets import Dataset as HFDataset - from adalflow.optim.sampler import ClassSampler dataset = load_dataset("trec") print(f"train: {len(dataset['train'])}, test: {len(dataset['test'])}") # 5452, 500 @@ -59,19 +57,26 @@ def prepare_datasets(): len_train_dataset = len(org_train_dataset) org_test_dataset = dataset["test"] - eval_size = 6 * num_classes - - class_sampler = ClassSampler( - org_train_dataset.select( 
- range(0, len_train_dataset // 3) - ), # created huggingface dataset type - num_classes=num_classes, - get_data_key_fun=lambda x: x["coarse_label"], - ) - - eval_dataset_split = [sample.data for sample in class_sampler(eval_size)] - # convert this back to huggingface dataset - eval_dataset_split = HFDataset.from_list(eval_dataset_split) + # eval_size = 18 * num_classes + + # class_sampler = ClassSampler( + # org_train_dataset.select( + # range(0, len_train_dataset // 3) + # ), # created huggingface dataset type + # num_classes=num_classes, + # get_data_key_fun=lambda x: x["coarse_label"], + # ) + + # eval_dataset_split = [sample.data for sample in class_sampler(eval_size)] + # # convert this back to huggingface dataset + # eval_dataset_split = HFDataset.from_list(eval_dataset_split) + + # sample eval from the first 1/3 of the train dataset + # eval_dataset_split = org_train_dataset.select(range(len_train_dataset // 3)) + # # sample a subset of the eval dataset, just randomly sampling + # eval_dataset_split = sample_subset_dataset( + # eval_dataset_split, eval_size, torch.ones(len(eval_dataset_split)) + # ) # (2) create train dataset from the last 2/3 of the train dataset, 100 samples per class train_dataset_split = org_train_dataset.select( @@ -85,7 +90,7 @@ def prepare_datasets(): train_dataset_split, train_size, class_weights ) print(f"train example: {train_dataset_split[0]}") - print(f"train: {len(train_dataset_split)}, eval: {len(eval_dataset_split)}") + # print(f"train: {len(train_dataset_split)}, eval: {len(eval_dataset_split)}") # get the count for each class count_by_class: Dict[str, int] = {} @@ -98,14 +103,23 @@ def prepare_datasets(): # create the test dataset from the test dataset # weights for the test dataset labels = torch.tensor(org_test_dataset["coarse_label"]) - class_weights = calculate_class_weights(labels) + # class_weights = calculate_class_weights(labels) - test_size = eval_size * 4 - # weighted sampling on the test dataset - test_dataset_split = sample_subset_dataset( - org_test_dataset, test_size, class_weights + print(f"total test dataset: {len(org_test_dataset)}") + + # shuff, and get the first 1/3 as validation, 2/3 as test + test_dataset_split = org_test_dataset.shuffle(seed=42) + eval_dataset_split = test_dataset_split.select(range(len(test_dataset_split) // 3)) + test_dataset_split = test_dataset_split.select( + range(len(test_dataset_split) // 3, len(test_dataset_split)) ) + # test_size = eval_size * 2 + # # weighted sampling on the test dataset + # test_dataset_split = sample_subset_dataset( + # org_test_dataset, test_size, torch.ones(len(org_test_dataset)) + # ) + print( f"train example: {train_dataset_split[0]}, type: {type(train_dataset_split[0])}" ) diff --git a/adalflow/adalflow/datasets/types.py b/adalflow/adalflow/datasets/types.py index 3315d2d8..84950a48 100644 --- a/adalflow/adalflow/datasets/types.py +++ b/adalflow/adalflow/datasets/types.py @@ -1,5 +1,6 @@ import uuid from dataclasses import dataclass, field +from typing import Dict from adalflow.core.base_data_class import DataClass @@ -32,6 +33,34 @@ class HotPotQAData(Example): metadata={"desc": "The set of titles that support the answer"}, default=None, ) + context: Dict[str, object] = field( + metadata={"desc": "The context of the question"}, + default=None, + ) + + __input_fields__ = ["question"] + __output_fields__ = ["answer"] + + # @staticmethod + # def from_dict(d: Dict[str, Any]) -> "HotPotQAData": + # # Preprocess gold_titles + # if "gold_titles" in d and 
isinstance(d["gold_titles"], str): + # try: + # d["gold_titles"] = json.loads(d["gold_titles"]) + # except json.JSONDecodeError: + # # Replace single quotes with double quotes + # fixed_str = d["gold_titles"].replace("'", '"') + # d["gold_titles"] = set(json.loads(fixed_str)) + + # # Preprocess context + # if "context" in d and isinstance(d["context"], str): + # try: + # d["context"] = json.loads(d["context"]) + # except json.JSONDecodeError: + # fixed_str = d["context"].replace("'", '"') + # d["context"] = json.loads(fixed_str) + + # return HotPotQAData(**d) @dataclass @@ -52,3 +81,31 @@ class TrecData(BaseData): __input_fields__ = ["question"] # follow this order too. __output_fields__ = ["class_name", "class_index"] + + +if __name__ == "__main__": + # test the hotpotqa data + data = HotPotQAData( + question="What is the capital of France?", + answer="Paris", + gold_titles=set(["Paris", "France"]), + context={"Paris": "The capital of France"}, + ) + + data_dict = data.to_dict() + print("data_dict", data_dict) + data = HotPotQAData.from_dict(data_dict) + print("data", data) + + from adalflow.utils.file_io import save_json, load_json + + # save json + save_json(data_dict, f="task.json") + # load json + data_dict_loaded = load_json(f="task.json") + + print("data_dict_loaded", data_dict_loaded) + + # restore the data + data_restored = HotPotQAData.from_dict(data_dict_loaded) + print("data_restored", data_restored) diff --git a/adalflow/adalflow/eval/__init__.py b/adalflow/adalflow/eval/__init__.py index 67de685c..1d9ecd08 100644 --- a/adalflow/adalflow/eval/__init__.py +++ b/adalflow/adalflow/eval/__init__.py @@ -1,5 +1,5 @@ from .answer_match_acc import AnswerMatchAcc -from .retriever_recall import RetrieverRecall +from .retriever_recall import RetrieverEvaluator from .llm_as_judge import LLMasJudge, DEFAULT_LLM_EVALUATOR_PROMPT from .g_eval import ( GEvalJudgeEvaluator, @@ -10,7 +10,7 @@ __all__ = [ "AnswerMatchAcc", - "RetrieverRecall", + "RetrieverEvaluator", "LLMasJudge", "DEFAULT_LLM_EVALUATOR_PROMPT", "GEvalJudgeEvaluator", diff --git a/adalflow/adalflow/eval/answer_match_acc.py b/adalflow/adalflow/eval/answer_match_acc.py index b45e61c1..03da6cfa 100644 --- a/adalflow/adalflow/eval/answer_match_acc.py +++ b/adalflow/adalflow/eval/answer_match_acc.py @@ -3,6 +3,7 @@ from typing import List, Literal from adalflow.eval.base import BaseEvaluator, EvaluationResult from adalflow.optim.parameter import Parameter +from adalflow.eval.utils import normalize_answer, f1_score class AnswerMatchAcc(BaseEvaluator): @@ -27,12 +28,20 @@ class AnswerMatchAcc(BaseEvaluator): 1.0 >>> acc_list [1.0, 1.0, 1.0] + + References: + 1. 
HotpotQA: https://github.com/hotpotqa/hotpot/blob/master/hotpot_evaluate_v1.py """ def __init__( self, type: Literal[ - "exact_match", "fuzzy_match", "rouge_score", "bleu_score", "bert_score" + "exact_match", + "fuzzy_match", + "rouge_score", + "bleu_score", + "bert_score", + "f1_score", ] = "exact_match", ): self.type = type @@ -81,11 +90,13 @@ def compute_single_item( f"Error converting pred_answer and gt_answer to string: {e}" ) if self.type == "exact_match": - return 1.0 if y == y_gt else 0.0 + return 1.0 if normalize_answer(y) == normalize_answer(y_gt) else 0.0 elif self.type == "fuzzy_match": - y = y.lower() - y_gt = y_gt.lower() + y = normalize_answer(y) + y_gt = normalize_answer(y_gt) return 1.0 if y_gt in y else 0.0 + elif self.type == "f1_score": + return f1_score(y, y_gt) elif self.type == "bert_score": from torchmetrics.text.bert import BERTScore diff --git a/adalflow/adalflow/eval/retriever_recall.py b/adalflow/adalflow/eval/retriever_recall.py index 9abe6d52..c433dc65 100644 --- a/adalflow/adalflow/eval/retriever_recall.py +++ b/adalflow/adalflow/eval/retriever_recall.py @@ -1,16 +1,23 @@ """Retriever Recall @k metric.""" -from typing import List, Union +from typing import List, Dict from adalflow.eval.base import BaseEvaluator, EvaluationResult +from adalflow.eval.utils import normalize_answer -class RetrieverRecall(BaseEvaluator): - __doc__ = r"""Recall@k measures the ratio of the number of relevant context strings in the top-k retrieved context to the total number of ground truth relevant context strings. +class RetrieverEvaluator(BaseEvaluator): + __doc__ = r"""Return Recall@k and Precision@k. + + Recall@k = Number of relevant retrieved documents/ Total number of relevant documents (len(gt_contexts)) + Precision@k = Number of relevant retrieved documents/ Total number of retrieved documents (len(retrieved_contexts)) + In our implementation, we use exact string matching between each gt context and the joined retrieved context string. You can use the longest common subsequence (LCS) or other similarity metrics(or embedding based) to decide if it is a match or not. + You can also pass ids of retrieved and the reference. + If you do not even have the ground truth context, but only grounth truth answers, you can consider using RAGAS framework for now. It computes the recall as: @@ -43,36 +50,55 @@ class RetrieverRecall(BaseEvaluator): def __init__(self): super().__init__() - def _compute_single_item( - self, retrieved_context: str, gt_context: Union[str, List[str]] - ) -> float: + def compute_single_item( + self, retrieved_context: List[str], gt_context: List[str] + ) -> Dict[str, float]: r""" Compute the recall of the retrieved context for a single query. Args: - retrieved_context (str): Retrieved context string. - gt_context (Union[str, List[str]]): Context string or list of context strings to compare against. + retrieved_context (List[str]): List of retrieved context strings. + gt_context (List[str]): List of ground truth context strings. Returns: float: Recall value. 
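+            (The value is returned as a dict with ``recall`` and ``precision`` keys.)
+
+            A worked sketch of the set-based matching on normalized strings (illustrative inputs):
+
+            >>> RetrieverEvaluator().compute_single_item(
+            ...     ["The Eiffel Tower", "The Louvre"], ["Eiffel Tower"]
+            ... )
+            {'recall': 1.0, 'precision': 0.5}
+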
""" - if isinstance(gt_context, str): - gt_context = [gt_context] - recalled = 0 - for gt_context_sentence in gt_context: - if gt_context_sentence in retrieved_context: - recalled += 1 - return recalled / len(gt_context) + # 1 normalize the text + normalized_retrieved_context = [ + normalize_answer(doc) for doc in retrieved_context + ] + + normalized_gt_context = [normalize_answer(doc) for doc in gt_context] + + set_retrieved = set(normalized_retrieved_context) + set_gt = set(normalized_gt_context) + + # 2 calculate the recall with intersection + + recall = len(set_gt.intersection(set_retrieved)) / len(set_gt) + precision = len(set_gt.intersection(set_retrieved)) / len(set_retrieved) + + return {"recall": recall, "precision": precision} + + # if isinstance(gt_context, str): + # gt_context = [gt_context] + # recalled = 0 + # for gt_context_sentence in gt_context: + # normalized_gt_context = normalize_answer(gt_context_sentence) + # normalized_retrieved_context = normalize_answer(retrieved_context) + # if normalized_gt_context in normalized_retrieved_context: + # recalled += 1 + # return recalled / len(gt_context) def compute( self, - retrieved_contexts: Union[List[str], List[List[str]]], + retrieved_contexts: List[List[str]], gt_contexts: List[List[str]], ) -> EvaluationResult: r""" Compute the recall of the retrieved context for a list of queries. Args: - retrieved_contexts (Union[List[str], List[List[str]]): List of retrieved context strings. Using List[str] we assume you have joined all the context sentences into one string. + retrieved_context: List of retrieved context strings. gt_contexts ( List[List[str]]): List of ground truth context strings. Returns: @@ -84,15 +110,53 @@ def compute( raise ValueError( "The number of retrieved context lists and ground truth context lists should be the same." 
) - k = len(retrieved_contexts) - recall_list = [] + k = len(retrieved_contexts[0]) + metric_list = [] for retrieved_context, gt_context in zip(retrieved_contexts, gt_contexts): - if isinstance(retrieved_context, list): - retrieved_context = " ".join(retrieved_context) - recall = self._compute_single_item(retrieved_context, gt_context) - recall_list.append(recall) - - avg_score = sum(recall_list) / len(recall_list) - return EvaluationResult( - avg_score, recall_list, additional_info={"type": f"RetrieverRecall@{k}"} + # if isinstance(retrieved_context, list): + # retrieved_context = " ".join(retrieved_context) + metric = self.compute_single_item(retrieved_context, gt_context) + metric_list.append(metric) + + # average through each key value + + avg_recall = sum([metric["recall"] for metric in metric_list]) / len( + metric_list ) + avg_precision = sum([metric["precision"] for metric in metric_list]) / len( + metric_list + ) + + return { + "avg_recall": avg_recall, + "avg_precision": avg_precision, + "recall_list": [metric["recall"] for metric in metric_list], + "precision_list": [metric["precision"] for metric in metric_list], + "top_k": k, + } + + # return EvaluationResult( + # avg_score, recall_list, additional_info={"type": f"RetrieverRecall@{k}"} + # ) + + +if __name__ == "__main__": + from adalflow.datasets import HotPotQA, HotPotQAData + + train_dataset = HotPotQA(split="train", size=10) + data: HotPotQAData = train_dataset[0] + gold_titles = data.gold_titles + context_titles = data.context["title"] + print(f"gold_titles: {gold_titles}, context_titles: {context_titles}") + print(f"train: {len(train_dataset)}, example: {train_dataset[0]}") + + # compute the recall and precision for 10 items + retriever_eval = RetrieverEvaluator() + + gt_contexts = [list(data.gold_titles) for data in train_dataset[:10]] + + retrieved_contexts = [list(data.context["title"]) for data in train_dataset[:10]] + + result = retriever_eval.compute(retrieved_contexts, gt_contexts) + + print(f"result: {result}") diff --git a/adalflow/adalflow/eval/utils.py b/adalflow/adalflow/eval/utils.py new file mode 100644 index 00000000..babf5b78 --- /dev/null +++ b/adalflow/adalflow/eval/utils.py @@ -0,0 +1,48 @@ +# from hotpotqa github +import re + +import string +from collections import Counter + + +def normalize_answer(s): + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(y: str, y_gt: str) -> float: + if not isinstance(y, str) or not isinstance(y_gt, str): + raise ValueError(f"y: {y},{type(y)}, y_gt: {y_gt},{type(y_gt)} must be string.") + prediction_tokens = normalize_answer(y).split() + ground_truth_tokens = normalize_answer(y_gt).split() + + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + + if len(prediction_tokens) == len(ground_truth_tokens) == 0: + # Unlike most tasks, QReCC and SQuAD-2.0 assign 1.0 in this edge case. We don't for uniformity. 
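+        # With both token lists empty, num_same below stays 0 and the function returns 0.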
+ print( + "\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n" + ) + + if num_same == 0: + return 0 + + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + + return f1 diff --git a/adalflow/adalflow/optim/few_shot/bootstrap_optimizer.py b/adalflow/adalflow/optim/few_shot/bootstrap_optimizer.py index a78c1ec6..9c509598 100644 --- a/adalflow/adalflow/optim/few_shot/bootstrap_optimizer.py +++ b/adalflow/adalflow/optim/few_shot/bootstrap_optimizer.py @@ -14,6 +14,7 @@ from adalflow.core.functional import random_sample from adalflow.optim.optimizer import DemoOptimizer from adalflow.optim.types import ParameterType +from adalflow.utils import printc log = logging.getLogger(__name__) @@ -219,7 +220,10 @@ def samples_to_str( yaml_str = sample.to_yaml(exclude=exclude_fields) else: - yaml_str = sample.to_yaml(exclude=["id", "score"]) + yaml_str = sample.to_yaml( + include=sample.get_input_fields() + sample.get_output_fields() + ) + printc(f"yaml_str: {yaml_str}") sample_strs.append(yaml_str + "\n") except Exception as e: print(f"Error: {e} to yaml for {sample}") diff --git a/adalflow/adalflow/optim/grad_component.py b/adalflow/adalflow/optim/grad_component.py index b73e536e..92d3a56c 100644 --- a/adalflow/adalflow/optim/grad_component.py +++ b/adalflow/adalflow/optim/grad_component.py @@ -1,19 +1,40 @@ """Base class for Autograd Components that can be called and backpropagated through.""" -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Callable, Optional, Dict from collections import OrderedDict +import uuid import logging +from copy import deepcopy if TYPE_CHECKING: from adalflow.core.generator import BackwardEngine - from adalflow.optim.parameter import Parameter + from adalflow.core import ModelClient +from adalflow.optim.parameter import ( + Parameter, + OutputParameter, + Gradient, + GradientContext, +) from adalflow.optim.types import ParameterType +from adalflow.core.types import GeneratorOutput +from adalflow.utils import printc + +import json from adalflow.core.component import Component from adalflow.optim.function import BackwardContext +from adalflow.utils.registry import EntityMapping +from adalflow.core.prompt_builder import Prompt +from adalflow.optim.text_grad.backend_engine_prompt import ( + LOSS_CONVERSATION_TEMPLATE_STRING, + LOSS_CONVERSATION_START_INSTRUCTION_STRING_FN, + OBJECTIVE_INSTRUCTION_BASE, + OBJECTIVE_INSTRUCTION_CHAIN, +) + -__all__ = ["GradComponent"] +__all__ = ["GradComponent", "FunGradComponent", "fun_to_grad_component"] log = logging.getLogger(__name__) @@ -28,22 +49,43 @@ class GradComponent(Component): The __call__ method will check if the component is in training mode, and call the `forward` method to return a `Parameter` object if it is in training mode, otherwise, it will call the `call` method to return the output such as "GeneratorOutput", "RetrieverOutput", etc. + + Note: Avoid using the attributes and methods that are defined here and in the `Component` class unless you are overriding them. 
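+
+    A minimal subclass sketch (illustrative; ``AddOne`` is a made-up component):
+
+    .. code-block:: python
+
+        class AddOne(GradComponent):
+            def call(self, x):
+                return x + 1
+
+        add_one = AddOne()
+        add_one(2)       # eval mode: runs `call` directly and returns 3
+        add_one.train()
+        add_one(2)       # training mode: `forward` wraps the result in an OutputParameter
+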
""" backward_engine: "BackwardEngine" _component_type = "grad" + id = None + _component_desc = "GradComponent" def __init__(self, *args, **kwargs): super().__init__() super().__setattr__("backward_engine", None) - - def __call__(self, *args, **kwargs): - if self.training: - return self.forward(*args, **kwargs) + super().__setattr__("id", str(uuid.uuid4())) + + # def set_backward_engine(self, backward_engine: "BackwardEngine", *args, **kwargs): + # raise NotImplementedError("set_backward_engine method is not implemented") + def set_backward_engine( + self, + backward_engine: "BackwardEngine" = None, + model_client: "ModelClient" = None, + model_kwargs: Dict[str, object] = None, + ): + from adalflow.core.generator import BackwardEngine + + self.backward_engine = backward_engine + if not backward_engine: + log.info( + "EvalFnToTextLoss: No backward engine provided. Creating one using model_client and model_kwargs." + ) + self.backward_engine = BackwardEngine(model_client, model_kwargs) else: - return self.call(*args, **kwargs) + if type(backward_engine) is not BackwardEngine: + raise TypeError( + f"EvalFnToTextLoss: backward_engine must be an instance of BackwardEngine. Got {type(backward_engine)}." + ) - def set_backward_engine(self, backward_engine: "BackwardEngine", *args, **kwargs): - raise NotImplementedError("set_backward_engine method is not implemented") + def disable_backward_engine(self): + self.backward_engine = None def call(self, *args, **kwargs): raise NotImplementedError("call method is not implemented") @@ -59,17 +101,12 @@ def forward(self, *args, **kwargs) -> "Parameter": 3. Return the parameter object. """ - from adalflow.optim.parameter import Parameter + from adalflow.optim.parameter import Parameter, OutputParameter log.debug( f"Forwarding through {self.name} with args: {args} and kwargs: {kwargs}" ) - # if "id" not in kwargs: - # raise ValueError( - # "id must be provided in the kwargs of a GradComponent for tracing." - # ) - # 1. get all predecessors from all args and kwargs input_args = OrderedDict() @@ -77,6 +114,9 @@ def forward(self, *args, **kwargs) -> "Parameter": for idx, arg in enumerate(args): input_args[f"arg_{idx}"] = arg + # Get data id from the kwargs + data_id = kwargs.get("id", None) + # Add keyword args to the ordered dict, preserving order predecessors = [] for v in input_args.values(): @@ -84,11 +124,15 @@ def forward(self, *args, **kwargs) -> "Parameter": predecessors.append(v) if v.param_type == ParameterType.INPUT: v.data_id = kwargs.get("id", None) + if data_id is None: + data_id = v.data_id for v in kwargs.values(): if isinstance(v, Parameter): predecessors.append(v) if v.param_type == ParameterType.INPUT: v.data_id = kwargs.get("id", None) + if data_id is None: + data_id = v.data_id # 2. unwrap the parameter object to take only the data, successor_map_fn: lambda x: x.data in default # unwrap args @@ -115,21 +159,39 @@ def forward(self, *args, **kwargs) -> "Parameter": call_response = self.call(*unwrapped_args, **unwrapped_kwargs) + if isinstance(call_response, Parameter): + raise ValueError( + f"A GradComponent call should not return Parameter, got {call_response.name}" + ) + predecessors.append(call_response) + return call_response + # 4. 
Create a Parameter object to trace the forward pass - input_args.update(kwargs) - response = Parameter( + # input_args.update(kwargs) + # use unwrapped args and unwrapped kwargs to trace the forward pass + tracing_args = {i: v for i, v in enumerate(unwrapped_args)} + tracing_args.update(**unwrapped_kwargs) + + response = OutputParameter( data=call_response, name=self.name + "_output", role_desc=self.name + " response", param_type=ParameterType.OUTPUT, + data_id=data_id, ) response.set_predecessors(predecessors) - response.trace_forward_pass(input_args=input_args, full_response=call_response) + response.trace_forward_pass( + input_args=tracing_args, + full_response=call_response, + id=self.id, # this is component id + name=self.name, + ) response.set_grad_fn( BackwardContext( backward_fn=self.backward, response=response, - id=kwargs.get("id", None), + id=data_id, + input_kwargs=kwargs, ) ) return response @@ -141,21 +203,392 @@ def backward(self, *, response: "Parameter", id: str = None, **kwargs): Subclass should implement this method if you need additional backward logic. """ + log.info(f"GradComponent backward: {response.name}") children_params = response.predecessors if response.get_gradient_and_context_text().strip() == "": log.info(f"Generator: Backward: No gradient found for {response}.") - for pred in children_params: - pred.set_score(response._score) - from adalflow.utils.logger import printc + # backward the backward engine disable signal + if response.backward_engine_disabled: + for pred in children_params: + pred.backward_engine_disabled = True + + for _, pred in enumerate(children_params): + if response.score is not None: + pred.set_score(response.score) - printc( - f"Retriever: Backward: {pred.name} set_score: {response._score}, {response.name}", - "blue", - ) if pred.param_type == ParameterType.DEMOS: pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode + trace_id=id, score=response.score, is_teacher=self.teacher_mode + ) + + # pass the current gradient to pred + + # TODO: each gradcomponent will have its own context, but + # passing the successor's gradient.data to the current. + + for grad in response.gradients: + # NOTE: make a copy of the gradient, we should not modify the original gradient + grad = deepcopy(grad) + # update the gradient context and from and to + # grad.update_from_to(response, pred) + grad.is_default_copy = ( + True # response and pred will keep the original gradient + ) + # NOTE: test of keep the initial gradient context + # grad.add_context( + # GradientContext( + # variable_desc=pred.role_desc, + # response_desc=response.name, + # input_output=f"""{response.component_trace.to_context_str()}""", + # ) + # ) + + pred.add_gradient(grad) + + +class FunGradComponent(GradComponent): + r"""Wraps a function as a GradComponent. + + Args: + fun (Callable): The function to be wrapped. 
+ + Examples: + + function = lambda x: x + 1 + fun_component = FunComponent(function) + print(fun_component(1)) # 2 + """ + + def __init__(self, fun: Optional[Callable] = None, afun: Optional[Callable] = None): + super().__init__() + self.fun_name = fun.__name__ + EntityMapping.register(self.fun_name, fun) + + def call(self, *args, **kwargs): + fun = EntityMapping.get(self.fun_name) + return fun(*args, **kwargs) + + def _extra_repr(self) -> str: + return super()._extra_repr() + f"fun_name={self.fun_name}" + + +def fun_to_grad_component(fun) -> FunGradComponent: + r"""Helper function to convert a function into a Component with + its own class name. + + Can be used as both a decorator and a function. + + Args: + fun (Callable): The function to be wrapped. + Returns: + FunComponent: The component that wraps the function. + + Examples: + 1. As a decorator: + >>> @fun_to_component + >>> def my_function(x): + >>> return x + 1 + >>> # is equivalent to + >>> class MyFunctionComponent(FunComponent): + >>> def __init__(self): + >>> super().__init__(my_function) + + 2. As a function: + >>> my_function_component = fun_to_component(my_function) + """ + + # Split the function name by underscores, capitalize each part, and join them back together + class_name = ( + "".join(part.capitalize() for part in fun.__name__.split("_")) + "GradComponent" + ) + # register the function + EntityMapping.register(fun.__name__, fun) + # Define a new component class dynamically + component_class = type( + class_name, + (FunGradComponent,), + {"__init__": lambda self: FunGradComponent.__init__(self, fun)}, + ) + # register the component + EntityMapping.register(class_name, component_class) + + return component_class() + + +class GradComponent2(GradComponent): + "Graduable functional component" + + def __init__( + self, + desc: str, + name: Optional[str] = None, + backward_engine: Optional["BackwardEngine"] = None, + model_client: "ModelClient" = None, + model_kwargs: Dict[str, object] = None, + ): + super().__init__() + self.desc = desc + self.backward_engine = backward_engine + self.model_client = model_client + self.name = name or f"{self.__class__.__name__}" + + self.backward_engine = None + if backward_engine is None: + log.info( + "EvalFnToTextLoss: No backward engine provided. Creating one using model_client and model_kwargs." + ) + if model_client and model_kwargs: + + self.set_backward_engine(backward_engine, model_client, model_kwargs) + else: + if not isinstance(backward_engine, BackwardEngine): + raise TypeError( + "EvalFnToTextLoss: backward_engine must be an instance of BackwardEngine." ) + self.backward_engine = backward_engine + + # def set_backward_engine( + # self, + # backward_engine: "BackwardEngine" = None, + # model_client: "ModelClient" = None, + # model_kwargs: Dict[str, object] = None, + # ): + # from adalflow.core.generator import BackwardEngine + + # self.backward_engine = backward_engine + # if not backward_engine: + # log.info( + # "EvalFnToTextLoss: No backward engine provided. Creating one using model_client and model_kwargs." + # ) + # self.backward_engine = BackwardEngine(model_client, model_kwargs) + # else: + # if type(backward_engine) is not BackwardEngine: + # raise TypeError( + # f"EvalFnToTextLoss: backward_engine must be an instance of BackwardEngine. Got {type(backward_engine)}." 
+ # ) + + @staticmethod + def _backward_through_one_predecessor( + pred: Parameter, + kwargs: Dict[str, Parameter], + response: Parameter, + desc: str, + backward_engine: "BackwardEngine", + ground_truth: object = None, + is_intermediate_node: bool = False, # if the node is an intermediate node in the backpropagation chain + metadata: Dict[str, str] = None, + ): + if not pred.requires_opt: + if response.score is not None: + pred.set_score(response.score) + log.debug( + f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + ) + return + log.debug( + f"EvalFnToTextLoss: Backward through {pred}, is_intermediate_node: {is_intermediate_node}" + ) + + if pred.check_if_already_computed_gradient_respect_to(response.id): + log.info( + f"EvalFnToTextLoss: Gradient already computed for {pred.role_desc} with respect to {response.role_desc}" + ) + + return + + if backward_engine is None: + log.error( + "EvalFnToTextLoss: backward_engine is required for text prompt optimization." + ) + raise ValueError( + "EvalFnToTextLoss: backward_engine is required for text prompt optimization." + ) + + instruction_str, objective_str = None, None + + # convert kwargs to key, (value, type(eval_input)) + + inputs = {} + for k, v in kwargs.items(): + inputs[k] = (v.get_param_info(), str(type(v.eval_input))) + + # response information + conversation_str = Prompt( + LOSS_CONVERSATION_TEMPLATE_STRING, + prompt_kwargs={ + "inputs": inputs, + "eval_fn_desc": desc, + "response_value": response.get_prompt_data(), + "metadata": json.dumps(metadata) if metadata else None, + }, + )() + + conv_ins_template = LOSS_CONVERSATION_START_INSTRUCTION_STRING_FN + obj_ins_template = OBJECTIVE_INSTRUCTION_BASE + + if is_intermediate_node: + # conv_ins_template = CONVERSATION_START_INSTRUCTION_STRING_FN_CHAIN + obj_ins_template = OBJECTIVE_INSTRUCTION_CHAIN + + instruction_str = Prompt( + conv_ins_template, + prompt_kwargs={ + "variable": pred.get_param_info(), + "conversation_str": conversation_str, + }, + )() + objective_str = Prompt( + obj_ins_template, + prompt_kwargs={ + "response_name": response.name, + "response_desc": response.role_desc, + "response_gradient": response.data, + }, + )() + + log.info(f"EvalFnToTextLoss: Instruction: {instruction_str}") + log.info(f"EvalFnToTextLoss: Objective: {objective_str}") + log.info(f"EvalFnToTextLoss: Conversation: {conversation_str}") + + # Compute the gradient + backward_engine_prompt_kwargs = { + "conversation_sec": instruction_str, + "objective_instruction_sec": objective_str, + # "evaluate_variable_instruction_sec": eval_str, + } + gradient_value: GeneratorOutput = backward_engine( + prompt_kwargs=backward_engine_prompt_kwargs + ) + gradient_prompt = backward_engine.get_prompt(**backward_engine_prompt_kwargs) + # print(f"Backward engine prompt: {gradient_prompt}") + gradient_value_data = ( + gradient_value.data + or backward_engine.failure_message_to_optimizer( + gradient_response=gradient_value + ) + ) + + gradient_value_data = ( + f"expected answer: {ground_truth},\n Feedback: {gradient_value_data}" + ) + # print(f"gradient_value_data: {gradient_value_data}") + + log.debug(f"EvalFnToTextLoss: Gradient for {pred}: {gradient_value_data}") + + # score should be passed to grad + gradient_param = Gradient( + data=gradient_value_data, + data_id=response.data_id, + score=response.score, + from_response=response, + to_pred=pred, + ) + gradient_param.add_prompt(gradient_prompt) + gradient_param.add_context( + GradientContext( + input_output=conversation_str, + 
response_desc=response.role_desc, + variable_desc=pred.role_desc, + # ground_truth=ground_truth, + ) + ) + pred.add_gradient(gradient_param) + + # backward the end to end score + # TODO: not really useful + if response.score is not None: + pred.set_score(response.score) + pred.set_gt(ground_truth) + + # TODO: reduce meta + + def backward(self, *, response: "OutputParameter", id: str = None, **kwargs): + """Backward pass of the function. In default, it will pass all the scores to the predecessors. + + Note: backward is mainly used internally and better to only allow kwargs as the input. + + Subclass should implement this method if you need additional backward logic. + """ + + log.info(f"GradComponent backward: {response.name}") + children_params = response.predecessors + + input_kwargs = kwargs.get("input_kwargs", {}) + + is_intermediate_node = False + response_gradient_context = response.get_gradient_and_context_text().strip() + if response_gradient_context != "": + log.info("EvalFnToTextLoss is an intermediate node.") + is_intermediate_node = True + + if response.get_gradient_and_context_text().strip() == "": + log.info(f"Generator: Backward: No gradient found for {response}.") + + # backward the backward engine disable signal + if response.backward_engine_disabled: + for pred in children_params: + pred.backward_engine_disabled = True + + # use pass through gradient when there is one predecessor + if not self.backward_engine or len(children_params) < 2: + super().backward(response=response, id=id) + + else: + + for _, pred in enumerate(children_params): + if response.score is not None: + pred.set_score(response.score) + printc(f"score {response.score} for pred name: {pred.name}") + if not pred.requires_opt: + continue + + if pred.param_type == ParameterType.DEMOS: + pred.add_score_to_trace( + trace_id=id, score=response.score, is_teacher=self.teacher_mode + ) + + self._backward_through_one_predecessor( + pred=pred, + kwargs=input_kwargs, + response=response, + backward_engine=self.backward_engine, + desc=self.desc, + is_intermediate_node=is_intermediate_node, + ) + + +if __name__ == "__main__": + # Test FunGradComponent + from adalflow.optim.parameter import Parameter + + def my_function(x): + return x + 1 + + my_function_component = fun_to_grad_component(my_function) + print(my_function_component) # 2 + # eval mode + output = my_function_component(1) + print(output) + # training mode + my_function_component.train() + output = my_function_component(Parameter(data=1, name="input")) + print(output) + + # now test the decorator + @fun_to_grad_component + def my_function(x): + return x + 1 + + print(my_function(1)) + # eval mode + output = my_function(1) + print(output) + assert output == 2 + + # training mode + my_function.train() + output = my_function(Parameter(data=1, name="input")) + print(output) diff --git a/adalflow/adalflow/optim/loss_component.py b/adalflow/adalflow/optim/loss_component.py index e53ac609..bfe6d875 100644 --- a/adalflow/adalflow/optim/loss_component.py +++ b/adalflow/adalflow/optim/loss_component.py @@ -1,6 +1,7 @@ """Base class for Autograd Components that can be called and backpropagated through.""" from typing import TYPE_CHECKING +import uuid if TYPE_CHECKING: from adalflow.core.generator import BackwardEngine @@ -27,10 +28,12 @@ class LossComponent(Component): """ backward_engine: "BackwardEngine" _component_type = "loss" + id = None def __init__(self, *args, **kwargs): super().__init__() super().__setattr__("backward_engine", None) + super().__setattr__("id", 
str(uuid.uuid4())) def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) @@ -38,6 +41,9 @@ def __call__(self, *args, **kwargs): def set_backward_engine(self, backward_engine: "BackwardEngine", *args, **kwargs): raise NotImplementedError("set_backward_engine method is not implemented") + def disable_backward_engine(self): + self.backward_engine = None + def forward(self, *args, **kwargs) -> "Parameter": r"""Default just wraps the call method.""" raise NotImplementedError("forward method is not implemented") diff --git a/adalflow/adalflow/optim/parameter.py b/adalflow/adalflow/optim/parameter.py index 5b60995c..69142f13 100644 --- a/adalflow/adalflow/optim/parameter.py +++ b/adalflow/adalflow/optim/parameter.py @@ -13,14 +13,17 @@ Callable, TYPE_CHECKING, ) -from pyvis.network import Network from collections import defaultdict +from pyvis.network import Network import logging import os from dataclasses import dataclass, field import uuid from adalflow.optim.types import ParameterType from adalflow.core.base_data_class import DataClass +from adalflow.utils.logger import printc +import html + if TYPE_CHECKING: from adalflow.optim.text_grad.tgd_optimizer import TGDData, TGDOptimizerTrace @@ -31,23 +34,40 @@ @dataclass -class GradientContext: +class GradientContext(DataClass): + """GradientContext is used to describe the component's function and trace its input and output. + + To get the component's function desc, use GradientContext.to_yaml_signature() + To get the data: use instance.to_yaml() + """ + variable_desc: str = field( metadata={"desc": "The description of the target parameter"} ) - response_desc: str = field( - metadata={"desc": "The description of the response parameter"} - ) - context: str = field( + # from template LOSS_CONVERSATION_TEMPLATE_STRING + # LLM_CONVERSATION_TEMPLATE from backward_engine_prompt + input_output: str = field( metadata={ "desc": "The context of the gradient in form of a conversation indicating \ - the relation of the current parameter to the response parameter (gradient)" + the relation of the current parameter to the response parameter" } ) + response_desc: str = field( + metadata={"desc": "The description of the response parameter"} + ) + # input: Dict[str, Any] = field( + # metadata={"desc": "The input to the whole system"}, default=None + # ) + + # ground_truth: Any = field( + # metadata={"desc": "The ground truth of the response parameter"}, default=None + # ) @dataclass -class ComponentTrace: +class ComponentTrace(DataClass): + name: str = field(metadata={"desc": "The name of the component"}, default=None) + id: str = field(metadata={"desc": "The unique id of the component"}, default=None) input_args: Dict[str, Any] = field( metadata={"desc": "The input arguments of the GradComponent forward"}, default=None, @@ -55,6 +75,9 @@ class ComponentTrace: full_response: object = field( metadata={"desc": "The full response of the GradComponent output"}, default=None ) + raw_response: str = field( + metadata={"desc": "The raw response of the generator"}, default=None + ) api_kwargs: Dict[str, Any] = field( metadata={ "desc": "The api_kwargs for components like Generator and Retriever that pass to the model client" @@ -62,6 +85,10 @@ class ComponentTrace: default=None, ) + def to_context_str(self): + output = f""": {self.input_args}. 
: {self.full_response}""" + return output + # TODO: use this to better trace the score @dataclass @@ -75,28 +102,46 @@ class ScoreTrace: ) +@dataclass(frozen=True) +class ComponentNode(DataClass): + """Used to represent a node in the component graph.""" + + id: str = field(metadata={"desc": "The unique id of the component"}) + name: str = field(metadata={"desc": "The name of the component"}) + type: Literal["INPUT", "COMPONENT"] = field( + metadata={"desc": "The type of the node"}, default="COMPONENT" + ) + + COMBINED_GRADIENTS_TEMPLATE = r""" -{% if combined_gradients %} -Batch size: {{ combined_gradients|length }} +{% if component_schema %} + +Gradients are from {{ component_schema | length }} components. +{% for component_id, schema in component_schema.items() %} +id: {{ component_id }} +{{ schema }} +{% endfor %} + {% endif %} -{% for g in combined_gradients %} -{% set gradient = g[0] %} -{% set gradient_context = g[1] %} -{% if gradient_context %} -{{loop.index}}. -{{gradient_context.context}} +{% if combined_gradients %} +{% for group in combined_gradients %} + +{{ group.average_score|round(2) }} +{% for gradient in group.gradients %} +{{ loop.index }}. +INPUT_OUTPUT: {{ gradient.context }} +{% if gradient.score is not none %} +{{ gradient.score }} +{{ gradient.gradient }} {% endif %} +{% endfor %} + -{% if gradient.data %} - {% if gradient_context %} -{#The output is used as <{{gradient_context.response_desc}}>#} -{{gradient.data}} -{% else %} -{{gradient.data}} -{% endif %} + +{% endfor %} {% endif %} -{% endfor %}""" +""" class Parameter(Generic[T]): @@ -127,6 +172,14 @@ class Parameter(Generic[T]): 1. https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py """ + allowed_types = { + ParameterType.NONE, + ParameterType.PROMPT, + ParameterType.DEMOS, + ParameterType.HYPERPARAM, + ParameterType.INPUT, + } + id: str = None # Unique id of the parameter name: str = None # Name of the parameter, easier to read for humans role_desc: str = "" # Description of the role of the parameter @@ -139,41 +192,38 @@ class Parameter(Generic[T]): proposing: bool = False # State of the parameter predecessors: Set["Parameter"] = set() # Predecessors of the parameter peers: Set["Parameter"] = set() # Peers of the parameter - # TODO: input_args should be OrderedDict to keep the order of args - input_args: Dict[str, Any] = None # Input arguments of the GradComponent forward - full_response: object = None # Full response of the GradComponent output eval_input: object = None # Eval input passing to the eval_fn or evaluator you use successor_map_fn: Dict[str, Callable] = ( None # Map function to get the data from the output ) - from_response_id: str = ( - None # for parameterType GRADIENT, the id of the response parameter - ) + backward_engine_disabled: bool = ( False # Disable the backward engine for the parameter ) - component_trace: ComponentTrace = None # Trace of the component tgd_optimizer_trace: "TGDOptimizerTrace" = None # Trace of the TGD optimizer + data_in_prompt: Callable = ( + None # Callable to get the str of the data to be used in the prompt + ) + gt: object = None # Ground truth of the parameter + def __init__( self, *, - id: Optional[str] = None, + id: Optional[str] = None, # unique id of the parameter data: T = None, # for generator output, the data will be set up as raw_response data_id: str = None, # for tracing the data item in the training/val/test set requires_opt: bool = True, role_desc: str = "", param_type: ParameterType = ParameterType.NONE, name: str = None, # name is 
used to refer to the parameter in the prompt, easier to read for humans - gradient_prompt: str = None, - raw_response: str = None, # use this to track the raw response of generator instead of the data (can be parsed) instruction_to_optimizer: str = None, instruction_to_backward_engine: str = None, score: Optional[float] = None, eval_input: object = None, - from_response_id: Optional[str] = None, successor_map_fn: Optional[Dict[str, Callable]] = None, + data_in_prompt: Callable = None, ): self.id = id or str(uuid.uuid4()) self.data_id = data_id @@ -188,24 +238,24 @@ def __init__( else f"param_{self.id}" ) self.param_type = param_type + # allow subclasses to override allowed_types dynamically + allowed_types = getattr(self.__class__, "allowed_types", set()) + if param_type not in allowed_types: + raise ValueError( + f"{param_type.name} is not allowed for {self.__class__.__name__}" + ) + self.data = data # often string and will be used in the prompts self.requires_opt = requires_opt self.data_type = type(data) self.set_eval_fn_input(eval_input=data) - self.gradients: List[Parameter] = [] # gradient.data - self.gradient_prompt: str = ( - gradient_prompt # the whole llm prompt to compute the gradient - ) - self.gradients_context: Dict[Parameter, GradientContext] = defaultdict( - lambda: None - ) # input and output from an operator, each operator should have a template - # ... + self.gradients: Set[Gradient] = set() + self.grad_fn = None self.previous_data = None # used to store the previous data # context of the forward pass - self.raw_response = raw_response self.instruction_to_optimizer: str = instruction_to_optimizer self.instruction_to_backward_engine: str = instruction_to_backward_engine @@ -214,7 +264,7 @@ def __init__( self._traces: Dict[str, DataClass] = {} # id to data items (DynamicDataClass) self._student_traces: Dict[str, DataClass] = {} # id - self._score: float = ( + self.score: float = ( score # end to end evaluation score, TODO: might have multiple scores if using multiple eval fns # score is set in the gradients in the backward pass ) @@ -224,9 +274,15 @@ def __init__( self._previous_demos: List[DataClass] = [] self.eval_input = eval_input - self.from_response_id = from_response_id # for gradient parameter self.successor_map_fn = successor_map_fn or {} - self.component_trace = ComponentTrace() + + def default_prompt_map_fn(param: Parameter): + # if isinstance(param.data, GeneratorOutput): + # return param.data.raw_response + return param.data + + self.data_in_prompt = data_in_prompt or default_prompt_map_fn + self.gt = None def map_to_successor(self, successor: object) -> T: """Apply the map function to the successor based on the successor's id.""" @@ -245,14 +301,262 @@ def check_if_already_computed_gradient_respect_to(self, response_id: str) -> boo from_response_ids = [g.from_response_id for g in self.gradients] return response_id in from_response_ids - def add_gradient(self, gradient: "Parameter"): - if gradient.param_type != ParameterType.GRADIENT: - raise ValueError("Cannot add non-gradient parameter to gradients list.") + ############################################################################################################ + # Handle gt + ############################################################################################################ + def set_gt(self, gt: object): + + self.gt = gt + + def get_gt(self) -> object: + return self.gt + + # ############################################################################################################ + # Handle 
gradients and context + # ############################################################################################################ + def add_gradient(self, gradient: "Gradient"): + # if gradient.param_type != ParameterType.GRADIENT: + # raise ValueError("Cannot add non-gradient parameter to gradients list.") if gradient.from_response_id is None: raise ValueError("Gradient must have a from_response_id.") - self.gradients.append(gradient) + start_order = len(self.gradients) + gradient.order = start_order + + self.gradients.add(gradient) + # sort the gradients by the data_id, response_component_id, and score + self.sort_gradients() + + def reset_gradients(self): + self.gradients = set() + + def get_gradients_names(self) -> str: + names = [g.name for g in self.gradients] + names = ", ".join(names) + return names + + def get_prompt_data(self) -> str: + return self.data_in_prompt(self) + + def get_gradients_str(self) -> str: + if not self.gradients: + return "" + + gradients_str = "" + for i, g in enumerate(self.gradients): + gradients_str += f"{i}. {g.data}\n" + + return gradients_str + + def get_gradient_and_context_text(self, skip_correct_sample: bool = False) -> str: + """Aggregates and returns: + 1. the gradients + 2. the context text for which the gradients are computed + + Sort the gradients from the lowest score to the highest score. + Highlight the gradients with the lowest score to the optimizer. + """ + from adalflow.core.prompt_builder import Prompt + + if not self.gradients: + return "" + + # sore gradients by the score from low to high + # self.gradients = sorted( + # self.gradients, key=lambda x: x.score if x.score is not None else 1 + # ) + # print the score for the sorted gradients + lowest_score_gradients = [] + for i, g in enumerate(self.gradients): + if skip_correct_sample: + if g.score > 0.5: + continue + lowest_score_gradients.append(g) + + gradient_context_combined_str = "" + if lowest_score_gradients and len(lowest_score_gradients) > 0: + + # parse the gradients and context. 
+ # gradients_and_context: List[Dict[str, Any]] = ( + # [] + # ) # {gradient: data, context: GradientContext.input_output} + # for g in lowest_score_gradients: + # gradients_and_context.append( + # { + # "data_id": g.data_id, + # "gradient": g.data, + # "context": g.context.input_output, + # "score": g.score, + # } + # ) + + # group gradients by data_id and calculate average scores + grouped_gradients = defaultdict( + lambda: {"gradients": [], "score_sum": 0, "count": 0} + ) + for g in lowest_score_gradients: + group = grouped_gradients[g.data_id] + group["gradients"].append( + { + "gradient": g.data, + "context": g.context.input_output, + "score": g.score, + } + ) + group["score_sum"] += g.score if g.score is not None else 0 + group["count"] += 1 + + # Calculate average scores and sort groups + grouped_list = [] + for data_id, group in grouped_gradients.items(): + average_score = ( + group["score_sum"] / group["count"] if group["count"] > 0 else 0 + ) + grouped_list.append( + { + "data_id": data_id, + "average_score": average_score, + "gradients": group["gradients"], + } + ) + sorted_groups = sorted(grouped_list, key=lambda x: x["average_score"]) + + gradient_context_combined_str = Prompt( + template=COMBINED_GRADIENTS_TEMPLATE, + prompt_kwargs={"combined_gradients": sorted_groups}, + )().strip() + + # get component id: gradient + component_id_to_gradient: Dict[str, Gradient] = {} + for g in lowest_score_gradients: + component_id_to_gradient[g.from_response_component_id] = g + + componend_id_to_schema: Dict[str, str] = {} + for id, g in component_id_to_gradient.items(): + componend_id_to_schema[id] = g.context.to_yaml(exclude={"input_output"}) + + # if there are multiple successors, there will be multiple component schemas + + return gradient_context_combined_str + + def get_gradients_component_schema(self, skip_correct_sample: bool = False) -> str: + """Aggregates and returns: + 1. the gradients + 2. the context text for which the gradients are computed + + Sort the gradients from the lowest score to the highest score. + Highlight the gradients with the lowest score to the optimizer. 
+ """ + from adalflow.core.prompt_builder import Prompt + + # print( + # f"len of gradients: {len(self.gradients)}, scores: {[g._score for g in self.gradients]} for {self.name}" + # ) + + if not self.gradients: + return "" + + # sore gradients by the _score from low to high + # self.gradients = sorted( + # self.gradients, key=lambda x: x.score if x.score is not None else 1 + # ) + # print the score for the sorted gradients + lowest_score_gradients = [] + for i, g in enumerate(self.gradients): + if skip_correct_sample: + if g.score > 0.5: + continue + lowest_score_gradients.append(g) + + # Group gradients by `data_id` and calculate average scores + grouped_gradients = defaultdict( + lambda: {"gradients": [], "score_sum": 0, "count": 0} + ) + for g in lowest_score_gradients: + group = grouped_gradients[g.data_id] + group["gradients"].append( + { + "gradient": g.data, + "context": g.context.input_output, + "score": g.score, + } + ) + group["score_sum"] += g.score if g.score is not None else 0 + group["count"] += 1 + + # Calculate average scores and sort groups + grouped_list = [] + for data_id, group in grouped_gradients.items(): + average_score = ( + group["score_sum"] / group["count"] if group["count"] > 0 else 0 + ) + grouped_list.append( + { + "data_id": data_id, + "average_score": average_score, + "gradients": group["gradients"], + } + ) + sorted_groups = sorted(grouped_list, key=lambda x: x["average_score"]) + + # get component id: gradient + component_id_to_gradient: Dict[str, Gradient] = {} + for g in lowest_score_gradients: + component_id_to_gradient[g.from_response_component_id] = g + + componend_id_to_schema: Dict[str, str] = {} + for id, g in component_id_to_gradient.items(): + componend_id_to_schema[id] = g.context.to_yaml(exclude=["input_output"]) + + # parse the gradients and context. 
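+        # Flatten the selected (optionally low-score-only) gradients into per-sample records
+        # of data_id, feedback text, input/output context, and score. Note that the grouped,
+        # score-sorted view built above (sorted_groups) is what actually gets rendered into
+        # COMBINED_GRADIENTS_TEMPLATE below, together with the per-component schemas.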
+ gradients_and_context: List[Dict[str, Any]] = ( + [] + ) # {gradient: data, context: GradientContext.input_output} + for g in lowest_score_gradients: + gradients_and_context.append( + { + "data_id": g.data_id, + "gradient": g.data, + "context": g.context.input_output, + "score": g.score, + } + ) + + gradient_context_combined_str = Prompt( + template=COMBINED_GRADIENTS_TEMPLATE, + prompt_kwargs={ + "combined_gradients": sorted_groups, + "component_schema": componend_id_to_schema, + }, + )().strip() + + # if there are multiple successors, there will be multiple component schemas + + return gradient_context_combined_str + + def merge_gradients_for_cycle_components(self): + """Merge data_id, from_response_component_id into the same gradient""" + + def sort_gradients(self): + """With rules mentioned in Graient class, we will track the gradients by data_id, then response_component_id, then score""" + + self.gradients = sorted( + self.gradients, + key=lambda x: ( + x.data_id, + x.from_response_component_id, + -x.order if x.order is not None else 0, + x.from_response_id, + x.score, + ), + ) + # make it a set again + self.gradients = set(self.gradients) + + ############################################################################################################ + # Setters and getters + ############################################################################################################ def set_predecessors(self, predecessors: List["Parameter"] = None): if predecessors is None: @@ -269,11 +573,14 @@ def set_grad_fn(self, grad_fn): self.grad_fn = grad_fn def get_param_info(self): + """Used to represent the parameter in the prompt.""" return { "name": self.name, "role_desc": self.role_desc, - "data": self.data, + "prompt_data": self.data_in_prompt(self), # default to use all data "param_type": self.param_type, + "requires_opt": self.requires_opt, + "eval_input": self.eval_input, # for output passing to the eval_fn } def set_peers(self, peers: List["Parameter"] = None): @@ -291,27 +598,13 @@ def set_peers(self, peers: List["Parameter"] = None): # Trace the tgd optimizer data ############################################################################################################ def trace_optimizer(self, api_kwargs: Dict[str, Any], response: "TGDData"): + r"""Trace the inputs and output of a TGD optimizer.""" from adalflow.optim.text_grad.tgd_optimizer import TGDOptimizerTrace self.tgd_optimizer_trace = TGDOptimizerTrace( api_kwargs=api_kwargs, output=response ) - ############################################################################################################ - # Trace component, include trace_forward_pass & trace_api_kwargs for now - ############################################################################################################ - def trace_forward_pass(self, input_args: Dict[str, Any], full_response: object): - r"""Trace the forward pass of the parameter.""" - self.input_args = input_args - self.full_response = full_response - # TODO: remove the input_args and full_response to use component_trace - self.component_trace.input_args = input_args - self.component_trace.full_response = full_response - - def trace_api_kwargs(self, api_kwargs: Dict[str, Any]): - r"""Trace the api_kwargs for components like Generator and Retriever that pass to the model client.""" - self.component_trace.api_kwargs = api_kwargs - def set_eval_fn_input(self, eval_input: object): r"""Set the input for the eval_fn.""" self.eval_input = eval_input @@ -326,7 +619,12 @@ def set_score(self, 
score: float): But this score is only used to relay the score to the demo parametr. """ - self._score = score + score = float(score) + if not isinstance(score, float): + raise ValueError( + f"score is not float, but {type(score)}, parameter name: {self.name}" + ) + self.score = score def add_dataclass_to_trace(self, trace: DataClass, is_teacher: bool = True): r"""Called by the generator.forward to add a trace to the parameter. @@ -381,10 +679,6 @@ def revert_data(self, include_demos: bool = False): self.previous_data = None self.proposing = False - # reset the gradients and context - # self.reset_gradients() - # self.reset_gradients_context() - # cant reset gradients yet for the loss if include_demos: self._demos = self._previous_demos @@ -398,9 +692,6 @@ def step_data(self, include_demos: bool = False): self.previous_data = None self.proposing = False - # reset the gradients and context - # self.reset_gradients() - # self.reset_gradients_context() if include_demos: self._previous_demos = [] @@ -421,61 +712,6 @@ def update_value(self, data: T): self.data_type = type(data) self.data = data - def reset_gradients(self): - self.gradients = [] - - def reset_gradients_context(self): - self.gradients_context = defaultdict(lambda: None) - - def get_gradients_names(self) -> str: - names = [g.name for g in self.gradients] - names = ", ".join(names) - return names - - def get_gradient_and_context_text(self, skip_correct_sample: bool = False) -> str: - """Aggregates and returns: - 1. the gradients - 2. the context text for which the gradients are computed - - Sort the gradients from the lowest score to the highest score. - Highlight the gradients with the lowest score to the optimizer. - """ - from adalflow.core.prompt_builder import Prompt - - # print( - # f"len of gradients: {len(self.gradients)}, scores: {[g._score for g in self.gradients]} for {self.name}" - # ) - - # sore gradients by the _score from low to high - self.gradients = sorted( - self.gradients, key=lambda x: x._score if x._score is not None else 1 - ) - # print the score for the sorted gradients - lowest_score_gradients = [] - for i, g in enumerate(self.gradients): - if skip_correct_sample: - if g._score > 0.5: - continue - lowest_score_gradients.append(g) - print(f"{i} Score: {g._score} for {g.name}, {type(g._score)}") - - gradient_context_combined = list( - zip( - lowest_score_gradients, - [self.gradients_context[g] for g in lowest_score_gradients], - ) - ) - # set all gradients value to None - # for g in self.gradients: - # g.data = None - - gradient_context_combined_str = Prompt( - template=COMBINED_GRADIENTS_TEMPLATE, - prompt_kwargs={"combined_gradients": gradient_context_combined}, - )().strip() - - return gradient_context_combined_str - # TODO: dont use short value def get_short_value(self, n_words_offset: int = 10) -> str: """ @@ -485,7 +721,8 @@ def get_short_value(self, n_words_offset: int = 10) -> str: :type n_words_offset: int """ # 1. 
ensure the data is a string - data = self.data + # data = self.data + data = self.get_prompt_data() if not isinstance(self.data, str): data = str(self.data) words = data.split(" ") @@ -498,6 +735,12 @@ def get_short_value(self, n_words_offset: int = 10) -> str: ) return short_value + def reset_all_gradients(self): + """Traverse the graph and reset the gradients for all nodes.""" + nodes, _ = Parameter.trace_graph(self) + for node in nodes: + node.reset_gradients() + @staticmethod def trace_graph( root: "Parameter", @@ -507,6 +750,8 @@ def trace_graph( def build_graph(node: "Parameter"): if node in nodes: return + if node is None: + raise ValueError("Node is None") nodes.add(node) for pred in node.predecessors: edges.add((pred, node)) @@ -515,21 +760,6 @@ def build_graph(node: "Parameter"): build_graph(root) return nodes, edges - def report_cycle(cycle_nodes: List["Parameter"]): - """ - Report the detected cycle and provide guidance to the user on how to avoid it. - """ - cycle_names = [node.name for node in cycle_nodes] - log.warning(f"Cycle detected: {' -> '.join(cycle_names)}") - print(f"Cycle detected in the graph: {' -> '.join(cycle_names)}") - - # Provide guidance on how to avoid the cycle - print("To avoid the cycle, consider the following strategies:") - print("- Modify the graph structure to remove cyclic dependencies.") - print( - "- Check the relationships between these nodes to ensure no feedback loops." - ) - def backward( self, ): @@ -559,63 +789,94 @@ def build_topo(node: Parameter): if not node.requires_opt: log.debug(f"Skipping {node.name} as it does not require optimization") continue - log.debug(f"v: {node.data}, grad_fn: {node.grad_fn}, {node.get_grad_fn()}") + component_name = None + if hasattr(node, "component_trace"): + component_name = node.component_trace.name + printc( + f"node: {node.name}, component: {component_name}, grad_fn: {node.grad_fn}." 
+ ) if node.get_grad_fn() is not None: # gradient function takes in the engine log.debug(f"Calling gradient function for {node.name}") node.grad_fn() - # def backward( - # self, - # ): # engine should be the llm or customized backwards function to pass feedback - - # # topological sort of all the predecessors of the current parameter in the graph - # log.debug(f"Backward pass for {self.data}, backward function: {self.grad_fn}") - # topo: List[Parameter] = [] - # visited = set() - # in_stack = set() # Nodes currently being visited to detect cycles - # cycle_detected = False # Flag to check if any cycle was detected - - # def build_topo(node: Parameter, stack: Set[Parameter] = set()): - # nonlocal cycle_detected - - # if stack is None: - # stack = [] - - # # If the node is already in the stack, we have detected a cycle - # if node in in_stack: - # cycle_detected = True - # cycle_nodes = stack + [node] # The cycle includes the current path - # self.report_cycle(cycle_nodes) - # return False # Stop further processing due to cycle - # if node in visited: - # return - # visited.add(node) - # in_stack.add(node) - # stack.append(node) - # for pred in node.predecessors: - # build_topo(pred) - # topo.append(node) - # stack.pop() # Backtrack, remove the node from the current path - - # in_stack.remove(node) # Remove from the stack after processing - # return True - - # # build_topo(self) - # if not build_topo(self): - # log.error("Cycle detected, stopping backward pass.") - # return # Stop the backward pass due to cycle detection - # # backpropagation - - # self.gradients = set() - # for node in reversed(topo): - # if not node.requires_opt: - # log.debug(f"Skipping {node.name} as it does not require optimization") - # continue - # node.gradients = _check_and_reduce_gradients(node) - # log.debug(f"v: {node.data}, grad_fn: {node.grad_fn}, {node.get_grad_fn()}") - # if node.get_grad_fn() is not None: # gradient function takes in the engine - # log.debug(f"Calling gradient function for {node.name}") - # node.grad_fn() + @staticmethod + def generate_node_html(node: "Parameter", output_dir="node_pages"): + """Generate an HTML page for a specific node.""" + import json + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + filename = f"{output_dir}/{node.name}.html" + + # Gather gradients as JSON objects + gradients = [] + for i, g in enumerate(node.gradients): + gradient = g.to_json_obj() + for k, v in gradient.items(): + if isinstance(v, str): + gradient[k] = v.replace("<", "<").replace(">", ">") + gradients.append(gradient) + + data_json = None + node_data_type = str(type(node.data)).replace("<", "<").replace(">", ">") + printc(f"Node data type: {node_data_type}") + if isinstance(node.data, dict): + data_json = data_json + elif isinstance(node.data, DataClass): + try: + data_json = node.data.to_json_obj() + except Exception: + + data_json = str(node.data) + + else: + data_json = str(node.data) + data_json = {"data": data_json} + + gradients_json = json.dumps(gradients, indent=4, ensure_ascii=False) + + optimizer_trace = None + if node.tgd_optimizer_trace: + optimizer_trace = node.tgd_optimizer_trace.to_json_obj() + optimizer_trace = json.dumps(optimizer_trace, indent=4, ensure_ascii=False) + + with open(filename, "w") as file: + file.write( + f""" + + + + + + {node.name} + + + +

Details for Node: {node.name}

+

ID: {node.id}

+

Role: {node.role_desc}

+

DataType: {node_data_type}

+
Data: \n{json.dumps(data_json, indent=4)}
+

Data ID: {node.data_id}

+

Previous Value: {node.previous_data}

+

Requires Optimization: {node.requires_opt}

+

Type: {node.param_type.value} ({node.param_type.description})

+
Gradients:\n{gradients_json}
+
TGD Optimizer Trace:\n{optimizer_trace}
+ + + + """ + ) + print(f"Generated HTML for node: {node.name} at {filename}") def draw_interactive_html_graph( self, @@ -636,18 +897,30 @@ def draw_interactive_html_graph( """ from jinja2 import Template - # Define the output file path output_file = "interactive_graph.html" final_file = filepath + "_" + output_file if filepath else output_file - # Create a pyvis Network instance net = Network(height="750px", width="100%", directed=True) + node_colors = { + ParameterType.PROMPT: "lightblue", + ParameterType.DEMOS: "orange", + ParameterType.INPUT: "gray", + ParameterType.OUTPUT: "green", + ParameterType.GENERATOR_OUTPUT: "purple", + ParameterType.RETRIEVER_OUTPUT: "red", + ParameterType.LOSS_OUTPUT: "pink", + ParameterType.SUM_OUTPUT: "blue", + } + # Add nodes to the graph node_ids = set() for node in nodes: + self.generate_node_html(node, output_dir=filepath) + label = ( - f"Name: {node.name}
" + f"""
""" + f"Name: {node.name[0:10]}
" f"Role: {node.role_desc.capitalize()}
" f"Value: {node.data}
" f"Data ID: {node.data_id}
" @@ -655,18 +928,16 @@ def draw_interactive_html_graph( if node.proposing: label += "Proposing: Yes
" label += f"Previous Value: {node.previous_data}
" - if node.requires_opt: - label += "Requires Optimization: Yes
" + label += f"Requires Optimization: {node.requires_opt}
" if node.param_type: - label += f"Type: {node.param_type}
" - if node.gradients: - label += f"Gradients: {node.get_gradients_names()}
" + label += f"Type: {node.param_type.value}
" net.add_node( - node.id, - label=node.name, + n_id=node.id, + label=node.name[0:16], title=label, - color="lightblue" if node.proposing else "orange", + color=node_colors.get(node.param_type, "gray"), + url=f"{filepath}/{node.name}.html", ) node_ids.add(node.id) @@ -679,33 +950,63 @@ def draw_interactive_html_graph( f"Skipping edge from {source.name} to {target.name} as one of the nodes does not exist." ) - # Enable physics for better layout net.toggle_physics(True) net.template = Template( """ - - - - - - - -
- - - - """ + + + + + + + + +
+
+ +
+
+ + + + """ ) - # Save the graph as an HTML file - net.show(final_file) print(f"Interactive graph saved to {final_file}") @@ -739,12 +1040,6 @@ def draw_graph( "Please install graphviz using 'pip install graphviz' to use this feature" ) from e - # try: - # from tensorboardX import SummaryWriter - # except ImportError as e: - # raise ImportError( - # "Please install tensorboardX using 'pip install tensorboardX' to use this feature" - # ) from e assert rankdir in ["LR", "TB"] try: import textwrap @@ -766,6 +1061,7 @@ def draw_graph( if filepath else os.path.join(root_path, "graphs", filename) ) + # final_path = f"{filepath}.{format}" print(f"Saving graph to {filepath}.{format}") def wrap_text(text, width): @@ -792,13 +1088,14 @@ def wrap_and_escape(text, width=40): return wrap_text(text, width) nodes, edges = self.trace_graph(self) - dot = Digraph(format=format, graph_attr={"rankdir": rankdir}) + dot = Digraph(format=format, graph_attr={"rankdir": rankdir, "dpi": "300"}) node_names = set() for n in nodes: label_color = "darkblue" node_label = ( f"" + f"" f"" f"" f"" @@ -812,32 +1109,45 @@ def wrap_and_escape(text, width=40): node_label += f"" if n.param_type: node_label += f"" - if full_trace and n.component_trace.api_kwargs is not None: + if ( + full_trace + and hasattr(n, "component_trace") + and n.component_trace.api_kwargs is not None + ): node_label += f"" # show the score for intermediate nodes - if n._score is not None and len(n.predecessors) > 0: - node_label += f"" + if n.score is not None and len(n.predecessors) > 0: + node_label += f"" if add_grads: node_label += f"" # add a list of each gradient with short value # combine the gradients and context - combined_gradients_contexts = zip( - n.gradients, [n.gradients_context[g] for g in n.gradients] - ) - for g, context in combined_gradients_contexts: - gradient_context = context + # combined_gradients_contexts = zip( + # n.gradients, [n.gradients_context[g] for g in n.gradients] + # ) + # if "output" in n.name: + for g in n.gradients: + gradient_context = g.context log.info(f"Gradient context display: {gradient_context}") log.info(f"data: {g.data}") node_label += f"" - if gradient_context != "": - node_label += f"" + # if gradient_context != "": + # node_label += f"" + # if g.prompt: + # node_label += f"" if len(n._traces.values()) > 0: node_label += f"" node_label += f"" if n.tgd_optimizer_trace is not None: node_label += f"" + # show component trace, id and name + if hasattr(n, "component_trace") and n.component_trace.id is not None: + node_label += f"" + if hasattr(n, "component_trace") and n.component_trace.name is not None: + node_label += f"" + node_label += "
ID: {wrap_and_escape(n.id)}
Name: {wrap_and_escape(n.name)}
Role: {wrap_and_escape(n.role_desc.capitalize())}
Value: {wrap_and_escape(n.data)}
Requires Optimization: {{'Yes'}}
Type: {wrap_and_escape(n.param_type.name)}
API kwargs: {wrap_and_escape(str(n.component_trace.api_kwargs))}
Score: {str(n._score)}
Score: {str(n.score)}
Gradients: {wrap_and_escape(n.get_gradients_names())}
Gradient {g.name} Feedback: {wrap_and_escape(g.data)}
Gradient {g.name} Context: {wrap_and_escape(gradient_context)}
Gradient {g.name} Context: {wrap_and_escape(gradient_context)}
Gradient {g.name} Prompt: {wrap_and_escape(g.prompt)}
Traces: keys: {wrap_and_escape(str(n._traces.keys()))}
Traces: values: {wrap_and_escape(str(n._traces.values()))}
TGD Optimizer Trace: {wrap_and_escape(str(n.tgd_optimizer_trace))}
Component Trace ID: {wrap_and_escape(str(n.component_trace.id))}
Component Trace Name: {wrap_and_escape(str(n.component_trace.name))}
" # check if the name exists in dot if n.name in node_names: @@ -854,57 +1164,306 @@ def wrap_and_escape(text, width=40): for g in n.gradients: log.info(f"Gradient: {g.name}, {g.to_dict()}") - log.info(f"Gradient prompt: {g.gradient_prompt}") + log.info(f"Gradient prompt: {g.prompt}") for n1, n2 in edges: dot.edge(n1.name, n2.name) - dot.render(filepath, format=format, cleanup=True) - # from PIL import Image - # try: - # import matplotlib.pyplot as plt - # except ImportError as e: - # raise ImportError( - # "Please install matplotlib using 'pip install matplotlib' to use this feature" - # ) from e - # ) from e - # from io import BytesIO - # import numpy as np - - # # Read the rendered image file into memory using matplotlib - # with open(f"{filepath}.{format}", "rb") as f: - # image_bytes = f.read() - - # # Use matplotlib to read the image from bytes - # image = plt.imread(BytesIO(image_bytes), format=format) - - # # Ensure the image is in the format [H, W, C] - # if image.ndim == 2: # Grayscale image - # image = np.expand_dims(image, axis=2) - - # Read the rendered image file - # writer.add_image("graph", image, dataformats="HWC", global_step=1) - # writer.close() - - # filename = f"{filepath}_prompts.json" - # prompts = {} - # for n in nodes: - # prompts[n.name] = { - # "raw_response": n.raw_response, - # } - # for g in n.gradients: - # prompts[g.name] = { - # "gradient_prompt": g.gradient_prompt, - # } - - # save_json(prompts, filename) - # save root node to_dict to json + # dot.render(filepath, format=format, cleanup=True) + save_json(self.to_dict(), f"{filepath}_root.json") # draw interactive graph - self.draw_interactive_html_graph( - filepath=filepath, nodes=[n for n in nodes], edges=edges + graph_file: Dict[str, str] = self.draw_interactive_html_graph( + filepath=filepath, nodes=nodes, edges=edges + ) + output = { + # "graph_path": final_path, + "root_path": f"{filepath}_root.json", + "interactive_html_graph": graph_file["graph_path"], + } + print(f"Graph saved as {filepath}.{format}") + return output + + def draw_output_subgraph( + self, + add_grads: bool = True, + format: str = "png", + rankdir: str = "TB", + filepath: str = None, + ) -> Dict: + """ + Build and visualize a subgraph containing only OUTPUT parameters. + + Args: + add_grads (bool): Whether to include gradient edges. + format (str): Format for output (e.g., png, svg). + rankdir (str): Graph layout direction ("LR" or "TB"). + filepath (str): Path to save the graph. 
+ """ + + assert rankdir in ["LR", "TB"] + from adalflow.utils.global_config import get_adalflow_default_root_path + + try: + from graphviz import Digraph + + except ImportError as e: + raise ImportError( + "Please install graphviz using 'pip install graphviz' to use this feature" + ) from e + + root_path = get_adalflow_default_root_path() + + filename = f"trace_component_output_graph_{self.name}_id_{self.id}.{format}" + filepath = ( + os.path.join(filepath, filename) + if filepath + else os.path.join(root_path, "graphs", filename) ) - return {"graph_path": filepath, "root_path": f"{filepath}_root.json"} + + # Step 1: Collect OUTPUT nodes and edges + nodes, edges = self._collect_output_subgraph() + + # Step 2: Render using Graphviz + print(f"Saving OUTPUT subgraph to {filepath}") + + dot = Digraph(format=format, graph_attr={"rankdir": rankdir}) + node_ids = set() + + for node in nodes: + escaped_name = html.escape(node.name if node.name else "") + escaped_param_type = html.escape( + node.param_type.name if node.param_type else "" + ) + escaped_value = html.escape( + node.get_short_value() if node.get_short_value() else "" + ) + + node_label = f""" + + + + """ + # add the component trace id and name + if hasattr(node, "component_trace") and node.component_trace.id is not None: + escaped_ct_id = html.escape(str(node.component_trace.id)) + node_label += f"" + if ( + hasattr(node, "component_trace") + and node.component_trace.name is not None + ): + escaped_ct_name = html.escape(str(node.component_trace.name)) + node_label += f"" + + node_label += "
Name:{escaped_name}
Type:{escaped_param_type}
Value:{escaped_value}
Component Trace ID:{escaped_ct_id}
Component Trace Name:{escaped_ct_name}
" + dot.node( + name=node.id, + label=f"<{node_label}>", + shape="plaintext", + color="lightblue" if node.requires_opt else "gray", + ) + node_ids.add(node.id) + + for source, target in edges: + if source.id in node_ids and target.id in node_ids: + dot.edge(source.id, target.id) + + # Step 3: Save and render + dot.render(filepath, cleanup=True) + print(f"Graph saved as {filepath}") + return {"output_subgraph": filepath} + + def draw_component_subgraph( + self, + format: str = "png", + rankdir: str = "TB", + filepath: str = None, + ): + """ + Build and visualize a subgraph containing only OUTPUT parameters. + + Args: + format (str): Format for output (e.g., png, svg). + rankdir (str): Graph layout direction ("LR" or "TB"). + filepath (str): Path to save the graph. + """ + assert rankdir in ["LR", "TB"] + from adalflow.utils.global_config import get_adalflow_default_root_path + + try: + from graphviz import Digraph + except ImportError as e: + raise ImportError( + "Please install graphviz using 'pip install graphviz' to use this feature" + ) from e + + # Step 1: Collect OUTPUT nodes and edges + component_nodes, edges, component_nodes_orders = ( + self._collect_component_subgraph() + ) + root_path = get_adalflow_default_root_path() + + # Step 2: Setup graph rendering + filename = f"output_component_{self.name}_{self.id}.{format}" + filepath = filepath or f"./{filename}" + + filepath = ( + os.path.join(filepath, filename) + if filepath + else os.path.join(root_path, "graphs", filename) + ) + print(f"Saving OUTPUT subgraph to {filepath}") + + dot = Digraph(format=format, graph_attr={"rankdir": rankdir}) + + # Add nodes + for node in component_nodes: + node_label = """ + """ + + if node.name: + node_label += """""" + if node.type: + node_label += """""" + + # add the list of orders + if node.id in component_nodes_orders: + node_label += f"" + node_label += "
Name:{node.name}
Type:{node.type}
Order:{component_nodes_orders[node.id]}
" + dot.node( + name=node.id if node.id else "id missing", + label=f"<{node_label}>", + shape="plaintext", + color="lightblue", + ) + + # Add edges with order labels + for source_id, target_id, edge_order in edges: + dot.edge(source_id, target_id) # , label=str(edge_order), color="black") + + # Step 3: Save and render + dot.render(filepath, cleanup=True) + print(f"Graph saved as {filepath}") + return {"component_graph": f"{filepath}"} + + def _collect_output_subgraph( + self, + ) -> Tuple[Set["Parameter"], List[Tuple["Parameter", "Parameter"]]]: + """ + Collect nodes of type OUTPUT and their relationships. + + Returns: + nodes (Set[Parameter]): Set of OUTPUT nodes. + edges (List[Tuple[Parameter, Parameter]]): Edges between OUTPUT nodes. + """ + output_nodes = set() + edges = [] + + visited = set() # check component_trace.id and name + + def traverse(node: "Parameter"): + if node in visited: + return + visited.add(node) + + # Add OUTPUT nodes to the set + if ( + node.param_type == ParameterType.OUTPUT + or "OUTPUT" in node.param_type.name + ): + output_nodes.add(node) + + # Traverse predecessors and add edges + for pred in node.predecessors: + if ( + pred.param_type == ParameterType.OUTPUT + or "OUTPUT" in pred.param_type.name + ): + edges.append((pred, node)) + traverse(pred) + + traverse(self) + return output_nodes, edges + + def _collect_component_subgraph( + self, + ) -> Tuple[Set[ComponentNode], List[Tuple[str, str]]]: + """ + Collect OUTPUT nodes and their relationships as ComponentNodes. + + Returns: + component_nodes (Set[ComponentNode]): Set of component nodes (id and name only). + edges (List[Tuple[str, str]]): Edges between component IDs. + """ + component_nodes = set() # To store component nodes as ComponentNode + component_nodes_orders: Dict[str, List[int]] = ( + {} + ) # To store component nodes order + edges = [] # To store edges between component IDs + + visited = set() # Track visited parameters to avoid cycles + edge_counter = [0] # Mutable counter for edge order tracking + + def traverse(node: "Parameter"): + if node in visited: + return + visited.add(node) + + # Check if node is of OUTPUT type + if ( + node.param_type == ParameterType.OUTPUT + or "OUTPUT" in node.param_type.name + ): + component_id = node.component_trace.id or f"unknown_id_{uuid.uuid4()}" + component_name = node.component_trace.name or "Unknown Component" + + # Create a ComponentNode and add to the set + component_node = ComponentNode(id=component_id, name=component_name) + component_nodes.add(component_node) + + # Traverse predecessors and add edges + for pred in node.predecessors: + # if pred.param_type != ParameterType.OUTPUT: + # continue + pred_id = f"unknown_id_{uuid.uuid4()}" + pred_name = "Unknown Component" + + if hasattr(pred, "component_trace") and pred.component_trace.id: + pred_id = pred.component_trace.id + pred_name = pred.component_trace.name + + # Add edge if predecessor is also of OUTPUT type + if ( + pred.param_type == ParameterType.OUTPUT + or "OUTPUT" in pred.param_type.name + ): + edges.append((pred_id, component_id, edge_counter[0])) + component_nodes.add(ComponentNode(id=pred_id, name=pred_name)) + edge_counter[0] += 1 + + if pred.param_type == ParameterType.INPUT: + pred_id = pred.id + pred_name = pred.name + pred_node = ComponentNode( + id=pred_id, name=pred_name, type="INPUT" + ) + component_nodes.add(pred_node) + # add an edge from input to the first output + edges.append((pred_id, component_id, edge_counter[0])) + edge_counter[0] += 1 + + traverse(pred) + + # Start traversal 
from the current parameter + traverse(self) + # Reverse the edge order + # total_edges = len(edges) + # edges = [ + # (source, target, (total_edges - 1) - edge_number) + # for idx, (source, target, edge_number) in enumerate(edges) + # ] + + return component_nodes, edges, component_nodes_orders def to_dict(self): return { @@ -918,17 +1477,11 @@ def to_dict(self): "predecessors": [pred.to_dict() for pred in self.predecessors], "gradients": [grad.to_dict() for grad in self.gradients], "previous_data": self.previous_data, - "gradients_context": [ - (k.name, v) for k, v in self.gradients_context.items() - ], "grad_fn": str( self.grad_fn ), # Simplify for serialization, modify as needed - "gradient_prompt": str(self.gradient_prompt), - "raw_response": self.raw_response, - "score": self._score, + "score": self.score, "traces": {k: v.to_dict() for k, v in self._traces.items()}, - "input_args": self.input_args, # demos "demos": [d.to_dict() for d in self._demos], } @@ -946,21 +1499,322 @@ def from_dict(cls, data: dict): predecessors=predecessors, gradients=[cls.from_dict(grad) for grad in data["gradients"]], previous_data=data["previous_data"], - gradient_prompt=data["gradient_prompt"], - raw_response=data["raw_response"], - input_args=data["input_args"], score=data["score"], # demos demos=[DataClass.from_dict(d) for d in data["demos"]], ) # Reconstruct gradients_context from the list of tuples - param.gradients_context = defaultdict( - lambda: None, {cls.from_dict(k): v for k, v in data["gradients_context"]} - ) param._traces = {k: DataClass.from_dict(v) for k, v in data["traces"].items()} return param # TODO: very hard to read directly, need to simplify and let users use to_dict for better readability def __repr__(self): return f"Parameter(name={self.name}, requires_opt={self.requires_opt}, param_type={self.param_type}, role_desc={self.role_desc}, data={self.data}, predecessors={self.predecessors}, gradients={self.gradients},\ - raw_response={self.raw_response}, input_args={self.input_args}, traces={self._traces})" + traces={self._traces})" + + +# TODO: separate the Parameter class into different classes and each class will have its own methods instead of all in one class +class InputParameter(Parameter): + """One of the simplest types of parameters, representing an input to the system. + Input parameter will not be trainable, but serves a tracing purpose in the computation graph. 
+ """ + + def __init__( + self, + name: str, + role_desc: str, + data: Any, + requires_opt: bool = False, + param_type: ParameterType = ParameterType.INPUT, + ): + super().__init__( + name=name, + role_desc=role_desc, + data=data, + requires_opt=requires_opt, + param_type=param_type, + ) + + +class HyperParameter(Parameter): + """One of the simplest types of parameters, representing a hyperparameter to the system.""" + + def __init__( + self, + name: str, + role_desc: str, + data: Any, + requires_opt: bool = False, + param_type: ParameterType = ParameterType.HYPERPARAM, + ): + super().__init__( + name=name, + role_desc=role_desc, + data=data, + requires_opt=requires_opt, + param_type=param_type, + ) + + +class PromptParameter(Parameter): + + def __init__( + self, + name: str, + role_desc: str, + data: Any, + requires_opt: bool = True, + param_type: ParameterType = ParameterType.PROMPT, + ): + super().__init__( + name=name, + role_desc=role_desc, + data=data, + requires_opt=requires_opt, + param_type=param_type, + ) + + +class DemoParameter(Parameter): + + def __init__( + self, + name: str, + role_desc: str, + data: Any, + requires_opt: bool = True, + param_type: ParameterType = ParameterType.DEMOS, + ): + super().__init__( + name=name, + role_desc=role_desc, + data=data, + requires_opt=requires_opt, + param_type=param_type, + ) + + +class OutputParameter(Parameter): + __doc__ = r"""The output parameter is the most complex type of parameter in the system. + + It will trace the predecessors, set up a grad_fn, store gradients, and trace the forward pass by tracking the component_trace. + """ + allowed_types = { + ParameterType.OUTPUT, + ParameterType.LOSS_OUTPUT, + ParameterType.GENERATOR_OUTPUT, + ParameterType.SUM_OUTPUT, + } + component_trace: ComponentTrace = ( + None # Trace of the component that produced this output + ) + full_response: object = None # The full response from the component + + def __init__( + self, + *, + id: Optional[str] = None, # unique id of the parameter + data: T = None, # for generator output, the data will be set up as raw_response + data_id: str = None, # for tracing the data item in the training/val/test set + requires_opt: bool = True, + role_desc: str = "", + param_type: ParameterType = ParameterType.OUTPUT, + name: str = None, # name is used to refer to the parameter in the prompt, easier to read for humans + instruction_to_optimizer: str = None, + instruction_to_backward_engine: str = None, + score: Optional[float] = None, + eval_input: object = None, + successor_map_fn: Optional[Dict[str, Callable]] = None, + data_in_prompt: Optional[Callable] = None, + full_response: Optional[Any] = None, + ): + super().__init__( + id=id, + data=data, + data_id=data_id, + requires_opt=requires_opt, + role_desc=role_desc, + param_type=param_type, + name=name, + instruction_to_optimizer=instruction_to_optimizer, + instruction_to_backward_engine=instruction_to_backward_engine, + score=score, + eval_input=eval_input, + successor_map_fn=successor_map_fn, + data_in_prompt=data_in_prompt, + ) + + self.component_trace = ComponentTrace() + self.full_response = full_response + + ############################################################################################################ + # Trace component, include trace_forward_pass & trace_api_kwargs for now + ############################################################################################################ + def trace_forward_pass( + self, + input_args: Dict[str, Any], + full_response: object, + id: str = None, + name: 
str = None, + ): + r"""Trace the forward pass of the parameter. Adding the component information to the trace""" + self.input_args = input_args + self.full_response = full_response + # TODO: remove the input_args and full_response to use component_trace + self.component_trace.input_args = input_args + self.component_trace.full_response = full_response + self.component_trace.id = id + self.component_trace.name = name + # just for convenience to trace full response separately + self.full_response = full_response + + def trace_api_kwargs(self, api_kwargs: Dict[str, Any]): + r"""Trace the api_kwargs for components like Generator and Retriever that pass to the model client.""" + self.component_trace.api_kwargs = api_kwargs + + def to_dict(self): + super_dict = super().to_dict() + super_dict.update( + { + "component_trace": self.component_trace.to_dict(), + } + ) + + # def to_json(self): + # import json + + # return json.dumps(self.to_dict()) + + @classmethod + def from_dict(cls, data: dict): + component_trace = ComponentTrace.from_dict(data["component_trace"]) + return super().from_dict(data).update({"component_trace": component_trace}) + + def __repr__(self): + super_repr = super().__repr__() + start = super_repr.find("Parameter") + if start == 0: + end = start + len("Parameter") + super_repr = super_repr[:start] + "OutputParameter" + super_repr[end:] + return super_repr + + +# gradients= List[Gradient] + + +@dataclass +class Gradient(DataClass): + __doc__ = r"""It will handle gradients and feedbacks. + + It tracks the d_from_response_id / d_to_pred_id and the score of the whole response. + + if two gradients have the same data_id, different from_response_id, and same from_response_component_id, this is a cycle component structure. + """ + data_id: Optional[str] = None # the id of the response from data in the dataset + from_response_component_id: str = ( + None # the id of the component from which the gradient is calculated + ) + order: Optional[int] = None # the order of the gradient in the list of gradients + + from_response_id: str = ( + None # the id of the response from which the gradient is calculated + ) + + to_pred_id: str = ( + None # the id of the parameter to which the gradient is calculated and attached to d(from_response_id) / d(to_pred_id) + ) + + score: Optional[float] = None + + context: GradientContext = None + data: Any = None + prompt: Optional[str] = None # the LLM prompt to generate the gradient + + is_default_copy: bool = False # whether the gradient is a default copy + + def __init__( + self, + *, + from_response: "Parameter", + to_pred: "Parameter", + id: Optional[str] = None, # the id of the gradient + score: Optional[float] = None, + data_id: Optional[str] = None, + data: Any = None, + ): + self.id = id or str(uuid.uuid4()) + self._generate_name(from_response, to_pred) + self.from_response_component_id = from_response.component_trace.id + if not self.from_response_component_id: + raise ValueError( + "The from_response_component_id should not be None. Please ensure the component_trace is set." 
+ ) + self.from_response_id = from_response.id + self.to_pred_id = to_pred.id + self.score = score + self.data_id = data_id + if self.data_id is None: + raise ValueError("The data_id should not be None.") + self.data = data + self.order = None + + def _generate_name(self, response: "Parameter", pred: "Parameter"): + self.name = f"d_{response.name}_/_{pred.name}({response.id}_/_{pred.id})" + self.role_desc = f"Gradient from {response.name} to {pred.name}" + + def add_context(self, context: GradientContext): + self.context = context + + def add_data(self, data: Any): + self.data = data + + def update_from_to(self, from_response: "Parameter", to_pred: "Parameter"): + self.from_response_id = from_response.id + self.to_pred_id = to_pred.id + self._generate_name(from_response, to_pred) + self.from_response_component_id = from_response.component_trace.id + + def add_prompt(self, prompt: str): + self.prompt = prompt + + def __hash__(self): + # Use immutable and unique attributes to compute the hash + return hash((self.id, self.data_id, self.from_response_id, self.to_pred_id)) + + def __eq__(self, other): + # Ensure equality comparison is based on the same unique attributes + if not isinstance(other, Gradient): + return False + return ( + self.id == other.id + and self.data_id == other.data_id + and self.from_response_id == other.from_response_id + and self.to_pred_id == other.to_pred_id + ) + + +if __name__ == "__main__": + + # test gradient hash and to_dict + from_response = OutputParameter( + name="p1", + role_desc="role1", + data=1, + ) + from_response.component_trace = ComponentTrace(id="1") + g1 = Gradient( + from_response=from_response, + to_pred=Parameter(name="p2", role_desc="role2", data=2), + data_id="1", + ) + g2 = Gradient( + from_response=from_response, + to_pred=Parameter(name="p2", role_desc="role2", data=2), + data_id="1", + ) + print(g1 == g2) + print(g1.__hash__()) + print(g2.__hash__()) + print(isinstance(g1, Gradient)) # Should print True + + print(g1.to_dict()) diff --git a/adalflow/adalflow/optim/text_grad/backend_engine_prompt.py b/adalflow/adalflow/optim/text_grad/backend_engine_prompt.py index a5f3ddb1..4291b049 100644 --- a/adalflow/adalflow/optim/text_grad/backend_engine_prompt.py +++ b/adalflow/adalflow/optim/text_grad/backend_engine_prompt.py @@ -8,28 +8,120 @@ # NOTE: having peers is important to keep the scope of the prompt consistent and not cross-reference with other variables ### System prompt and the template is shared by all GradComponent ### +# FEEDBACK_ENGINE_TEMPLATE = r""" +# You are the feedback engine in an optimization system consisting of multiple components. + +# Your task is to provide intelligent and creative feedback in each component for the target variable enclosed in or tags +# so that the optimizer can optimize this variable to improve the objective enclosed in tags. + +# Instructions: +# 1. Understand the role of each variable in the component system BEFORE you give feedback. +# 2. You MUST attribute the feedback to the correct variable only. +# 3. Focus on the downstream objective without proposing new versions of the variable. +# 4. From the section, see how the variable is obtained and used. +# 5. The variable might have peers also used to instruct the language model, but your feedback should only focus on the target variable. +# 6. If the error is not directly related to the variable itself, you can say: \"There is no noticeable error.\" +# 7. Be specific, concise, critical, and direct. +# 8. 
If the same DataID appears multiple times, it means the component/variable is called repeatedly in the same order as it appears in the gradient list. + + +# {% if output_format_str %} +# {{output_format_str}} +# {% endif %} + +# +# +# +# {{conversation_sec}} +# +# +# {{objective_instruction_sec}} +# +# +# """ + +FEEDBACK_ENGINE_PEERS_TEMPLATE = r""" +You are the feedback engine in an optimization system consisting of multiple components. + +A component can have multiple inputs, and you handle one that is enclosed in or tags. +You will provide intelligent and creative feedback so that the optimizer can optimize this variable to improve the objective enclosed in tags. + +About or : +* If a variable is of type "output", it is the output of another predecessor component. In this case, you MUST attribute the error to the RIGHT variable. +* If a variable plays no role to the error, simply state "This variable did not cause the error. No need to change the essense of this variable." + +1. From section, you can find how the variable is obtained and used. +2. The variable might have other peers that are used together to instruct the language model. But only focus on the target variable. +3. As there might be peers, and multi-components, it is possible that the feedback/error is not directly related to the variable itself. +4. When you reason, really think about the variable's role in the component(infer from the CONVERSATION section) and the VARIABLE section before you provide feedback. +5. Be specific, concise, critical, and direct. + + +{% if output_format_str %} +{{output_format_str}} +{% endif %} + + + + +{{conversation_sec}} + + +{{objective_instruction_sec}} + + +""" +# 1. Focus on the downstream OBJECTIVE without proposing new versions of the variable. + +# +# Here is a summary on the task pipeline you are optimizing: +# retriever: retrieves relevant documents for the question. (Not trainable, you have no control) +# LLM: Answer questions by reading the context and reason the best answer. +# +# You are the feedback engine in an optimization system consisting of multiple components. +# You are the feedback engine to provide feedback for a target variable in a compound LLM system. + +# The evaluation and feedback is backpropogated all the way to you, and you will assess the current component's inputs, output along with its feedback. +# A component can have multiple inputs, and you handle one that is enclosed in or tags. +# You will provide intelligent and creative feedback so that the optimizer can optimize this variable to improve the objective enclosed in tags. + FEEDBACK_ENGINE_TEMPLATE = r""" -You are the feedback engine in an optimization system. +You are a detective excel at determining the root cause of a system error. +You start with an evaluation function that measures performance, and you receive the system input. +The system can be a a compound system, potentially consisting of multiple components. +You will receive feedback from your direct successor, and your goal is to investigate your component’s inputs and outputs to identify whether any of your input variables are causing the error. + +Your target input variable is enclosed in (representing one of the input variables that may or may not be causing the error). +Alternatively, it may be enclosed in tags (in which case you must pass feedback to all variables, indicating which ones cause the errors and which do not). 
-Your task is to provide intelligent and creative feedback for the target variable enclosed in tags, -so that the optimizer can optimize this variable to improve the objective enclosed in tags. +1. From section, you can find how the variable is obtained and used. +2. As there might be multiple precedessors, and multi-components, it is possible that the feedback/error is not directly related to the variable itself. +3. When you reason, really think about the variable's role in the component(infer from the CONVERSATION section) and the VARIABLE section before you provide feedback. +4. Be specific, concise, critical, and direct. +5. Maximum 3 sentences. -1. Focus on the downstream OBJECTIVE without proposing new versions of the variable. -2. Feedback examples: "Since language models have the X failure mode...", "Adding X can fix this error because...", "Removing X can improve the objective function because...", "Changing X to Y would fix the mistake..." -3. Consider the variable in the context of its peers if provided. +If the same DataID has multiple gradients, it means this component/variable is called multiple times in the compound system(with a cycle) in the same order as it appears in the gradient list. + +{% if output_format_str %} +{{output_format_str}} +{% endif %} -Remember: -Be specific, concise, critical, and direct. + {{conversation_sec}} + {{objective_instruction_sec}} + + """ ############################################## # Loss Component ############################################## +# In such cases, you can just say "There is no noticeable error". +# 2. Feedback examples: "Since language models have the X failure mode...", "Adding X can fix this error because...", "Removing X can improve the objective function because...", "Changing X to Y would fix the mistake..." # Objective instruction for LLM as gradComponent with user custom instruction @@ -39,32 +131,120 @@ # Note: {{instruction_to_backward_engine}} # {% endif %} # """ +# Your only goal is to clearly states how it obtained the "". + + +# OBJECTIVE_INSTRUCTION_BASE = r""" +# Your only goal is to clearly states how it obtained the "", +# so that you can inform other components on the specific errors. +# e.g. "The and are not an exact match, it differs by ." +# Especially when the score is low. +# Be CONCISE. Be SPECIFIC. +# """ + +# OBJECTIVE_INSTRUCTION_BASE = r""" +# Your task: Provide specific feedback based on the score in the \"\" value. +# - Especially note when the score is low (e.g. 0.0). +# - Be concise. +# - Be specific about why the score is low. For example: +# The retrieved context is insufficient to answer the question accurately. +# """ OBJECTIVE_INSTRUCTION_BASE = r""" -Your only goal is to clearly states how it obtained the "". +Your task is to provide the response with specific feedback based on the expected correct response (y_gt/ground_truth) and the score in the "". Especially when the score is low. Be CONCISE. + Be specific on why it has a low score. +Specify the difference between the expected correct response and the response. +""" + +# Be specific on why it has a low score. + +### NOTE: Last node's feedback +# OBJECTIVE_INSTRUCTION_CHAIN = r"""This conversation is part of a larger system. The was later used as "{{response_name}}: {{response_desc}}". +# +# Your only goal is to clearly provide feedback on obtaining "Eval output/score": {{response_gradient}}. +# Be CONCISE and specific on how it can be improved. +# """ + +OBJECTIVE_INSTRUCTION_CHAIN = r"""This conversation is part of a larger system. 
The was later used as "{{response_name}}: {{response_desc}}". + +Your only goal is to clearly states how it obtained the "Eval output/score": {{response_gradient}}. +Especially when the score is low. +Be CONCISE. +If you have enough context, add a more specific feedback on how it failed. e.g. "The retrieved context is not enough to answer the question so the problem relies on the retrieval part." """ +### Loss/Score Information ### +# INPUTS: parameter.get_param_info(): +# the input_output of a GradientContext + +# response_value -> response.get_prompt_data() +# LOSS_CONVERSATION_TEMPLATE_STRING = r""" +# The target variable is passed to the EVAL_FUNC and compared with the correct value. + +# EVAL_FUNC: {{eval_fn_desc}} + +# INPUTS: +# {% for key, (value, eval_type) in inputs.items() %} +# ({{ key }}) (role: {{ value.role_desc }}), +# data: {{ value.prompt_data }}, +# input_to_eval_fn: {{ value.eval_input }}, +# data_type: {{ eval_type }} +# {% endfor %} + +# OUTPUTS/SCORE: {{response_value}} +# {% if metadata %} +# Note: {{metadata}} +# {% endif %}""" + +# LOSS_CONVERSATION_TEMPLATE_STRING = r""" +# The variable is passed to the eval function and compared with a expected value(y_gt or ground_truth). + +# EVAL_FUNC: {{eval_fn_desc}} + +# INPUTS: +# {% for key, (value, eval_type) in inputs.items() %} +# ({{ key }}) (role: {{ value.role_desc }}), +# data: {{ value.prompt_data }}, +# input_to_eval_fn: {{ value.eval_input }}, +# data_type: {{ eval_type }} +# {% endfor %} + +# OUTPUTS/SCORE: {{response_value}} +# {% if metadata %} +# Note: {{metadata}} +# {% endif %}""" + ### Variable to get feedback on, often it is pred in the loss component +# pass parameter.get_param_info() to get the variable info LOSS_CONVERSATION_START_INSTRUCTION_STRING_FN = r""" TARGET VARIABLE: - {{variable_name}} - {{variable_desc}} - {{variable_value}} + {{variable.name}} + {{variable.role_desc}} + {{variable.prompt_data}} {{conversation_str}} """ ### Loss/Score Information ### LOSS_CONVERSATION_TEMPLATE_STRING = r""" -The variable is passed to the eval function and compared with a target/ground truth value. +The variable is passed to the eval function and compared with a target/ground truth value to get +its score regarding to a SYSTEM_QUESTION: {{system_question}}. + +EVAL_FUNC: {{eval_fn_desc}} -: {{eval_fn_desc}} -: {{input_str}} -: {{response_value}} +INPUTS to EVAL_FUNC: +{% for key, (value, eval_type) in inputs.items() %} +({{ key }}) (role: {{ value.role_desc }}), +data: {{ value.prompt_data }}, +input_to_eval_fn: {{ value.eval_input }}, +data_type: {{ eval_type }} +{% endfor %} + +OUTPUTS/SCORE: {{response_value}} {% if metadata %} Note: {{metadata}} {% endif %}""" @@ -77,10 +257,26 @@ CONVERSATION_START_INSTRUCTION_CHAIN = r""" {{variable_and_peers_info}} +{# system trainable variables #} +{% if predecessors %} + +The target variable is used together with these predecessors variables besides of the peers: +{% for system_variable in predecessors %} +{{loop.index}}. +Name: {{system_variable.name}} +Type: {{system_variable.param_type}} +Description: {{system_variable.role_desc}} +WILL_BE_OPTIMIZED: {{system_variable.requires_opt}} +Vaule: {{system_variable.prompt_data}} +{% endfor %} + +{% endif %} + Here is a conversation with the language model (LM): {{conversation_str}} """ +# For the generator in the chain, OBJECTIVE_INSTRUCTION_CHAIN = r""" This conversation is part of a larger system. The was later used as {{response_desc}}. 
@@ -90,35 +286,62 @@ {% endif %} """ -### Backward engine: user prompt -# First part to provide context of LLM as gradComponent -# The target variable is used as either input or a task instruction to a language model (LM): -# replace the "The target variable is used as either input or a task instruction to a language model (LM):" with the {{variable_desc}} -# NAME: {{variable_name}} -# Description: {{variable_desc}} -LLM_CONVERSATION_TEMPLATE = r""" -LM_INPUT: {{input_value}} -LM_OUTPUT: {{llm_output}}""" +SUMMARY_TASK = """ +Here is a summary on the task pipeline you are optimizing: +query_generator: "generates a sub-query based on the initial query" +retriever: "retrieves relevant documents based on the sub-query" +llm: "Answer a question with available context with exact answer extracted from the context" + +The query_generator is called twice in the pipeline. +And the retrieved documents are deduplicated and combined to form the final context. +The final context is then passed to the llm to generate the answer where we want to use the exact phrase from the context. +""" + + +# VARIABLE_AND_PEERS_INFO = r""" +# +# {{variable.name}} +# {{variable.param_type}} +# {{variable.role_desc}} +# {{ variable.prompt_data}} +# +# {% if peers %} +# +# The variable is used together with the these peer variables to instruct the language model: +# {% for peer in peers %} +# {{loop.index}}. +# PEER_NAME: {{peer.name}}, +# PEER_TYPE: {{peer.param_type}}, +# PEER_ROLE: {{peer.role_desc}} +# WILL_BE_OPTIMIZED: {{peer.requires_opt}} +# {% if peer.prompt_data %} +# PEER_VARIABLE: {{peer.prompt_data}} +# {% else %} +# PEER_VARIABLE: EMPTY +# {% endif %} +# {% endfor %} +# +# {% endif %} +# """ VARIABLE_AND_PEERS_INFO = r""" {{variable.name}} {{variable.param_type}} {{variable.role_desc}} - {{variable.data}} +{{ variable.prompt_data}} {% if peers %} -The variable is used together with the these peer variables to instruct the language model: {% for peer in peers %} {{loop.index}}. PEER_NAME: {{peer.name}}, PEER_TYPE: {{peer.param_type}}, PEER_ROLE: {{peer.role_desc}} WILL_BE_OPTIMIZED: {{peer.requires_opt}} -{% if peer.data %} -PEER_VARIABLE: {{peer.data}} +{% if peer.prompt_data %} +PEER_VARIABLE: {{peer.prompt_data}} {% else %} PEER_VARIABLE: EMPTY {% endif %} @@ -127,6 +350,54 @@ {% endif %} """ +# The variable is used together with the these peer variables to instruct the language model on the task. +# - Do not overlap with the scope of the peer. + + +# a list of variables +ALL_PRED_INFO = r""" + +{% if variables %} +Length of the list: {{variables|length}} +{% for variable in variables %} +{{loop.index}}. +NAME: {{variable.name}}, +TYPE: {{variable.param_type}}, +ROLE: {{variable.role_desc}} +WILL_BE_OPTIMIZED: {{variable.requires_opt}} +VARIABLE: {{ variable.prompt_data}} +{% endfor %} +{% endif %} + +""" + + +### Backward engine: user prompt +# First part to provide context of LLM as gradComponent +# The target variable is used as either input or a task instruction to a language model (LM): +# replace the "The target variable is used as either input or a task instruction to a language model (LM):" with the {{variable_desc}} +# NAME: {{variable_name}} +# Description: {{variable_desc}} +LLM_CONVERSATION_TEMPLATE = r""" +LM_INPUT: {{input_value}} +LM_OUTPUT: {{llm_output}} +{% if gt %} +GROUND_TRUTH: {{gt}} +{% endif %} +""" + +# OUTPUT_INSTRUCTION = r""" +# You will create a feedback for each of the variable in the list above. 
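The feedback-list output format sketched here (one string per variable in the ALL_PRED_INFO list) can be validated on the caller side; a minimal sketch, assuming the raw model output is a JSON/Python-style list literal (the helper `parse_feedback_list` is hypothetical, not part of this diff):

import ast
from typing import List


def parse_feedback_list(raw: str, num_variables: int) -> List[str]:
    # Expect one feedback string per variable, an empty string for variables that
    # will not be optimized, and the same length as the variable list.
    feedbacks = ast.literal_eval(raw.strip())
    if not isinstance(feedbacks, list) or len(feedbacks) != num_variables:
        raise ValueError(f"expected a list of {num_variables} strings, got: {raw!r}")
    return [str(f) for f in feedbacks]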
+# If a variable will not be optimized, you just output an empty string for that variable.
+# NOTE: you MUST output a list of strings with the same length as the list above as ["...", "...", "..."]
+# """
+OUTPUT_INSTRUCTION = r"""
+You will create a feedback for each of the variables in the list.
+If a variable will not be optimized, you just output an empty string.
+Your output will be a list of strings with the SAME LENGTH as the list,
+in the format of ["...", "...", "..."]
+"""
+
 #
 # When the parameter has no gradient, it is the start of the backpropagation chain, used as a loss function
 # CONVERSATION_START_INSTRUCTION_BASE = r"""
@@ -135,3 +406,7 @@
 # Here is an evaluation of the variable using a language model:
 # {{conversation_str}}
 # """
+
+##############################################
+# Backward multiple peers at the same time
+##############################################
diff --git a/adalflow/adalflow/optim/text_grad/ops.py b/adalflow/adalflow/optim/text_grad/ops.py
index ddce60dc..601cbfaf 100644
--- a/adalflow/adalflow/optim/text_grad/ops.py
+++ b/adalflow/adalflow/optim/text_grad/ops.py
@@ -4,7 +4,7 @@
 import logging

 from adalflow.optim.function import BackwardContext
-from adalflow.optim.parameter import Parameter
+from adalflow.optim.parameter import Parameter, OutputParameter
 from adalflow.optim.types import ParameterType
 from adalflow.optim.grad_component import GradComponent

@@ -33,7 +33,10 @@ def sum_ops(params: List[Parameter]) -> Parameter:
 # TODO: make all loss functions to support batch losses
 # TODO: use a temlate to format the concatenated values
 class Sum(GradComponent):
-    __doc__ = """The class to define a sum operation on a list of parameters, such as losses or gradients."""
+    __doc__ = """The class to define a sum operation on a list of parameters, such as losses or gradients.
+
+    It enables combining gradients from a batch of data samples.
+    """

     name = "Sum"

@@ -54,19 +57,25 @@ def forward(self, params: List[Parameter]) -> Parameter:
                 raise ValueError(
                     f"Sum operation only accepts a list of Parameters, got {type(param)}"
                 )
-        concat_values = "\n".join([str(p.data) for p in params])  # to_dict
+        concat_values = ",".join([str(p.data) for p in params])  # default concatenation
         role_descriptions = set([p.role_desc for p in params])
         role_descriptions = ", ".join(role_descriptions)
-        total = Parameter(
+        total = OutputParameter(
             data=concat_values,
             role_desc=f"A combination of a list of variables: {role_descriptions}",
             requires_opt=any([p.requires_opt for p in params]),
             name="sum",
-            score=sum([p._score for p in params]),  # total has a score
+            score=sum([p.score for p in params]),  # total has a score
             param_type=ParameterType.SUM_OUTPUT,
         )
         total.set_predecessors(params)
+        total.trace_forward_pass(
+            input_args=params,
+            full_response=concat_values,
+            id=total.id,
+            name=total.name,
+        )

         log.info("Sum forward", extra={"total": total.data})

@@ -114,13 +123,21 @@ def backward(self, summation: Parameter):
         }
         log.info(f"""Idempotent sum backward: {extra}""")

-        param_gradient = Parameter(
-            name=f"sum_to_{param.name}_grad",
-            data=param_gradient_value,
-            role_desc=f"Feedback to {param.role_desc}",
-            score=summation._score,
-            from_response_id=summation.id,
-            param_type=ParameterType.GRADIENT,
-        )
-        param.add_gradient(param_gradient)
-        log.debug(f"Added gradient to {param.role_desc}: {param_gradient.data}")
+        # param_gradient = Gradient(
+        #     data=param_gradient_value,
+        #     data_id=summation.data_id,
+        #     score=summation._score,
+        #     from_response=summation,
+        #     to_pred=param,
+        # )
+        # param.add_gradient(param_gradient)
+        # log.debug(f"Added gradient to {param.role_desc}: {param_gradient.data}")
+
+
+if __name__ == "__main__":
+    # quick manual test of the sum op
+
+    a = Parameter(data=1)
+    b = Parameter(data=2)
+    c = sum_ops([a, b])  # sum_ops expects a plain list of Parameters
+    c.backward()
diff --git a/adalflow/adalflow/optim/text_grad/text_loss_with_eval_fn.py b/adalflow/adalflow/optim/text_grad/text_loss_with_eval_fn.py
index 89ebd471..a7aa6fcd 100644
--- a/adalflow/adalflow/optim/text_grad/text_loss_with_eval_fn.py
+++ b/adalflow/adalflow/optim/text_grad/text_loss_with_eval_fn.py
@@ -11,7 +11,12 @@
 from adalflow.core import ModelClient
 from adalflow.core.generator import BackwardEngine
 from adalflow.core.types import GeneratorOutput
-from adalflow.optim.parameter import Parameter, GradientContext
+from adalflow.optim.parameter import (
+    Parameter,
+    GradientContext,
+    Gradient,
+    OutputParameter,
+)
 from adalflow.optim.types import ParameterType
 from adalflow.core.prompt_builder import Prompt

@@ -20,58 +25,13 @@
     LOSS_CONVERSATION_TEMPLATE_STRING,
     LOSS_CONVERSATION_START_INSTRUCTION_STRING_FN,
     OBJECTIVE_INSTRUCTION_BASE,
+    OBJECTIVE_INSTRUCTION_CHAIN,
 )
+from adalflow.utils import printc

 log = logging.getLogger(__name__)

-### Loss/Score Information ###
-# LOSS_CONVERSATION_TEMPLATE_STRING = r"""
-# The variable is passed to the eval function and compared with a target/ground truth value.
-
-# : {{eval_fn_desc}}
-# : {{input_str}}
-# : {{response_value}}
-# {% if metadata %}
-# Note: {{metadata}}
-# {% endif %}"""
-
-
-# Does not have gradient on the output, the loss function of the backpropagation chain
-# CONVERSATION_START_INSTRUCTION_STRING_FN_BASE = r"""You will give feedback to a variable with the following role:
-# {{variable_desc}} .
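As a usage sketch (hedged; the toy Parameters below stand in for per-sample LOSS_OUTPUT parameters produced by a loss component such as EvalFnToTextLoss), per-sample losses can be combined through sum_ops so that a single backward pass propagates feedback for the whole batch:

from adalflow.optim.parameter import Parameter
from adalflow.optim.text_grad.ops import sum_ops

# toy stand-ins for per-sample loss parameters
per_sample_losses = [Parameter(data=0.0), Parameter(data=1.0)]
batch_loss = sum_ops(per_sample_losses)  # the Sum GradComponent combines them
batch_loss.backward()  # one backward pass for the whole batch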
-# Here is an evaluation of the variable using the eval function: -# {{conversation}}""" - -# Has the gradient on the output, the layer in the backpropagation chain -# Conversation will be provided differently. - -# ### Variable Information ### -# CONVERSATION_START_INSTRUCTION_STRING_FN = r""" -# TARGET VARIABLE: -# {{variable_name}} -# {{variable_desc}} -# {{variable_value}} -# {{conversation_str}} -# """ - -# Third part of the user prompt -# OBJECTIVE_INSTRUCTION_BASE = r""" -# Your only goal is to clearly states how it obtained the "". -# Especially when the score is low. -# Be CONCISE. -# If you have enough context, add a more specific feedback on how it failed. -# """ - - -OBJECTIVE_INSTRUCTION_CHAIN = r"""This conversation is part of a larger system. The was later used as "{{response_name}}: {{response_desc}}". - -Your only goal is to clearly states how it obtained the "Eval output/score": {{response_gradient}}. -Especially when the score is low. -Be CONCISE. -If you have enough context, add a more specific feedback on how it failed. -""" - class EvalFnToTextLoss(LossComponent): __doc__ = """Convert an evaluation function to a text loss. @@ -141,7 +101,18 @@ def forward( kwargs: Dict[str, Parameter], response_desc: str = None, metadata: Dict[str, str] = None, # additional notes on the input kwargs + id: str = None, + gt: object = None, + input: Dict[str, object] = None, ) -> Parameter: + r""" + Args: + kwargs: The inputs to the eval_fn. + response_desc: Description of the output. + metadata: Additional notes on the input kwargs. + id: The unique identifier for the data point. + gt: The ground truth for the evaluation function. + """ if response_desc is None: response_desc = "Output of EvalFnToTextLoss." @@ -159,19 +130,26 @@ def forward( eval_inputs[k] = v.eval_input score: float = self.eval_fn(**eval_inputs) - # Create a parameter - # TODO: improve the readability of the input and response - eval_param: Parameter = Parameter( + eval_param: Parameter = OutputParameter( name=self.name + "_output", data=score, requires_opt=True, role_desc=response_desc, score=score, param_type=ParameterType.LOSS_OUTPUT, + data_id=id, ) + eval_param.set_gt(gt) eval_param.set_predecessors(predesessors) + eval_param.trace_forward_pass( + input_args=kwargs, + full_response=score, + id=self.id, + name=self.name, + ) log.info(f"EvalFnToTextLoss: Input: {kwargs}, Output: {eval_param}") + # extract ground truth from eval_inputs, anything eval_param.set_grad_fn( BackwardContext( backward_fn=self.backward, @@ -180,6 +158,8 @@ def forward( eval_fn_desc=self.eval_fn_desc, kwargs=kwargs, metadata=metadata, + ground_truth=gt, + input=input, ) ) return eval_param @@ -207,14 +187,18 @@ def set_backward_engine( @staticmethod def _backward_through_one_predecessor( pred: Parameter, - inputs_string: str, + kwargs: Dict[str, Parameter], response: Parameter, eval_fn_desc: str, backward_engine: "BackwardEngine", + ground_truth: object = None, is_intermediate_node: bool = False, # if the node is an intermediate node in the backpropagation chain metadata: Dict[str, str] = None, + input: Dict[str, object] = None, # system input ): if not pred.requires_opt: + if response.score is not None: + pred.set_score(response.score) log.debug( f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." 
) @@ -227,9 +211,7 @@ def _backward_through_one_predecessor( log.info( f"EvalFnToTextLoss: Gradient already computed for {pred.role_desc} with respect to {response.role_desc}" ) - # print( - # f"Gradient already computed for {pred.role_desc} with respect to {response.role_desc}" - # ) + return if backward_engine is None: @@ -242,13 +224,20 @@ def _backward_through_one_predecessor( instruction_str, objective_str = None, None + # convert kwargs to key, (value, type(eval_input)) + + inputs = {} + for k, v in kwargs.items(): + inputs[k] = (v.get_param_info(), str(type(v.eval_input))) + # response information conversation_str = Prompt( LOSS_CONVERSATION_TEMPLATE_STRING, prompt_kwargs={ - "input_str": inputs_string, + "system_question": input, + "inputs": inputs, "eval_fn_desc": eval_fn_desc, - "response_value": response.data, + "response_value": response.get_prompt_data(), "metadata": json.dumps(metadata) if metadata else None, }, )() @@ -263,9 +252,7 @@ def _backward_through_one_predecessor( instruction_str = Prompt( conv_ins_template, prompt_kwargs={ - "variable_desc": pred.role_desc, - "variable_name": pred.name, - "variable_value": pred.data, + "variable": pred.get_param_info(), "conversation_str": conversation_str, }, )() @@ -291,40 +278,50 @@ def _backward_through_one_predecessor( gradient_value: GeneratorOutput = backward_engine( prompt_kwargs=backward_engine_prompt_kwargs ) - # gradient_prompt = backward_engine.get_prompt(**backward_engine_prompt_kwargs) + gradient_prompt = backward_engine.get_prompt(**backward_engine_prompt_kwargs) + # print(f"Backward engine prompt: {gradient_prompt}") gradient_value_data = ( gradient_value.data or backward_engine.failure_message_to_optimizer( gradient_response=gradient_value ) ) - # print(f"gradient_prompt: {gradient_prompt}") - # gradient_value_data = response.data.to_yaml() + + gradient_value_data = ( + f"expected answer: {ground_truth},\n Feedback: {gradient_value_data}" + ) + # print(f"gradient_value_data: {gradient_value_data}") log.debug(f"EvalFnToTextLoss: Gradient for {pred}: {gradient_value_data}") # score should be passed to grad - gradient_param = Parameter( - name=f"{response.name}_to_{pred.name}_grad", + gradient_param = Gradient( data=gradient_value_data, - requires_opt=True, - # gradient_prompt=gradient_prompt, - role_desc=f"Feedback for {pred.role_desc}", + data_id=response.data_id, score=response.data, - from_response_id=response.id, - param_type=ParameterType.GRADIENT, + from_response=response, + to_pred=pred, ) - pred.add_gradient(gradient_param) - pred.gradients_context[gradient_param] = GradientContext( - context=conversation_str, - response_desc=response.role_desc, - variable_desc=pred.role_desc, + gradient_param.add_prompt(gradient_prompt) + gradient_param.add_context( + GradientContext( + input_output=conversation_str, + response_desc=response.role_desc, + variable_desc=pred.role_desc, + # input=input, + # ground_truth=ground_truth, + ) ) + pred.add_gradient(gradient_param) # backward the end to end score # TODO: not really useful - pred.set_score(response.data) - print(f"setting pred name {pred.name} score to {response.data}") + if response.score is not None: + pred.set_score(response.score) + pred.set_gt(ground_truth) + printc(f"pred: {pred.eval_input}, gt: {ground_truth}") + # print(f"setting pred name {pred.name} score to {response.data}") + # print(f"gradient_param: {pred.gradients}") # TODO: reduce meta @@ -333,10 +330,12 @@ def backward( response: Parameter, eval_fn_desc: str, kwargs: Dict[str, Parameter], + 
ground_truth: object = None, backward_engine: Optional[ "BackwardEngine" ] = None, # only needed for text prompt optimization metadata: Dict[str, str] = None, + input: Dict[str, object] = None, ): r"""Ensure to set backward_engine for the text prompt optimization. It can be None if you are only doing demo optimization and it will not have gradients but simply backpropagate the score. @@ -351,30 +350,29 @@ def backward( log.info(f"response_gradient_context: {response_gradient_context}") # go through all child parameters - if backward_engine and not response.backward_engine_disabled: - # Convert all input arguments to string - inputs_string = "\n\n".join( - [ - f"({k}) (role: {v.role_desc}), data: {v.data}, input_to_eval_fn: {v.eval_input}, data_type: {type(v.eval_input)}" - for k, v in kwargs.items() - ] - ) - for pred in children_params: - if not pred.requires_opt: - log.debug( - f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + if backward_engine: + if not response.backward_engine_disabled: + for pred in children_params: + if not pred.requires_opt: + log.debug( + f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + ) + continue + + self._backward_through_one_predecessor( + pred, + kwargs, + response, + eval_fn_desc, + backward_engine, + ground_truth=ground_truth, + is_intermediate_node=is_intermediate_node, + metadata=metadata, + input=input, ) - continue - - self._backward_through_one_predecessor( - pred, - inputs_string, - response, - eval_fn_desc, - backward_engine, - is_intermediate_node, - metadata, - ) + else: # recursively disable backward for all children + for pred in children_params: + pred.backward_engine_disabled = True # backward for the score for the demo for pred in children_params: # if not pred.requires_opt: @@ -382,11 +380,11 @@ def backward( # f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." # ) # continue - if not isinstance(response.data, float): + if not (isinstance(response.data, float) or isinstance(response.data, int)): raise TypeError( f"EvalFnToTextLoss: response.data must be a float. Got {type(response.data)}." ) - pred._score = response.data + pred.score = response.data from adalflow.utils.logger import printc printc( @@ -473,10 +471,8 @@ def parse_integer_answer(answer: str, only_first_line: bool = False): ) # model.set_mock_output(mock_output_data="4") model.train() - print(f"model.train: {model.training}") y: Parameter = model(prompt_kwargs={"input_str": x}) - print(f"y: {y}") loss = eval_fn_to_text_loss( { @@ -489,9 +485,7 @@ def parse_integer_answer(answer: str, only_first_line: bool = False): ), } ) - print(f"loss: {loss}") loss.backward() - print(loss.to_dict()) assert len(loss.predecessors) == 2 assert len(y.predecessors) == 2 dot = loss.draw_graph(add_grads=True, filepath="real_data") diff --git a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py index 219c299a..8f8c6b28 100644 --- a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py +++ b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py @@ -18,6 +18,8 @@ from adalflow.core.base_data_class import DataClass from adalflow.tracing.decorators import trace_generator_states +from adalflow.utils.logger import printc +from adalflow.core.types import GeneratorOutput if TYPE_CHECKING: @@ -43,11 +45,31 @@ class HistoryPrompt(DataClass): # {{loop.index}}. 
{{failed_proposal}} # {% endfor %} # {% endif %} + TEXT_GRAD_DESC_TEMPLATE = r""" {{optimizer_system_prompt}} - +You are {{steps}} steps since your successful improvement. +{# Variable and peers info #} + +{{variable_and_peers_info}} + +{# system trainable variables #} +{% if system_variables %} + +The target variable is used together with these system variables besides of its peers: +{% for system_variable in system_variables %} +{{loop.index}}. +Name: {{system_variable.name}} +Type: {{system_variable.param_type}} +Description: {{system_variable.role_desc}} +WILL_BE_OPTIMIZED: {{system_variable.requires_opt}} +Vaule: {{system_variable.prompt_data}} +{% endfor %} +Strategically plan the role of each system variable to collaborate with each other for final correct answer. + +{% endif %} {# OPRO past history #} {% if past_history %} @@ -55,22 +77,21 @@ class HistoryPrompt(DataClass): {% for history in past_history %} {{loop.index}}. {{history}} {% endfor %} -IMPORTANT: Your goal is to generate new variable values that score higher than all previous iterations. +IMPORTANT: Your goal is to generate new variable that score higher than all past iterations. +{# Momentum #} +{% if failed_proposals %} +Here are the most recent failed proposals: +{% for failed_proposal in failed_proposals %} +{{loop.index}}. {{failed_proposal}} +{% endfor %} +{% endif %} +You MUST Try a different approach from the failed proposals. {% endif %} Here are the context and feedback for the variable: {{variable_grad}} -{# Momentum #} -{% if past_values %} -Here are the past iterations of this variable: - -{{past_values}} - -Similar feedbacks across different steps suggests that the modifications to the variable are insufficient. -If this is the case, please make more significant changes to the variable. -{% endif %} {# Constraints #} {% if constraint_text %} You must follow the following constraints: @@ -81,12 +102,13 @@ class HistoryPrompt(DataClass): You must base on the following examples when modifying the {{variable_desc}}: {{in_context_examples}} {% endif %} -YOU MUST ENSURE the new variable shares the same intent as the original variable. -You can either rephrase the initial variable, or add more specific instructions based on the feedback. -You can not change the variable to only fit on one sample if the batch size is larger than 1. """ +# YOU MUST ENSURE the new variable shares the same intent as the original variable. +# You can either rephrase the initial variable, or add more specific instructions based on the feedback. +# You can not change the variable to only fit on one sample if the batch size is larger than 1. + # optimizer system prompt # Tips: @@ -94,28 +116,147 @@ class HistoryPrompt(DataClass): # 2. Add new elements to address specific feedback. # 3. Be creative and present the variable differently. # Provide only the new variable value between {{new_variable_start_tag}} and {{new_variable_end_tag}} tags. -OPTIMIZER_SYSTEM_PROMPT = r""" -You are part of an optimization system that refines existing variable based on feedback generated on a batch of input data. +# OPTIMIZER_SYSTEM_PROMPT = r""" +# You are part of an optimization system that refines existing variable based on feedback generated on a batch of input data. + +# 1. Address the concerns raised in the feedback while preserving positive aspects. +# 3. Observe past performance patterns when provided and to keep the good quality. +# 4. Consider the variable in the context of its peers if provided. 
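The failed-proposal momentum in the template above implies a propose/validate/accept-or-revert loop on the trainer side; a hedged sketch of that control flow (the helper `apply_validation_result` is illustrative, while the optimizer methods match ones added later in this diff):

from adalflow.optim.text_grad.tgd_optimizer import TGDOptimizer


def apply_validation_result(optimizer: TGDOptimizer, improved: bool) -> None:
    # Called after optimizer.propose() and a validation run: either keep the
    # proposal or record it as a failed proposal and roll back.
    if improved:
        optimizer.step()
        optimizer.reset_steps_from_last_improvement()
    else:
        optimizer.add_failed_proposal()  # surfaces in the failed-proposals section of the prompt
        optimizer.revert()
        optimizer.increment_steps_from_last_improvement()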
+# FYI: +# - If a peer will be optimized itself, do not overlap with its scope. +# - Otherwise, you can overlap if it is necessary to address the feedback. + +# {{output_format_str}} + +# YOU MUST ENSURE the new variable shares the same intent as the original variable. +# You can either rephrase the initial variable when no specific feedback found, or add more specific instructions based on the feedback. +# You can not change the variable to only fit on one sample if the batch size is larger than 1. +# {% if instruction_to_optimizer %} +# YOU Should consider user instruction: {{instruction_to_optimizer}} +# {% endif %} +# """ -1. Address the concerns raised in the feedback while preserving positive aspects. -3. Observe past performance patterns when provided and to keep the good quality. -4. Consider the variable in the context of its peers if provided. - FYI: - - If a peer will be optimized itself, do not overlap with its scope. - - Otherwise, you can overlap if it is necessary to address the feedback. +# OPTIMIZER_SYSTEM_PROMPT = r""" +# You are a prompt engineer who excels at refining existing prompts used in LLM in-context learning. -{{output_format_str}} +# Your task: +# - **Improve a variable** based on feedback from a batch of input data points. + +# ### Context and Requirements +# 1. **Variable Usage** +# The variable is either an input or output of a functional component. The component schema will be provided. +# If the same DataID has multiple gradients, it indicates that this component/variable is called repeatedly in a compound system (with a cycle) in the same order that it appears in the gradient list. + +# 2. **Key Objectives** +# 1. **Address Feedback**: Resolve concerns raised in the feedback while preserving the positive aspects of the original variable. +# 2. **Peer Awareness**: +# - If a peer variable will be optimized separately, do not overlap with its scope. +# 3. **Consistency with Past Performance**: Observe patterns from previous iterations and retain beneficial qualities. +# 4. **Be Creative** in your improvements. + +# 3. **Additional Notes** +# - Add new elements to address each specific piece of feedback. +# - Rephrase or eliminate unnecessary words for clarity. + +# {% if instruction_to_optimizer %} +# **User Instructions**: {{instruction_to_optimizer}} +# {% endif %} +# +# {{output_format_str}} +# +# """ +# +# Here is a summary on the task pipeline you are optimizing: +# retriever: retrieves relevant documents for the question. (Not trainable, you have no control) +# LLM: Answer questions by reading the context and reason the best answer. +# +OPTIMIZER_SYSTEM_PROMPT = r""" +You are an excellent prompt engineer who works on optimizing a compound LLM system with in-context learning. +Your task is to improve a variable based on feedback from a batch of input data points. + +The variable is either input or output of a functional component where the component schema will be provided. +If the same DataID has multiple gradients, it means this component/variable is called multiple times in the compound system(with a cycle) in the same order as it appears in the gradient list. + +When the LLM system is complicated with multiple system variables, you need to strategize the role of each +### Your Responsibilities: +1. **Address Feedback**: Resolve concerns raised in the feedback while preserving the positive aspects of the original variable. +2. Observe past performance patterns (when available) to retain good qualities in the variable. +3. 
**System Awareness**: When other system variables are given, ensure you understand how this variable works in the whole system. + You have a choice to not update a variable if it is not responsible for the error by setting `update: false` and `proposed_variable: None`. +You MUST not update variable when there is no clear error indicated in a multi-component system. +4. **Peer Awareness**: This variable works together with Peer variables, ensure you are aware of their roles and constraints. +5. Be Creative. If adding new elements, be concise. + +### Your available solutions. +1. Add new elements to address each specific feedback. +2. Add demonstration (e.g., input-reasoning-answer) for tasks that require strong reasoning skills. +3. Rephrase(for more clarity) to address the feedback. +4. You can also eliminate unnecessary words to improve clarity. + +### prompt engineering practices: +1. Set Context and Role: Establish a specific identity or domain expertise for the AI to guide style, knowledge, and constraints. +2. Demonstration: Construct input-reasoning-answer example especially for tasks that require strong reasoning skills. +3. Be Specific and Clear: Clearly define instructions, desired format, and constraints to ensure accurate and relevant outputs. +4. Leverage Constraints and Formatting: Explicitly direct how the answer should be structured (e.g., bullet points, tables, or tone). +5. Self-Consistency / Verification Prompts: Prompt the model to check its own logic for errors, inconsistencies, or missing details. + +{{output_format_str}} -Tips: -1. Eliminate unnecessary words or phrases. -2. Add new elements to address specific feedback. -3. Be creative and present the variable differently. {% if instruction_to_optimizer %} -4. {{instruction_to_optimizer}} +**Additional User Instructions**: {{instruction_to_optimizer}} {% endif %} """ +# +# Here is a summary on the task pipeline you are optimizing: +# query_generator(a trainable LLM): "generates a sub-query based on the initial query" +# retriever: "retrieves relevant documents based on the sub-query" +# llm(a trainable LLM): "Answer a question with available context with exact answer extracted from the context" +# duplicator: "functional part to depulicate the documents, no trainable part, no need to have feedback or to be optimized." + +# The query_generator+ retriever is called twice in the pipeline as the question requires two sub-queries. +# And the retrieved documents are deduplicated and combined to form the final context. +# The final context is then passed to the llm to generate the answer where we want to use the exact phrase from the context. +# +# OPTIMIZER_SYSTEM_PROMPT = r""" +# You are a prompt engineer exels at refining existing prompts used in LLM in-context learning. +# Your task is to improve a variable based on feedback from a batch of input data points. + +# The variable is either input or output of a functional component where the component schema will be provided. +# If the same DataID has multiple gradients, it means this component/variable is called multiple times in the compound system(with a cycle) in the same order as it appears in the gradient list. + +# ### YOU MUST ENSURE: +# 1. **Address Feedback**: Resolve concerns raised in the feedback while preserving the positive aspects of the original variable. +# 2. **Peer Awareness**: +# - If a peer will be optimized itself, do not overlap with its scope. +# 3. Observe past performance patterns (when available) to retain good qualities in the variable. +# 4. 
Be Creative. +# 5. The new variable MUST have better performance than all previous iterations. + +# ### NOTES: +# 1. Add new elements to address each specific feedback. +# 2. rephrase to address the feedback. +# 3. You can also eliminate unnecessary words to improve clarity. + +# ### Common prompt engineering practices: +# 1. Set Context and Role: Establish a specific identity or domain expertise for the AI to guide style, knowledge, and constraints. +# 2. Zero-Shot vs. Few-Shot Prompting: Decide whether to provide examples (few-shot) or none (zero-shot) to shape responses and format. +# 3. Be Specific and Clear: Clearly define instructions, desired format, and constraints to ensure accurate and relevant outputs. +# 4. Leverage Constraints and Formatting: Explicitly direct how the answer should be structured (e.g., bullet points, tables, or tone). +# 5. Self-Consistency / Verification Prompts: Prompt the model to check its own logic for errors, inconsistencies, or missing details. + +# {% if instruction_to_optimizer %} +# **More Instructions**: {{instruction_to_optimizer}} +# {% endif %} + +# {{output_format_str}} +# """ + +# When no feedback is provided(high batch performance), you rephrase the variable. +### Tips: +# 1. Patterns like "think step by step" helps the model reason better. You should try to maintain the chain of thought. + @dataclass class Instruction(DataClass): @@ -134,12 +275,27 @@ class Instruction(DataClass): @dataclass class TGDData(DataClass): - reasoning: str = field(metadata={"desc": "Why the variable is proposed this way"}) - proposed_variable: str = field(metadata={"desc": "The proposed variable"}) + reasoning: str = field( + metadata={ + "desc": "Which solution did you choose, which prompt engineering technique did you use? Why? 
Be Concise (maximum 2 sentences)" + } + ) + update: bool = field( + default=True, + metadata={ + "desc": "Depending on the feedback, update the variable if it is responsible for the error, else, keep it" + }, + ) + proposed_variable: str = field( + metadata={ + "desc": "The proposed variable, ignoring the field when update: false" + }, + default=None, + ) @dataclass -class TGDOptimizerTrace: +class TGDOptimizerTrace(DataClass): api_kwargs: Dict[str, Any] = field( metadata={ "desc": "The api_kwargs for components like Generator and Retriever that pass to the model client" @@ -174,7 +330,7 @@ class TGDOptimizer(TextOptimizer): params: ParamsT constraints: List[str] params_history: Dict[str, List[HistoryPrompt]] = {} # id to history - # failed_proposals: Dict[str, List[HistoryPrompt]] = {} # only need the value + failed_proposals: Dict[str, List[HistoryPrompt]] = {} # only need the value def __init__( self, @@ -187,7 +343,8 @@ def __init__( in_context_examples: List[str] = None, # TODO: in-context examples num_gradient_memory: int = 0, # TODO: gradient memory and momentum, for now it is not useful max_past_history: int = 3, - # max_failed_proposals: int = 3, + max_failed_proposals: int = 2, + steps_from_last_improvement: int = 0, ): from adalflow.core.generator import Generator from adalflow.core import Prompt @@ -205,7 +362,13 @@ def __init__( prompt_kwargs={ # "new_variable_start_tag": new_variable_tags[0], # "new_variable_end_tag": new_variable_tags[1], - "output_format_str": self.output_parser.get_output_format_str(), + "output_format_str": """Your output should be formatted as a standard JSON instance with the following schema: +``` +{ + "reasoning": "Why the variable is proposed this way (str) (required)", + "proposed_variable": "The proposed variable (str) (required)" +} +```""" }, ) self.variable_and_peers_info = Prompt( @@ -227,12 +390,13 @@ def __init__( ) self.max_past_history = max_past_history - # self.max_failed_proposals = max_failed_proposals + self.max_failed_proposals = max_failed_proposals + self.steps_from_last_improvement = steps_from_last_improvement # initate the past history for each parameter for param in self.params: self.params_history[param.id] = [] - # self.failed_proposals[param.id] = [] + self.failed_proposals[param.id] = [] @property def constraint_text(self): @@ -248,6 +412,12 @@ def constraint_text(self): ] return "\n".join(constraints_ordered) + def increment_steps_from_last_improvement(self): + self.steps_from_last_improvement += 1 + + def reset_steps_from_last_improvement(self): + self.steps_from_last_improvement = 0 + def add_score_to_params(self, val_score: float): for param in self.params: self.add_score_to_current_param(param.id, param, val_score) @@ -287,39 +457,39 @@ def render_history(self, param_id: str) -> List[str]: history.to_yaml(exclude=["id"]) for history in self.params_history[param_id] ] - # def add_failed_proposal(self): - # """Save a copy of the current value of the parameter in the failed proposals.""" - # for param in self.params: - # failed_proposal = HistoryPrompt( - # id=param.id, - # value=param.data, - # eval_score=None, - # ) - # self.failed_proposals[param.id].append(failed_proposal) - # if len(self.failed_proposals[param.id]) > self.max_failed_proposals: - # for _ in range( - # len(self.failed_proposals[param.id]) - self.max_failed_proposals - # ): - # self.failed_proposals[param.id].pop() - # # if param_id not in self.failed_proposals: - # # self.failed_proposals[param_id] = [] - # # failed_proposal = HistoryPrompt( - # # 
id=param_id, - # # value=value, - # # eval_score=None, - # # ) - # # self.failed_proposals[param_id].append(failed_proposal) - # # if len(self.failed_proposals[param_id]) > self.max_failed_proposals: - # # for _ in range(len(self.failed_proposals[param_id]) - self.max_failed_proposals): - # # self.failed_proposals[param_id].pop() - - # def render_failed_proposals(self, param_id: str) -> List[str]: - # if param_id not in self.failed_proposals: - # return [] - # return [ - # history.to_yaml(exclude=["id", "eval_score"]) - # for history in self.failed_proposals[param_id] - # ] + def add_failed_proposal(self): + """Save a copy of the current value of the parameter in the failed proposals.""" + for param in self.params: + failed_proposal = HistoryPrompt( + id=param.id, + value=param.data, + eval_score=None, + ) + self.failed_proposals[param.id].append(failed_proposal) + if len(self.failed_proposals[param.id]) > self.max_failed_proposals: + for _ in range( + len(self.failed_proposals[param.id]) - self.max_failed_proposals + ): + self.failed_proposals[param.id].pop() + # if param_id not in self.failed_proposals: + # self.failed_proposals[param_id] = [] + # failed_proposal = HistoryPrompt( + # id=param_id, + # value=value, + # eval_score=None, + # ) + # self.failed_proposals[param_id].append(failed_proposal) + # if len(self.failed_proposals[param_id]) > self.max_failed_proposals: + # for _ in range(len(self.failed_proposals[param_id]) - self.max_failed_proposals): + # self.failed_proposals[param_id].pop() + + def render_failed_proposals(self, param_id: str) -> List[str]: + if param_id not in self.failed_proposals: + return [] + return [ + history.to_yaml(exclude=["id", "eval_score"]) + for history in self.failed_proposals[param_id] + ] # TODO: optimize with adalflow template for better readability def get_gradient_memory_text(self, param: Parameter) -> str: @@ -333,15 +503,23 @@ def get_gradient_memory_text(self, param: Parameter) -> str: def _get_user_prompt_kwargs(self, param: Parameter) -> Dict[str, str]: + system_params = [ + p.get_param_info() + for p in self.params + if p.id != param.id and p not in param.peers + ] + peers_params = [p.get_param_info() for p in param.peers] variable_and_peer_info = self.variable_and_peers_info.call( - variable=param.get_param_info(), peers=param.peers # param.peers + variable=param.get_param_info(), peers=peers_params ) + variable_grad = param.get_gradients_component_schema(skip_correct_sample=False) + user_prompt_kwargs = { "variable_and_peers_info": variable_and_peer_info, - "variable_grad": param.get_gradient_and_context_text( - skip_correct_sample=True - ), + "variable_grad": variable_grad, # param.get_gradient_and_context_text( + # skip_correct_sample=False + # ), # constraints "constraint_text": self.constraint_text if self.do_constrained else None, # in-context examples @@ -361,11 +539,13 @@ def _get_user_prompt_kwargs(self, param: Parameter) -> Dict[str, str]: self.render_history(param.id) if self.max_past_history else None ), # failed proposals - # "failed_proposals": ( - # self.render_failed_proposals(param.id) - # if self.max_failed_proposals - # else None - # ), + "failed_proposals": ( + self.render_failed_proposals(param.id) + if self.max_failed_proposals + else None + ), + "system_variables": system_params, + "steps": self.steps_from_last_improvement, } return user_prompt_kwargs @@ -386,7 +566,7 @@ def propose(self): if self.proposing: raise ValueError("Already proposing a value.") - print("Proposing a new value.") + printc("Proposing a new 
value.", color="magenta") # no cache so that new proposal can be made no_cache = True @@ -411,27 +591,38 @@ def propose(self): **user_prompt_kwargs, } # turn off cache - response = self.llm_optimizer.call( - prompt_kwargs=prompt_kwargs, use_cache=not no_cache - ) + try: + response: GeneratorOutput = self.llm_optimizer.call( + prompt_kwargs=prompt_kwargs, use_cache=not no_cache + ) + except Exception as e: + printc(f"Error in the optimizer: {e}", color="red") + raise e + if not isinstance(response, GeneratorOutput): + raise TypeError(f"Wrong response type: {type(response)}") + prompt_str = self.llm_optimizer.get_prompt(**prompt_kwargs) log.debug(f"TGD LLM optimizer prompt: {prompt_str}") + printc(f"TGD LLM optimizer prompt:: {prompt_str}", color="blue") proposed_data: TGDData = ( response.data - if response.data + if response.data is not None else TGDData( - reasoning="No reasoning", proposed_variable=response.raw_response + reasoning="No reasoning", + proposed_variable=response.raw_response, + update=False, ) ) + printc(f"Response from the optimizer: {response}", color="blue") + log.info(f"Response from the optimizer: {response}") - # extract the improved variable from the response - # TODO: make it more robust - # improved_variable = extract_new_variable(proposed_data) - improved_variable = proposed_data.proposed_variable - param.propose_data(improved_variable) + if not proposed_data.update: + printc(f"No update is required for {param.name}", color="yellow") + param.propose_data(param.data) + else: + improved_variable = proposed_data.proposed_variable + param.propose_data(improved_variable) param.trace_optimizer(api_kwargs=prompt_str, response=response) - print(f"prompt_str: {prompt_str}") - print(f"response: {response}") if self.do_gradient_memory: self.update_gradient_memory(param) self.proposing = True @@ -490,4 +681,4 @@ def step(self): "past_history": histories, } response = prompt(**prompt_kwargs) - print(response) + # print(response) diff --git a/adalflow/adalflow/optim/trainer/adal.py b/adalflow/adalflow/optim/trainer/adal.py index cea31760..f65e7822 100644 --- a/adalflow/adalflow/optim/trainer/adal.py +++ b/adalflow/adalflow/optim/trainer/adal.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from adalflow.core.model_client import ModelClient - from adalflow.core.generator import Generator, BackwardEngine + from adalflow.core.generator import Generator, BackwardEngine, BackwardPassSetup from adalflow.optim.parameter import Parameter from adalflow.core.component import Component @@ -18,6 +18,7 @@ from adalflow.optim.loss_component import LossComponent from adalflow.optim.types import PromptData from adalflow.eval.base import EvaluationResult +from adalflow.optim.grad_component import GradComponent2, GradComponent from adalflow.optim.optimizer import DemoOptimizer, TextOptimizer @@ -32,6 +33,8 @@ class AdalComponent(Component): 1. Organize all parts for training a task pipeline in one place. 2. Help with debugging and testing before the actual training. 3. Adds multi-threading support for training and evaluation. + + It has no need on call, forward, bicall, or __call__, so we need to overwrite the base ones. 
""" task: Component @@ -85,7 +88,7 @@ def _get_param_values(self) -> List[PromptData]: return [ PromptData(p.id, p.name, p.data, p.requires_opt) for p in self.task.parameters() - # if p.requires_opt + if p.requires_opt ] def prepare_task(self, sample: Any, *args, **kwargs) -> Tuple[Callable, Dict]: @@ -172,7 +175,7 @@ def configure_optimizers(self, *args, **kwargs) -> List[Optimizer]: return self._demo_optimizers + self._text_optimizers def configure_backward_engine(self, *args, **kwargs): - r"""Configure a backward engine for all generators in the task for bootstrapping examples.""" + r"""Configure a backward engine for all GradComponent in the task for bootstrapping examples.""" # check if backward engine is already configured if self.backward_engine: log.warning("Backward engine is already configured.") @@ -185,8 +188,13 @@ def configure_backward_engine(self, *args, **kwargs): self.configure_backward_engine_helper( model_client=self.backward_engine_model_config["model_client"], model_kwargs=self.backward_engine_model_config["model_kwargs"], + backward_pass_setup=kwargs.get("backward_pass_setup", None), ) + def disable_backward_engine(self): + r"""Disable the backward engine for all GradComponent in the task.""" + self.disable_backward_engine_helper() + # def configure_backward_engine(self, *args, **kwargs): # raise NotImplementedError("configure_backward_engine method is not implemented") @@ -402,8 +410,6 @@ def train_step(self, batch, batch_idx, num_workers: int = 2) -> List: if isinstance(y_pred, Parameter): raise ValueError(f"y_pred_{i} is a Parameter, {y_pred}") - print(f"y_pred: {y_pred})") - assert ( y_pred.id == sample.id ), f"ID mismatch: {y_pred.id} != {sample.id}, type: {type(y_pred)}" @@ -480,9 +486,17 @@ def validation_step(self, batch, batch_idx, num_workers: int = 2) -> List: # TODO: let use decide which mode to be self.task.eval() self.task.use_teacher(mode=False) # ensure the teacher is not used - completed_y_preds, completed_samples, index_to_score = self.pred_step( - batch, batch_idx, num_workers, running_eval=True, min_score=minimum_score - ) + try: + completed_y_preds, completed_samples, index_to_score = self.pred_step( + batch, + batch_idx, + num_workers, + running_eval=True, + min_score=minimum_score, + ) + except Exception as e: + raise ValueError(f"Error in validation step: {e}") + if index_to_score: # compute score from index_to_score @@ -495,12 +509,15 @@ def validation_step(self, batch, batch_idx, num_workers: int = 2) -> List: avg_score=avg_score, per_item_scores=acc_list ) else: + try: - eval_results = self.evaluate_samples( - samples=completed_samples, - y_preds=completed_y_preds, - num_workers=num_workers, - ) + eval_results = self.evaluate_samples( + samples=completed_samples, + y_preds=completed_y_preds, + num_workers=num_workers, + ) + except Exception as e: + raise ValueError(f"Error in evaluation: {e}") return eval_results def loss_step( @@ -578,11 +595,29 @@ def configure_teacher_generator_helper( generator.set_teacher_generator(teacher_generator) print("Teacher generator configured.") + def disable_backward_engine_helper(self): + r"""Disable the backward engine for all generators in the task.""" + all_grads = self._find_all_grad_components() + for _, grad in all_grads: + if hasattr(grad, "disable_backward_engine") and callable( + getattr(grad, "disable_backward_engine", None) + ): + grad.disable_backward_engine() + print("Backward engine disabled for GradComponents") + + if not self.loss_fn: + raise ValueError("Loss function is not configured.") + + 
# configure it for loss_fn + if self.loss_fn: + self.loss_fn.disable_backward_engine() + def configure_backward_engine_helper( self, model_client: "ModelClient", model_kwargs: Dict[str, Any], template: Optional[str] = None, + backward_pass_setup: Optional["BackwardPassSetup"] = None, ): r"""Configure a backward engine for all generators in the task for bootstrapping examples.""" from adalflow.core.generator import BackwardEngine @@ -592,13 +627,18 @@ def configure_backward_engine_helper( model_kwargs=model_kwargs, template=template, ) + if backward_pass_setup is not None: + self.backward_engine.update_default_backward_pass_setup(backward_pass_setup) # set all generator's backward engine - all_generators = self._find_all_generators() - for _, generator in all_generators: - generator.set_backward_engine(self.backward_engine) - print("Backward engine configured for all generators.") + all_grads = self._find_all_grad_components() + for _, grad in all_grads: + if hasattr(grad, "set_backward_engine") and callable( + getattr(grad, "set_backward_engine", None) + ): + grad.set_backward_engine(self.backward_engine) + print("Backward engine configured for GradComponents") if not self.loss_fn: raise ValueError("Loss function is not configured.") @@ -654,6 +694,17 @@ def _find_all_generators(self) -> List[Tuple[str, "Generator"]]: log.debug(f"all_generators: {all_generators}") return all_generators + def _find_all_grad_components(self) -> List[Tuple[str, GradComponent2]]: + r"""Find all generators automatically from the task.""" + # from adalflow.core import Generator + + all_grads: List[Tuple[str, GradComponent2]] = [] + for name, comp in self.task.named_components(): + if isinstance(comp, GradComponent2) or isinstance(comp, GradComponent): + all_grads.append((name, comp)) + log.debug(f"all_grads: {all_grads}") + return all_grads + def _auto_generator_callbacks(self, save_dir: str = "traces") -> List[str]: r"""Automatically generate callbacks.""" from adalflow.core.types import GeneratorOutput @@ -745,3 +796,15 @@ def _extra_repr(self): s = f"eval_fn: {self.eval_fn.__name__}, backward_engine: {self.backward_engine}, " s += f"backward_engine_model_config: {self.backward_engine_model_config}, teacher_model_config: {self.teacher_model_config}, text_optimizer_model_config: {self.text_optimizer_model_config}" return s + + def __call__(self, *args, **kwargs): + pass + + def bicall(self, *args, **kwargs): + pass + + def call(self, *args, **kwargs): + pass + + def forward(self, *args, **kwargs): + pass diff --git a/adalflow/adalflow/optim/trainer/trainer.py b/adalflow/adalflow/optim/trainer/trainer.py index 91a2fd16..5a7c7826 100644 --- a/adalflow/adalflow/optim/trainer/trainer.py +++ b/adalflow/adalflow/optim/trainer/trainer.py @@ -8,12 +8,15 @@ import numpy as np import uuid import time +from copy import copy from adalflow.core.component import Component from adalflow.optim.optimizer import Optimizer, DemoOptimizer, TextOptimizer if TYPE_CHECKING: from adalflow.optim.parameter import Parameter + from adalflow.core.generator import BackwardPassSetup + from adalflow.optim.types import ( PromptData, TrainerResult, @@ -24,7 +27,8 @@ from adalflow.optim.trainer.adal import AdalComponent from adalflow.optim.text_grad.ops import sum_ops -from adalflow.utils import save_json, load_json +from adalflow.utils import save_json +from adalflow.utils.file_io import load_standard_json from adalflow.utils.cache import hash_text_sha1 from adalflow.utils.data import DataLoader from adalflow.utils.logger import printc @@ -81,6 
+85,7 @@ class Trainer(Component): optimization_order: Literal["sequential", "mix"] = ( "sequential" # zero-shot first, bootstrap second ) + sequential_order: List[str] = ["text", "demo"] max_steps: int optimizer: Optimizer = None ckpt_path: Optional[str] = None @@ -91,10 +96,13 @@ class Trainer(Component): batch_val_score_threshold: Optional[float] = ( 1.0 # when acc_score >= this threshold, skip this batch ) + correct_val_score_threshold: Optional[float] = ( + 0.5 # when acc_score >= this threshold, it is considered as correct sample + ) max_error_samples: Optional[int] = 2 max_correct_samples: Optional[int] = 2 debug: bool = False - sequential_order: List[str] = ["text", "demo"] + random_seed: int = None def __init__( self, @@ -106,6 +114,7 @@ def __init__( num_workers: int = 4, ckpt_path: str = None, batch_val_score_threshold: Optional[float] = 1.0, + correct_val_score_threshold: Optional[float] = 0.5, max_error_samples: Optional[int] = 2, max_correct_samples: Optional[int] = 2, max_proposals_per_step: int = 5, @@ -140,6 +149,7 @@ def __init__( self.val_dataset = val_dataset self.test_dataset = test_dataset self.batch_val_score_threshold = batch_val_score_threshold + self.correct_val_score_threshold = correct_val_score_threshold self.max_error_samples = max_error_samples self.max_correct_samples = max_correct_samples self.max_proposals_per_step = max_proposals_per_step @@ -147,10 +157,12 @@ def __init__( self._subset_effect_count = {"pass": 0, "fail": 0} self._fullset_effect_count = {"pass": 0, "fail": 0} self._valset_effect_count = {"pass": 0, "fail": 0} + self._demo_valset_effect_count = {"pass": 0, "fail": 0} self._effective_measure = { "subset": self._subset_effect_count, "fullset": self._fullset_effect_count, "valset": self._valset_effect_count, + "demo_valset": self._demo_valset_effect_count, } self._raw_shots = raw_shots self._bootstrap_shots = bootstrap_shots @@ -165,8 +177,13 @@ def __init__( ) self.sequential_order = sequential_order + def set_random_seed(self, seed: int): + self.random_seed = seed + # TODO: need to support checkpoint resume too! - def diagnose(self, dataset: Any, split: str = "train"): + def diagnose( + self, dataset: Any, split: str = "train", resume_from_ckpt: str = None + ): """Run an evaluation on the trainset to track all error response, and its raw response using AdaplComponent's default configure_callbacks Args: dataset: Any: Dataset to evaluate @@ -187,9 +204,13 @@ def diagnose(self, dataset: Any, split: str = "train"): print(diagnose) """ # 1. track all intermediate outputs + if resume_from_ckpt: + self.resume_params_from_ckpt(resume_from_ckpt) + self.adaltask.eval() if not self.ckpt_path: trainer_state = self.gather_trainer_states() self.prep_ckpt_file_path(trainer_state) + printc(f"Checkpoint path: {self.ckpt_path}") save_path = os.path.join(self.ckpt_path, f"diagnose_{split}") logger.debug(f"Save diagnose to {save_path}") # One generator will be one file, all stats are in logger_metadata.json @@ -219,10 +240,15 @@ def diagnose(self, dataset: Any, split: str = "train"): paths: Dict[str, List[str]] = {"Log": log_paths, "Diagnose": [], "Stats": []} # reorder the samples based on the score + stats_list: List[Dict] = [] for log_path in log_paths: + stats_list = [] file_name = os.path.basename(log_path) logger.debug(f"Loading log file: {file_name}") logs = load_jsonl(log_path) + if not logs or len(logs) == 0: + print(f"Log file {log_path} is empty. 
This llm is not called at all.") + continue try: logs_dict = {log["output"]["id"]: log for log in logs} except KeyError: @@ -239,7 +265,7 @@ def diagnose(self, dataset: Any, split: str = "train"): diagnose_file = os.path.join(log_dir, diagnose_filename) diagnose_items = [] - stats_list: List[Dict] = [] + for i, log in enumerate(sorted_logs): if log["score"] < 0.5: diagnose_item = { @@ -349,6 +375,33 @@ def debug_report( ) print(Fore.CYAN + "\n===================================================\n") + def resume_params_from_ckpt(self, ckpt_file: str): + """Resume the parameters from the checkpoint file""" + dict_data = load_standard_json(ckpt_file) + # find the highest val score + trainer_results: TrainerResult = TrainerResult.from_dict(dict_data) + # restore the prompts to the adaltask + val_scores = [] + # test_scores = [] + for step in trainer_results.step_results: + if step.val_score: + val_scores.append(step.val_score) + # if step.test_score: + # test_scores.append(step.test_score) + result_from_step = 0 + # if test_scores: + # result_from_step = test_scores.index(max(test_scores)) + if val_scores: + printc(f"Val scores: {val_scores}") + result_from_step = val_scores.index(max(val_scores)) + prompts: List[PromptData] = trainer_results.step_results[ + result_from_step + ].prompt + + print(f"Restoring prompts: {prompts[0]}") + + self.adaltask._set_param_values(prompts) + def fit( self, *, @@ -364,6 +417,7 @@ def fit( resume_from_ckpt: Optional[ str ] = None, # TODO: have a more comprehensive ckpt loading in the future + backward_pass_setup: Optional["BackwardPassSetup"] = None, ) -> Tuple[str, TrainerResult]: r""" train_loader: An iterable or collection of iterables specifying training samples. @@ -371,6 +425,7 @@ def fit( Returns: Tuple[str, TrainerResult]: Checkpoint file and the TrainerResult object """ + start_time = time.time() debug = debug or self.debug @@ -395,7 +450,10 @@ def fit( batch_size = self.train_batch_size train_loader = DataLoader( - train_dataset, batch_size=batch_size, shuffle=True + train_dataset, + batch_size=batch_size, + shuffle=True if not debug else False, + seed=self.random_seed, ) val_dataset = val_dataset or self.val_dataset test_dataset = test_dataset or self.test_dataset @@ -447,7 +505,9 @@ def fit( if len(self._get_trainable_text_params()) > 0: if self.adaltask.backward_engine is None: - self.adaltask.configure_backward_engine() + self.adaltask.configure_backward_engine( + backward_pass_setup=backward_pass_setup + ) else: print("No trainable text params to optimize") self.text_optimizers = [] @@ -460,20 +520,17 @@ def fit( starting_step = 0 if resume_from_ckpt: self.ckpt_file = resume_from_ckpt - dict_data = load_json(self.ckpt_file) + self.ckpt_path = os.path.dirname(self.ckpt_file) + dict_data = load_standard_json(self.ckpt_file) trainer_results: TrainerResult = TrainerResult.from_dict(dict_data) # restore the prompts to the adaltask val_scores = [] - test_scores = [] for step in trainer_results.step_results: if step.val_score: val_scores.append(step.val_score) - if step.test_score: - test_scores.append(step.test_score) result_from_step = 0 - if test_scores: - result_from_step = test_scores.index(max(test_scores)) - elif val_scores: + if val_scores: + printc(f"Val scores: {val_scores}") result_from_step = val_scores.index(max(val_scores)) prompts: List[PromptData] = trainer_results.step_results[ result_from_step @@ -484,15 +541,28 @@ def fit( self.adaltask._set_param_values(prompts) starting_step = len(trainer_results.steps) - 1 + else: + trainer_results 
= ( + self._pre_fit(val_dataset, test_dataset) + if trainer_results is None + else trainer_results + ) + if debug: print("Debugging mode") text_grad_debug_path, few_shot_demo_debug_path = None, None - if len(self.text_optimizers) > 0: + if ( + len(self.text_optimizers) > 0 + and len(self._get_trainable_text_params()) > 0 + ): text_grad_debug_path = self._fit_text_grads_one_step_for_debug( train_loader ) - if len(self.demo_optimizers) > 0: + if ( + len(self.demo_optimizers) > 0 + and len(self._get_trainable_demo_params()) > 0 + ): few_shot_demo_debug_path = self._fit_demos_one_step_for_debug( train_loader, train_dataset, val_dataset, test_dataset ) @@ -531,29 +601,29 @@ def fit( def run_text_optimizers(starting_step: int, trainer_results: TrainerResult): if len(self.text_optimizers) > 0: if self.strategy == "random": - trainer_results = self._fit_text_grad_random( + self._fit_text_grad_random( train_loader, val_dataset, test_dataset, trainer_results, starting_step=starting_step, ) - starting_step += self.max_steps elif self.strategy == "constrained": - trainer_results = self._fit_text_grad_constraint( + # self.adaltask.configure_teacher_generator() # use teacher as bootstrap intemediate results + self._fit_text_grad_constraint( train_loader, val_dataset, test_dataset, trainer_results=trainer_results, starting_step=starting_step, ) - starting_step += self.max_steps else: raise ValueError(f"Strategy {self.strategy} not supported") def run_demo_optimizers(starting_step: int, trainer_results: TrainerResult): if len(self.demo_optimizers) > 0: self.adaltask.configure_teacher_generator() + self.adaltask.disable_backward_engine() # disable it to avoid backward engine for gradients self._fit_demos_random( train_loader, train_dataset, @@ -565,27 +635,28 @@ def run_demo_optimizers(starting_step: int, trainer_results: TrainerResult): if self.sequential_order == ["text", "demo"]: run_text_optimizers(starting_step, trainer_results) + starting_step += self.max_steps + print(f"Starting step: {starting_step}") + print("steps", trainer_results.steps) run_demo_optimizers(starting_step, trainer_results) else: run_demo_optimizers(starting_step, trainer_results) + starting_step += self.max_steps run_text_optimizers(starting_step, trainer_results) - # if len(self.text_optimizers) > 0: - # run_text_optimizers(starting_step, trainer_results) - - # if len(self.demo_optimizers) > 0: - # run_demo_optimizers(starting_step, trainer_results) - # self.adaltask.configure_teacher_generator() # attemp to use the newest teacher as - # self._fit_demos_random( - # train_loader, - # train_dataset, - # val_dataset, - # test_dataset, - # trainer_results=trainer_results, - # starting_step=starting_step, - # ) end_time = time.time() print(f"Training time: {end_time - start_time}s") + trainer_results.total_time = end_time - start_time + # test at the end + if test_dataset: + test_output = self.adaltask.validation_step( + test_dataset, 0, self.num_workers + ) + test_score = test_output.avg_score + trainer_results.test_score = test_score + # write the results to the checkpoint file + save_json(trainer_results.to_dict(), self.ckpt_file) + print(f"ckpt_file: {self.ckpt_file}") return self.ckpt_file, trainer_results @@ -724,7 +795,7 @@ def _fit_demos_one_step_for_debug( self.prep_ckpt_file_path() debug_path = os.path.join(self.ckpt_path, "debug_demos") os.makedirs(debug_path, exist_ok=True) - print(f"save to {debug_path}") + print(f"_fit_demos_one_step_for_debug save to {debug_path}") self.adaltask.train() self.adaltask.trace() @@ -751,7 
+822,7 @@ def _fit_demos_one_step_for_debug( # print(f"Teacher y_preds: {y_preds[0].to_dict()}") - y_preds_outputs = [p.full_response for p in y_preds] + y_preds_outputs = [p.data for p in y_preds] batch_eval: EvaluationResult = self.adaltask.evaluate_samples( batch, y_preds_outputs @@ -809,41 +880,11 @@ def _fit_demos_one_step_for_debug( self._demo_optimizers_add_scores( [sample.id for sample in batch], batch_per_item_scores, is_teacher=False ) - # for loss in losses_student: - # loss.backward() + # Check the eval result - y_preds_outputs = [p.full_response for p in y_preds_student] + y_preds_outputs = [p.data for p in y_preds_student] eval_result = self.adaltask.evaluate_samples(batch, y_preds_outputs) print(f"Eval result: {eval_result.avg_score}") - # eval_score_per_item = eval_result.per_item_scores - - # bootstrap a batch - # batch_for_teacher = [] - # losses_teacher = [] - - # for i, (sample, item_score) in enumerate(zip(batch, eval_score_per_item)): - - # # use teacher - # if sample.id in pred_teacher: - # continue - # # if item_score < 0.5: - # pred_teacher.add(sample.id) - # batch_for_teacher.append(sample) - # # run teacher, use teachers's output instead of the initial output (bootstrap) - # if len(batch_for_teacher) > 0: - # print(f"Using teacher for {len(batch_for_teacher)} samples") - # self.adaltask.use_teacher() - # y_preds_teacher = self.adaltask.train_step( - # batch_for_teacher, batch_idx, self.num_workers - # ) - # losses_teacher: List[Parameter] = self.adaltask.loss_step( # noqa F841 - # batch_for_teacher, y_preds_teacher, batch_idx, self.num_workers - # ) - # self._demo_optimizers_add_scores( - # [sample.id for sample in batch_for_teacher], - # eval_score_per_item, - # is_teacher=True, - # ) # loss_students backward for loss in losses_student: @@ -896,7 +937,6 @@ def _fit_text_grads_one_step_for_debug(self, train_loader: Any) -> Dict[str, str self.prep_ckpt_file_path() debug_path = os.path.join(self.ckpt_path, "debug_text_grads") os.makedirs(debug_path, exist_ok=True) - print(f"save to {debug_path}") train_loader.batch_size = 2 train_loader.shuffle = True self.adaltask.train() # this will turn everything to train mode @@ -915,14 +955,12 @@ def _fit_text_grads_one_step_for_debug(self, train_loader: Any) -> Dict[str, str else: failed_loss = loss if correct_loss is not None and failed_loss is not None: - print("Found correct and failed loss") + printc("Found correct and failed loss", "blue") break - + if not all_losses: + raise ValueError("No losses found in the dataset.") # Handle case where one or both losses are None if correct_loss is None or failed_loss is None: - if not all_losses: - raise ValueError("No losses found in the dataset.") - # Sort all_losses by their data values all_losses.sort(key=lambda x: x.data, reverse=True) # Highest to lowest @@ -931,12 +969,53 @@ def _fit_text_grads_one_step_for_debug(self, train_loader: Any) -> Dict[str, str failed_loss = all_losses[-1] print("Assigned correct_loss and failed_loss from sorted losses.") - total_loss = sum_ops([correct_loss, failed_loss]) + total_loss = sum_ops([copy(correct_loss), copy(failed_loss)]) + + t0 = time.time() + total_loss.backward() + t1 = time.time() + printc(f"finish loss backward in {t1-t0} seconds") # test optimizer self._propose_text_optimizers() + t2 = time.time() + printc(f"finish text optimizer step in {t2-t1} seconds") + + debug_files: Dict = total_loss.draw_graph(filepath=debug_path, full_trace=True) + t3 = time.time() + printc(f"finish draw_graph step in {t3-t2} seconds") + 
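# Sketch of how this debug path is typically reached (usage assumed, not shown in
# this hunk): passing debug=True to the trainer's fit(...) runs a single
# text-gradient step on a tiny batch and writes the graph visualizations below
# under <ckpt_path>/debug_text_grads instead of training to completion, e.g.
#
#   trainer.fit(train_dataset=train_ds, val_dataset=val_ds, debug=True)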
debug_output_file = total_loss.draw_output_subgraph(filepath=debug_path) + t4 = time.time() + printc(f"finish draw_output_subgraph step in {t4-t3} seconds") + debug_component_file = total_loss.draw_component_subgraph(filepath=debug_path) + debug_files.update(debug_output_file) + debug_files.update(debug_component_file) + + # zero grad + self._zero_grad_text_optimizers() + # revert + self._revert_text_optimizers() + + total_loss.reset_all_gradients() + + # draw graph on a single loss + total_loss = sum_ops([copy(failed_loss)]) + total_loss.backward() + self._propose_text_optimizers() + + failed_debug_files = total_loss.draw_graph( + filepath=debug_path, full_trace=False + ) + failed_output_file = total_loss.draw_output_subgraph(filepath=debug_path) + failed_component_file = total_loss.draw_component_subgraph(filepath=debug_path) + failed_debug_files.update(failed_output_file) + failed_debug_files.update(failed_component_file) + + for k, v in failed_debug_files.items(): + if k in debug_files: + k = f"failed_{k}" + debug_files[k] = v - debug_files = total_loss.draw_graph(filepath=debug_path, full_trace=True) return debug_files def _set_demo_optimizers_dataset(self, train_dataset: Any): @@ -982,9 +1061,9 @@ def _propose_text_optimizers(self): for text_optimizer in self.text_optimizers: text_optimizer.propose() - # def _add_failed_proposals_text_optimizers(self): - # for opt in self.text_optimizers: - # opt.add_failed_proposal() + def _add_failed_proposals_text_optimizers(self): + for opt in self.text_optimizers: + opt.add_failed_proposal() def _get_trainable_text_params(self): params = [] @@ -1008,6 +1087,14 @@ def _revert_text_optimizers(self): for text_optimizer in self.text_optimizers: text_optimizer.revert() + def _increment_step_from_last_improvement_text_optimizers(self): + for text_optimizer in self.text_optimizers: + text_optimizer.increment_steps_from_last_improvement() + + def _reset_steps_from_last_improvement_text_optimizers(self): + for text_optimizer in self.text_optimizers: + text_optimizer.reset_steps_from_last_improvement() + def _check_optimizer_proposal(self): r"""Return True if all optimizers have proposed a new prompt""" for text_optimizer in self.text_optimizers: @@ -1033,7 +1120,7 @@ def _fit_text_grad_demo_mix_constrained( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_demo_mix_constrained save to {self.ckpt_file}") if train_dataset is None: raise ValueError("train_dataset is required") @@ -1066,7 +1153,7 @@ def _fit_text_grad_demo_mix_constrained( all_losses.extend(losses) # student losses # extract the non-parameter y_preds all_y_preds.extend( - [y.full_response for y in y_preds if isinstance(y, Parameter)] + [y.data for y in y_preds if isinstance(y, Parameter)] ) # for loss in losses: @@ -1114,76 +1201,80 @@ def _fit_text_grad_demo_mix_constrained( all_losses=all_losses, all_y_preds=all_y_preds, include_demo_optimizers=True, + trainer_results=trainer_results, + val_dataset=val_dataset, + test_dataset=test_dataset, + total_steps=total_steps, ) ) - if not self._check_optimizer_proposal(): - print( - "No proposal can improve the subset and full set, go to next step" - ) - # self._add_failed_proposals_text_optimizers() - - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - ) - - continue - - # set the batch size to the size of the validation set - last_val_score = 
trainer_results.val_scores[-1] - val_output = self.adaltask.validation_step( - val_dataset, - total_steps, - self.num_workers, - minimum_score=last_val_score, - ) - val_score = val_output.avg_score - self._add_history_text_optimizers(val_score) - - if val_score > last_val_score: - print(f"Optimizer step: {val_score} > {last_val_score}") - # self.optimizer.step() - self._step_text_optimizers() - self._demo_optimizers_step() - - # test the model - test_score = None - if test_dataset is not None: - test_output = self.adaltask.validation_step( - test_dataset, total_steps, self.num_workers - ) - test_score = test_output.avg_score + # if not self._check_optimizer_proposal(): + # print( + # "No proposal can improve the subset and full set, go to next step" + # ) + # # self._add_failed_proposals_text_optimizers() + + # self._add_one_step_in_trainer_results( + # trainer_results, + # trainer_results.val_scores[-1], + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # ) - new_prompts = self.adaltask._get_param_values() - self._add_one_step_in_trainer_results( - trainer_results, - val_score, - test_score, - new_prompts, - total_steps, - ) - all_samples, all_losses, all_y_preds = [], [], [] - else: - print(f"Optimizer revert: {val_score} <= {last_val_score}") - # self.optimizer.revert() - self._revert_text_optimizers() - self._demo_optimizers_revert() - # save the score, no change - self._add_one_step_in_trainer_results( - trainer_results, - last_val_score, - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - attempted_val_score=val_score, - ) + # continue + + # # set the batch size to the size of the validation set + # last_val_score = trainer_results.val_scores[-1] + # val_output = self.adaltask.validation_step( + # val_dataset, + # total_steps, + # self.num_workers, + # minimum_score=last_val_score, + # ) + # val_score = val_output.avg_score + # self._add_history_text_optimizers(val_score) + + # if val_score > last_val_score: + # print(f"Optimizer step: {val_score} > {last_val_score}") + # # self.optimizer.step() + # self._step_text_optimizers() + # self._demo_optimizers_step() + + # # test the model + # test_score = None + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score + + # new_prompts = self.adaltask._get_param_values() + # self._add_one_step_in_trainer_results( + # trainer_results, + # val_score, + # test_score, + # new_prompts, + # total_steps, + # ) + # all_samples, all_losses, all_y_preds = [], [], [] + # else: + # print(f"Optimizer revert: {val_score} <= {last_val_score}") + # # self.optimizer.revert() + # self._revert_text_optimizers() + # self._demo_optimizers_revert() + # # save the score, no change + # self._add_one_step_in_trainer_results( + # trainer_results, + # last_val_score, + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # attempted_val_score=val_score, + # ) - print(f"Saving checkpoint to {self.ckpt_file}") - save_json(trainer_results.to_dict(), self.ckpt_file) + # print(f"Saving checkpoint to {self.ckpt_file}") + # save_json(trainer_results.to_dict(), self.ckpt_file) save_json(trainer_results.to_dict(), self.ckpt_file) # checkpoint def _fit_text_grad_demo_mix_random( @@ -1202,7 +1293,7 @@ def _fit_text_grad_demo_mix_random( if train_results is None else train_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_demo_mix_random 
save to {self.ckpt_file}") if train_dataset is None: raise ValueError("train_dataset is required") @@ -1298,10 +1389,12 @@ def _fit_text_grad_demo_mix_random( self._demo_optimizers_step() # test the model - test_output = self.adaltask.validation_step( - test_dataset, total_steps, self.num_workers - ) - test_score = test_output.avg_score + test_score = None + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score self._add_one_step_in_trainer_results( trainer_results, val_score, @@ -1344,7 +1437,7 @@ def _fit_demos_random( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_demos_random save to {self.ckpt_file}") print(f"Starting step: {starting_step}") self.adaltask.train() @@ -1428,10 +1521,13 @@ def _fit_demos_random( minimum_score=last_val_score, ) val_score = val_output.avg_score + if val_score > last_val_score: print( f"Pass validation: {val_score} > {trainer_results.val_scores[-1]}" ) + self._track_effectiveness("demo_valset", True) + self._demo_optimizers_step() for opt in self.demo_optimizers: if opt.proposing: @@ -1439,11 +1535,11 @@ def _fit_demos_random( # test the new prompts test_score = None - if test_dataset is not None: - test_output = self.adaltask.validation_step( - test_dataset, step, self.num_workers - ) - test_score = test_output.avg_score + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, step, self.num_workers + # ) + # test_score = test_output.avg_score self._add_one_step_in_trainer_results( trainer_results, val_score, @@ -1453,6 +1549,7 @@ def _fit_demos_random( attempted_val_score=val_score, ) else: + self._track_effectiveness("demo_valset", False) print(f"Fail validation: {val_score} <= {last_val_score}, revert") self._demo_optimizers_revert() # ensure all demo optimizer are not proposing @@ -1533,15 +1630,17 @@ def _fit_text_grad_random( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_random save to {self.ckpt_file}") self.adaltask.train() # self.optimizer.zero_grad() self._zero_grad_text_optimizers() num_epochs = self._estimate_num_epochs(train_loader, self.max_steps) + print(f"num_epochs: {num_epochs}, max_steps: {self.max_steps}") total_steps = starting_step for epoch in tqdm(range(num_epochs), desc="Epoch"): + print(f"Epoch: {epoch}") for steps, batch in enumerate((pbar := tqdm(train_loader, position=0))): total_steps += 1 if total_steps > self.max_steps + starting_step: @@ -1550,14 +1649,28 @@ def _fit_text_grad_random( self._zero_grad_text_optimizers() pbar.set_description(f"Training Step: {total_steps}") self.adaltask.train() # this will turn everything to train mode - self.train() - y_preds = self.adaltask.train_step(batch, steps, self.num_workers) - losses = self.adaltask.loss_step( - batch, y_preds, steps, self.num_workers - ) + # self.train() + try: + # print(f"Batch: {batch}") + # continue + y_preds = self.adaltask.train_step(batch, steps, self.num_workers) + except Exception as e: + print(f"Error in train step: {e}") + raise e + try: + losses = self.adaltask.loss_step( + batch, y_preds, steps, self.num_workers + ) + except Exception as e: + print(f"Error in loss step: {e}") + raise e total_loss = sum_ops(losses) print("Loss backward...") - total_loss.backward() + try: + total_loss.backward() + except Exception as e: + print(f"Error in backward: {e}") + raise e 
print("Optimizer propose...") self._propose_text_optimizers() new_prompts = self.adaltask._get_param_values() @@ -1570,19 +1683,23 @@ def _fit_text_grad_random( self.num_workers, minimum_score=last_val_score, ) + print(f"Val output: {val_output}") val_score = val_output.avg_score if val_score > last_val_score: print(f"Optimizer step: {val_score} > {last_val_score}") + # track the effectiveness + self._track_effectiveness("valset", True) # self.optimizer.step() self._step_text_optimizers() self._add_history_text_optimizers(val_score) # track top performor # test the model - test_output = self.adaltask.validation_step( - test_dataset, total_steps, self.num_workers - ) - test_score = test_output.avg_score + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score + test_score = None self._add_one_step_in_trainer_results( trainer_results, val_score, @@ -1592,11 +1709,11 @@ def _fit_text_grad_random( ) else: # if val_score < last_val_score: - # self._add_failed_proposals_text_optimizers() # track failed proposals + self._add_failed_proposals_text_optimizers() # track failed proposals print(f"Optimizer revert: {val_score} <= {last_val_score}") - # self.optimizer.revert() self._revert_text_optimizers() + self._track_effectiveness("valset", False) # save the score, no change self._add_one_step_in_trainer_results( trainer_results, @@ -1607,10 +1724,10 @@ def _fit_text_grad_random( attempted_val_score=val_score, ) - print(f"Saving checkpoint to {self.ckpt_file}") + print(f" {total_steps}, Saving checkpoint to {self.ckpt_file}") save_json(trainer_results.to_dict(), self.ckpt_file) save_json(trainer_results.to_dict(), self.ckpt_file) # checkpoint - return trainer_results + return trainer_results @staticmethod def _add_one_step_in_trainer_results( @@ -1636,6 +1753,43 @@ def _add_one_step_in_trainer_results( trainer_results.prompts.append(prompts) trainer_results.steps.append(step) + # def _downsample_move_batch( + # self, all_samples, all_losses: List["Parameter"], all_y_preds, acc_score_list + # ): + # """Downsample the moving batch to a more balanced error and correct samples""" + + # from adalflow.optim.parameter import Parameter + + # if not all([score >= 0 and score <= 1 for score in acc_score_list]): + # raise ValueError( + # "acc_score_list should only contain values between 0 and 1" + # ) + + # for loss in all_losses: + # if not isinstance(loss, Parameter): + # raise ValueError("Loss should be a Parameter object") + # max_moving_batch_size = 20 + + # correct_indices = [i for i, score in enumerate(acc_score_list) if score > 0.5] + # error_indices = [i for i, score in enumerate(acc_score_list) if score <= 0.5] + + # if ( + # len(error_indices) + len(correct_indices) + # <= max_moving_batch_size + # # and len(correct_indices) <= max_moving_batch_size + # ): + # return all_samples, all_losses, all_y_preds, acc_score_list + + # # downsample from all samples + # new_sample_indices = random.sample( + # range(len(all_samples)), min(max_moving_batch_size, len(all_samples)) + # ) + # all_samples = [all_samples[i] for i in new_sample_indices] + # all_losses = [all_losses[i] for i in new_sample_indices] + # all_y_preds = [all_y_preds[i] for i in new_sample_indices] + # acc_score_list = [acc_score_list[i] for i in new_sample_indices] + # return all_samples, all_losses, all_y_preds, acc_score_list + def _downsample_move_batch( self, all_samples, all_losses: List["Parameter"], all_y_preds, acc_score_list ): @@ -1651,7 +1805,9 
@@ def _downsample_move_batch( for loss in all_losses: if not isinstance(loss, Parameter): raise ValueError("Loss should be a Parameter object") + max_moving_batch_size = 20 + min_error_samples = 4 correct_indices = [i for i, score in enumerate(acc_score_list) if score > 0.5] error_indices = [i for i, score in enumerate(acc_score_list) if score <= 0.5] @@ -1663,14 +1819,46 @@ def _downsample_move_batch( ): return all_samples, all_losses, all_y_preds, acc_score_list - # downsample from all samples - new_sample_indices = random.sample( - range(len(all_samples)), min(max_moving_batch_size, len(all_samples)) - ) - all_samples = [all_samples[i] for i in new_sample_indices] - all_losses = [all_losses[i] for i in new_sample_indices] - all_y_preds = [all_y_preds[i] for i in new_sample_indices] - acc_score_list = [acc_score_list[i] for i in new_sample_indices] + # Adjust downsampling logic + if len(error_indices) < min_error_samples: + remaining_capacity = max_moving_batch_size - len(error_indices) + correct_indices = random.sample(correct_indices, max(0, remaining_capacity)) + else: + # Set aside minimum error samples + retained_error_indices = error_indices[:min_error_samples] + remaining_error_indices = error_indices[min_error_samples:] + + # Combine remaining error and correct indices for unified sampling + combined_indices = remaining_error_indices + correct_indices + sampled_combined_indices = random.sample( + combined_indices, max(0, max_moving_batch_size - min_error_samples) + ) + + error_indices = retained_error_indices + correct_indices = [ + i for i in sampled_combined_indices if i in correct_indices + ] + remaining_error_indices = [ + i for i in sampled_combined_indices if i in remaining_error_indices + ] + error_indices += remaining_error_indices + + error_samples = [all_samples[i] for i in error_indices] + error_losses = [all_losses[i] for i in error_indices] + error_y_preds = [all_y_preds[i] for i in error_indices] + error_scores = [acc_score_list[i] for i in error_indices] + + correct_samples = [all_samples[i] for i in correct_indices] + correct_losses = [all_losses[i] for i in correct_indices] + correct_y_preds = [all_y_preds[i] for i in correct_indices] + correct_scores = [acc_score_list[i] for i in correct_indices] + + # Combine error and downsampled correct samples + all_samples = error_samples + correct_samples + all_losses = error_losses + correct_losses + all_y_preds = error_y_preds + correct_y_preds + acc_score_list = error_scores + correct_scores + return all_samples, all_losses, all_y_preds, acc_score_list def _moving_batch_sample( @@ -1680,21 +1868,29 @@ def _moving_batch_sample( # ensure only 0 and 1 in the acc_score_list import numpy as np - if not all([score in [0, 1] for score in acc_score_list]): + if not all(0 <= score <= 1 for score in acc_score_list): raise ValueError("acc_score_list should only contain 0 and 1") - correct_indices = [i for i, score in enumerate(acc_score_list) if score == 1] - error_indices = [i for i, score in enumerate(acc_score_list) if score == 0] + correct_indices = [ + i + for i, score in enumerate(acc_score_list) + if score > self.correct_val_score_threshold + ] + error_indices = [ + i + for i, score in enumerate(acc_score_list) + if score <= self.correct_val_score_threshold + ] print(f"Moving batch correct size: {len(correct_indices)}") print(f"Moving batch error size: {len(error_indices)}") - if len(error_indices) == 0: - raise ValueError("No error samples found") + # if len(error_indices) == 0: + # raise ValueError("No error samples 
found") sampled_error_indices = random.sample( error_indices, min(self.max_error_samples, len(error_indices)) ) num_errors = len(sampled_error_indices) # max allowed correct samples min(0.8 * num_errors, len(correct_indices), self.max_correct_samples) - max_num_correct_samples = int(2 * num_errors) + max_num_correct_samples = int(2 * max(1, num_errors)) sampled_correct_indices = random.sample( correct_indices, min( @@ -1713,7 +1909,7 @@ def _moving_batch_sample( return subset_score, subset def _track_effectiveness( - self, stage: Literal["subset", "fullset", "valset"], pass_: bool + self, stage: Literal["subset", "fullset", "valset", "demo_valset"], pass_: bool ): if stage == "subset": if pass_: @@ -1730,6 +1926,13 @@ def _track_effectiveness( self._valset_effect_count["pass"] += 1 else: self._valset_effect_count["fail"] += 1 + elif stage == "demo_valset": + if pass_: + self._demo_valset_effect_count["pass"] += 1 + else: + self._demo_valset_effect_count["fail"] += 1 + else: + raise NotImplementedError(f"Stage {stage} not implemented") def _text_grad_constraint_propose_step( self, @@ -1738,6 +1941,10 @@ def _text_grad_constraint_propose_step( all_losses: List["Parameter"], all_y_preds, include_demo_optimizers: bool = False, + trainer_results: TrainerResult = None, + val_dataset: Any = None, + test_dataset: Any = None, + total_steps: int = 0, ): """Handles both the mixed training and the separate training. When include_demo_optimizers is True, the demo optimizers are included in the training @@ -1750,15 +1957,27 @@ def _text_grad_constraint_propose_step( raise ValueError("Loss should be a Parameter object") self.adaltask.eval() move_batch_eval = self.adaltask.evaluate_samples(all_samples, all_y_preds) + print(f"Moving batch eval: {move_batch_eval}") move_batch_score = move_batch_eval.avg_score move_batch_acc_score_list = move_batch_eval.per_item_scores - if move_batch_score >= self.batch_val_score_threshold: - print(f"Skipping batch {steps} as acc: {move_batch_score}") - - # reset the moving batch - all_samples, all_losses, all_y_preds = [], [], [] - return all_samples, all_losses, all_y_preds + last_val_score = trainer_results.val_scores[-1] + val_score_increased = False + + # if move_batch_score >= self.batch_val_score_threshold: + # print(f"Skipping batch {steps} as acc: {move_batch_score}") + + # # reset the moving batch + # all_samples, all_losses, all_y_preds = [], [], [] + # # track the result + # self._add_one_step_in_trainer_results( + # trainer_results, + # last_val_score, + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # ) + # return all_samples, all_losses, all_y_preds # downsample the moving batch all_samples, all_losses, all_y_preds, move_batch_acc_score_list = ( self._downsample_move_batch( @@ -1775,6 +1994,8 @@ def _text_grad_constraint_propose_step( ) print(f"Subset batch acc: {subset_score}") + self.adaltask.train() + # compute the subset loss subset_losses = [all_losses[i] for i in subset_indices] @@ -1788,9 +2009,10 @@ def _text_grad_constraint_propose_step( # TODO: make this a step tdqm_loader = tqdm(range(self.max_proposals_per_step), desc="Proposing") + for i in tdqm_loader: - # print(f"Proposing step: {i}") + print(f"Proposing step: {i}") # self.optimizer.propose() self._propose_text_optimizers() # new prompts if include_demo_optimizers: @@ -1799,13 +2021,16 @@ def _text_grad_constraint_propose_step( print("New prompts: ", new_prompts) # valide the subset subset_samples = [all_samples[i] for i in subset_indices] - # validate 
the subset val_output = self.adaltask.validation_step( subset_samples, steps, self.num_workers ) # check subset validation score val_score = val_output.avg_score - if val_score > subset_score: + if ( + val_score == subset_score + and subset_score >= self.batch_val_score_threshold + ) or val_score > subset_score: # allow perfect subset to pass + print(f"Pass subset check: {val_score} > {subset_score}") self._track_effectiveness("subset", True) @@ -1813,31 +2038,101 @@ def _text_grad_constraint_propose_step( print( f"Fail subset check, try next proposal: {val_score} <= {subset_score}" ) - # self._add_failed_proposals_text_optimizers() + self._add_failed_proposals_text_optimizers() self._track_effectiveness("subset", False) self._revert_text_optimizers() if include_demo_optimizers: self._demo_optimizers_revert() continue # validate the full set - move_batch_result = self.adaltask.validation_step( - all_samples, steps, self.num_workers + # move_batch_result = self.adaltask.validation_step( + # all_samples, steps, self.num_workers + # ) + # new_move_batch_score = move_batch_result.avg_score + # if new_move_batch_score >= move_batch_score: + # print(f"Pass full check: {new_move_batch_score} >= {move_batch_score}") + # self._track_effectiveness("fullset", True) + # # break + # else: + # print( + # f"Fail full check, try next proposal: {new_move_batch_score} < {move_batch_score}" + # ) + # self._track_effectiveness("fullset", False) + # # self._add_failed_proposals_text_optimizers() + # self._revert_text_optimizers() + # if include_demo_optimizers: + # self._demo_optimizers_revert() + # continue + + # check on the validation set + # set the batch size to the size of the validation set + val_output = self.adaltask.validation_step( + val_dataset, + total_steps, + self.num_workers, + minimum_score=last_val_score, ) - new_move_batch_score = move_batch_result.avg_score - if new_move_batch_score >= move_batch_score: - print(f"Pass full check: {new_move_batch_score} >= {move_batch_score}") - self._track_effectiveness("fullset", True) + val_score = val_output.avg_score + + if val_score > last_val_score: + print(f"Optimizer step: {val_score} > {last_val_score}") + # self.optimizer.step() + self._track_effectiveness("valset", True) + self._step_text_optimizers() + self._add_history_text_optimizers(val_score) + + if include_demo_optimizers: + + self._demo_optimizers_step() + + # test the model + test_score = None + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score + + new_prompts = self.adaltask._get_param_values() + self._add_one_step_in_trainer_results( + trainer_results, + val_score, + test_score, + new_prompts, + total_steps, + ) + all_samples, all_losses, all_y_preds = [], [], [] + val_score_increased = True + self._reset_steps_from_last_improvement_text_optimizers() break else: - print( - f"Fail full check, try next proposal: {new_move_batch_score} < {move_batch_score}" - ) - self._track_effectiveness("fullset", False) - # self._add_failed_proposals_text_optimizers() + print(f"Optimizer revert: {val_score} <= {last_val_score}") + self._track_effectiveness("valset", False) + self._add_failed_proposals_text_optimizers() + # self.optimizer.revert() self._revert_text_optimizers() if include_demo_optimizers: self._demo_optimizers_revert() + continue + if not val_score_increased: + print("No proposal can improve the subset and full set, and val set") + self._zero_grad_text_optimizers() + 
subset_loss.reset_all_gradients() + # save the score, no change + self._add_one_step_in_trainer_results( + trainer_results, + last_val_score, + trainer_results.test_scores[-1], + trainer_results.prompts[-1], + total_steps, + attempted_val_score=val_score, + ) + self._increment_step_from_last_improvement_text_optimizers() + + print(f"Saving checkpoint to {self.ckpt_file}") + trainer_results.effective_measure = self._effective_measure + save_json(trainer_results.to_dict(), self.ckpt_file) print("Done with proposals") self.adaltask.train() @@ -1877,24 +2172,27 @@ def _fit_text_grad_constraint( trainer_results: TrainerResult = None, starting_step: int = 0, ) -> TrainerResult: - from adalflow.optim.parameter import Parameter + from adalflow.optim.parameter import OutputParameter logger.info("Fitting using Textual Gradient Descent with constraints") + printc("Fitting using Textual Gradient Descent with constraints") trainer_results = ( self._pre_fit(val_dataset, test_dataset) if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_constraint save to {self.ckpt_file}") self.adaltask.train() self._zero_grad_text_optimizers() num_epochs = self._estimate_num_epochs(train_loader, self.max_steps) total_steps = starting_step - all_samples, all_losses, all_y_preds = [], [], [] + all_samples, all_losses = [], [] + all_y_preds: List[OutputParameter] = [] for epoch in tqdm(range(num_epochs), desc="Epoch"): + print(f"Epoch: {epoch}") for steps, batch in enumerate((pbar := tqdm(train_loader, position=0))): total_steps += 1 if total_steps > self.max_steps + starting_step: @@ -1903,6 +2201,8 @@ def _fit_text_grad_constraint( self._zero_grad_text_optimizers() pbar.set_description(f"Training Step: {total_steps}") self.adaltask.train() # this will turn everything to train mode + # print(f"Batch: {batch}") + # continue y_preds = self.adaltask.train_step(batch, steps, self.num_workers) losses = self.adaltask.loss_step( batch, y_preds, steps, self.num_workers @@ -1912,8 +2212,9 @@ def _fit_text_grad_constraint( all_samples.extend(batch) all_losses.extend(losses) all_y_preds.extend( - [y.full_response for y in y_preds if isinstance(y, Parameter)] + [y.data for y in y_preds if isinstance(y, OutputParameter)] ) + # printc(f"y_preds: {y_preds[0]}") all_samples, all_losses, all_y_preds = ( self._text_grad_constraint_propose_step( @@ -1921,86 +2222,12 @@ def _fit_text_grad_constraint( all_samples=all_samples, all_losses=all_losses, all_y_preds=all_y_preds, + trainer_results=trainer_results, + val_dataset=val_dataset, + test_dataset=test_dataset, + total_steps=total_steps, ) ) - # check optimizer stages to see if the proposal was accepted so far - if not self._check_optimizer_proposal(): - print( - "No proposal can improve the subset and full set, go to next step" - ) - - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - ) - continue - - # prune the correct sample size if its too big, same with error samples - # run the tests as any other optimizer - if self.adaltask.validate_condition(steps, total_steps): - # set the batch size to the size of the validation set - last_val_score = trainer_results.val_scores[-1] - val_output = self.adaltask.validation_step( - val_dataset, - total_steps, - self.num_workers, - minimum_score=last_val_score, - ) - val_score = val_output.avg_score - - if val_score > last_val_score: - print(f"Optimizer step: 
{val_score} > {last_val_score}") - # self.optimizer.step() - self._add_history_text_optimizers( - val_score - ) # track top performor - self._step_text_optimizers() - - # save the score - step_result = { - "val_score": val_score, - } - - self._track_effectiveness("valset", True) - - # test the model - if test_dataset is not None: - test_output = self.adaltask.validation_step( - test_dataset, - steps, - self.num_workers, - ) - step_result["test_score"] = test_output.avg_score - else: - step_result["test_score"] = None - step_result["prompts"] = self.adaltask._get_param_values() - step_result["step"] = total_steps - self._add_one_step_in_trainer_results( - trainer_results, - **step_result, - ) - - all_samples, all_losses, all_y_preds = [], [], [] - - else: - print(f"Optimizer revert: {val_score} <= {last_val_score}") - self._revert_text_optimizers() - # self._add_failed_proposals_text_optimizers() # track failed proposals - self._track_effectiveness("valset", False) - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - attempted_val_score=val_score, - ) - - trainer_results.effective_measure = self._effective_measure - save_json(trainer_results.to_dict(), self.ckpt_file) save_json(trainer_results.to_dict(), self.ckpt_file) return trainer_results diff --git a/adalflow/adalflow/optim/types.py b/adalflow/adalflow/optim/types.py index 22b0ab14..45425859 100644 --- a/adalflow/adalflow/optim/types.py +++ b/adalflow/adalflow/optim/types.py @@ -9,41 +9,54 @@ from adalflow.core import DataClass +# TODO: set default optimization class ParameterType(Enum): - __doc__ = """Enum for the type of parameter to compute the loss with, and to inform the optimizer.""" + __doc__ = """Enum for the type of parameter to compute the loss with, and to inform the optimizer. + + The meaning of reach tuple is: + 1. First element: the name of the parameter. + 2. Second element: the description of the parameter. + 3. Third element: whether the parameter is trainable. + + To access each element, use the following: + 1. name: `ParameterType.PROMPT.value` + 2. description: `ParameterType.PROMPT.description` + 3. trainable: `ParameterType.PROMPT.default_trainable` + """ # trainable parameters with optimizers PROMPT = ( "prompt", "Instruction to the language model on task, data, and format.", + True, ) # optimized by tgd_optimizer DEMOS = ( "demos", "A few examples to guide the language model.", + True, ) # optimized by demo_optimizer # input and output parameters (similar to tensor, can have grad_opt true, but not trainable) - INPUT = ("input", "The input to the component.") - OUTPUT = ("output", "The output of the component.") - HYPERPARAM = ("hyperparam", "Hyperparameters/args for the component.") - - # gradient paramters for each predecessor of dag. 
- GRADIENT = ("gradient", "A gradient parameter.") + INPUT = ("input", "The input to the component.", False) + OUTPUT = ("output", "The output of the component.", True) + HYPERPARAM = ("hyperparam", "Hyperparameters/args for the component.", False) # the following is a subtype of the output type # INSTANCE = ("instance", "Focus on fixing issues of this specific example.") GENERATOR_OUTPUT = ( "generator_output", "The output of the generator.", + True, ) # use raw response or error message as data, full response in full_response - RETRIEVER_OUTPUT = ("retriever_output", "The output of the retriever.") - LOSS_OUTPUT = ("loss", "The loss value.") - SUM_OUTPUT = ("sum", "The sum of the losses.") - NONE = ("none", "") + RETRIEVER_OUTPUT = ("retriever_output", "The output of the retriever.", True) + LOSS_OUTPUT = ("loss", "The loss value.", True) + SUM_OUTPUT = ("sum", "The sum of the losses.", True) + NONE = ("none", "", False) - def __init__(self, value, description): + def __init__(self, value: str, description: str, default_trainable: bool): self._value_ = value self.description = description + self.default_trainable = default_trainable def __str__(self): """Return a string representation that includes the enum's value and description.""" @@ -145,3 +158,7 @@ class TrainerResult(DataClass): trainer_state: Dict[str, Any] = field( default=None, metadata={"desc": "Save the most detailed state of the trainer"} ) + total_time: float = field( + default=0.0, metadata={"desc": "Total time taken for training"} + ) + test_score: float = field(default=None, metadata={"desc": "Test score"}) diff --git a/adalflow/adalflow/utils/data.py b/adalflow/adalflow/utils/data.py index 682453b1..374c47b4 100644 --- a/adalflow/adalflow/utils/data.py +++ b/adalflow/adalflow/utils/data.py @@ -74,10 +74,13 @@ class DataLoader: The biggest difference is not to handle tensors, but to handle any type of data.""" - def __init__(self, dataset, batch_size: int = 4, shuffle: bool = True): + def __init__( + self, dataset, batch_size: int = 4, shuffle: bool = True, seed: int = 42 + ): self.dataset = dataset self.batch_size = batch_size self.shuffle = shuffle + self.seed = seed self.indices = np.arange(len(dataset)) # if self.shuffle: @@ -91,6 +94,8 @@ def set_max_steps(self, max_steps: int): def __iter__(self): if self.shuffle: + if self.seed is not None: + np.random.seed(self.seed) # Use the provided seed np.random.shuffle(self.indices) self.current_index = 0 return self @@ -104,6 +109,8 @@ def __next__(self) -> Union[np.ndarray, Tuple]: if self.current_index >= len(self.dataset): if self.shuffle: + if self.seed is not None: + np.random.seed(self.seed) # Use the same seed for reshuffle np.random.shuffle(self.indices) # Reshuffle for the new epoch self.current_index = 0 if self.step_index < self.max_steps: diff --git a/adalflow/adalflow/utils/file_io.py b/adalflow/adalflow/utils/file_io.py index 7b038d7d..83728941 100644 --- a/adalflow/adalflow/utils/file_io.py +++ b/adalflow/adalflow/utils/file_io.py @@ -5,16 +5,17 @@ from typing import Mapping, Any, Optional, List, Dict -from adalflow.utils.serialization import ( - to_dict, - serialize, -) +from adalflow.utils.serialization import to_dict, serialize, _deserialize_object_hook log = logging.getLogger(__name__) def save_json(obj: Mapping[str, Any], f: str = "task.json") -> None: - """Save the object to a json file. + """Customized Save the object to a json file. + + Support Set. 
+ We encourage users first save the data as DataClass using to_dict, + and then load it back to DataClass using from_dict. Args: obj (Mapping[str, Any]): The object to be saved. @@ -29,6 +30,15 @@ def save_json(obj: Mapping[str, Any], f: str = "task.json") -> None: raise IOError(f"Error saving object to JSON file {f}: {e}") +# def standard_save_json(obj: Mapping[str, Any], f: str = "task.json") -> None: +# os.makedirs(os.path.dirname(f) or ".", exist_ok=True) +# try: +# with open(f, "w") as file: +# json.dump(obj, file, indent=4) +# except IOError as e: +# raise IOError(f"Error saving object to JSON file {f}: {e}") + + def save_csv( obj: List[Dict[str, Any]], f: str = "task.csv", fieldnames: List[str] = None ) -> None: @@ -47,6 +57,15 @@ def save_csv( writer.writeheader() for row in obj: filtered_row = {k: v for k, v in row.items() if k in fieldnames} + # use json.dumps to serialize the object + for k, v in filtered_row.items(): + if ( + isinstance(v, dict) + or isinstance(v, list) + or isinstance(v, tuple) + or isinstance(v, set) + ): + filtered_row[k] = json.dumps(v) writer.writerow(filtered_row) except IOError as e: raise IOError(f"Error saving object to CSV file {f}: {e}") @@ -82,20 +101,63 @@ def save(obj: Mapping[str, Any], f: str = "task") -> None: raise Exception(f"Error saving object to json and pickle files: {e}") -def load_json(f: str = "task.json") -> Optional[Mapping[str, Any]]: - r"""Load the object from a json file. +# def load_json(f: str = "task.json") -> Optional[Mapping[str, Any]]: +# r"""Load the object from a json file. + +# Args: +# f (str, optional): The file name. Defaults to "task". +# """ +# if not os.path.exists(f): +# log.warning(f"File {f} does not exist.") +# return None +# try: +# with open(f, "r") as file: +# return json.load(file) +# except Exception as e: +# raise Exception(f"Error loading object from JSON file {f}: {e}") + + +def load_json(f: str) -> Any: + """Customized Load a JSON file and deserialize it. Args: - f (str, optional): The file name. Defaults to "task". + f (str): The file name of the JSON file to load. + + Returns: + Any: The deserialized Python object. """ if not os.path.exists(f): - log.warning(f"File {f} does not exist.") - return None + raise FileNotFoundError(f"JSON file not found: {f}") + + try: + with open(f, "r") as file: + data = json.load(file, object_hook=_deserialize_object_hook) + return data + except json.JSONDecodeError as e: + raise ValueError(f"Error decoding JSON file {f}: {e}") + except Exception as e: + raise IOError(f"Error loading JSON file {f}: {e}") + + +def load_standard_json(f: str) -> Any: + """Standard Load a JSON file and deserialize it. + Args: + f (str): The file name of the JSON file to load. + + Returns: + Any: The deserialized Python object. 
+ """ + if not os.path.exists(f): + raise FileNotFoundError(f"JSON file not found: {f}") + try: with open(f, "r") as file: - return json.load(file) + data = json.load(file) + return data + except json.JSONDecodeError as e: + raise ValueError(f"Error decoding JSON file {f}: {e}") except Exception as e: - raise Exception(f"Error loading object from JSON file {f}: {e}") + raise IOError(f"Error loading JSON file {f}: {e}") def load_pickle(f: str = "task.pickle") -> Optional[Mapping[str, Any]]: diff --git a/adalflow/adalflow/utils/serialization.py b/adalflow/adalflow/utils/serialization.py index 5cb1dd27..929cbd60 100644 --- a/adalflow/adalflow/utils/serialization.py +++ b/adalflow/adalflow/utils/serialization.py @@ -58,6 +58,14 @@ def default(o: Any) -> Union[Dict[str, Any], str]: except Exception as e: log.error(f"Error serializing object {o}: {e}") pass + # handle set + elif isinstance(o, set): + return {"type": type(o).__name__, "data": list(o)} + else: + return {"type": type(o).__name__, "data": str(o)} + # raise NotImplementedError( + # f"Object of type {o.__class__.__name__} is not JSON serializable: {o}" + # ) elif obj_type == ObjectTypes.TYPE: log.debug(f"Object {o} is a type of {o.__name__}") try: @@ -101,9 +109,19 @@ def _deserialize_object_hook(d: Dict[str, Any]) -> Any: """Hook to deserialize objects based on their type.""" if "type" in d and "data" in d: class_name = d["type"] + if class_name == "set": + return set(d["data"]) + + # deseralize customized types + # TODO: all customized data types need to be saved class_type = EntityMapping.get(class_name) - if class_type: - return class_type.from_dict(d) + try: + if class_type: + return class_type.from_dict(d) + except Exception as e: + # default to the original object + log.error(f"Error deserializing object {d}: {e}") + pass return d diff --git a/adalflow/tests/test_parameter.py b/adalflow/tests/test_parameter.py index 3da290da..a8f64f0e 100644 --- a/adalflow/tests/test_parameter.py +++ b/adalflow/tests/test_parameter.py @@ -46,6 +46,19 @@ def test_update_value(self, data, new_data): param.update_value(new_data) assert param.data == new_data, "Parameter data should be updated correctly" + def test_data_in_prompt_callable(self): + param = Parameter( + data=10, requires_opt=False, data_in_prompt=lambda x: f"Data: {x.data}" + ) + + assert ( + param.data_in_prompt(param) == "Data: 10" + ), "Data should be correctly formatted in the prompt" + + assert ( + param.get_prompt_data() == "Data: 10" + ), "Data should be correctly formatted in the prompt" + # def test_update_value_incorrect_type(self): # """Test updating the parameter with an incorrect type.""" # param = Parameter[int](data=10) diff --git a/adalflow/tests/test_parameter_text_grad.py b/adalflow/tests/test_parameter_text_grad.py index 91cf4dc9..64e004e0 100644 --- a/adalflow/tests/test_parameter_text_grad.py +++ b/adalflow/tests/test_parameter_text_grad.py @@ -8,36 +8,15 @@ class TestGradientContext(unittest.TestCase): def test_gradient_context_initialization(self): context = GradientContext( - context="Sample context", + input_output="Sample context", response_desc="Sample response description", variable_desc="Sample variable description", ) - self.assertEqual(context.context, "Sample context") + self.assertEqual(context.input_output, "Sample context") self.assertEqual(context.response_desc, "Sample response description") self.assertEqual(context.variable_desc, "Sample variable description") -class TestParameter(unittest.TestCase): - def setUp(self): - self.param1 = 
Parameter(data="Gradient 1", name="param1") - self.param2 = Parameter(data="Gradient 2", name="param2") - self.param1.gradients.append(self.param2) - self.param1.gradients_context[self.param2] = GradientContext( - context="Conversation context", - response_desc="Response description", - variable_desc="Variable description", - ) - - def test_get_gradient_text(self): - expected_output = """Batch size: 1 - -1. -Conversation context - -Gradient 2""" - self.assertEqual(self.param1.get_gradient_and_context_text(), expected_output) - - # def test_get_gradient_and_context_text(self): # expected_output = """ # Feedback 1.\n @@ -83,7 +62,6 @@ def test_update_prompt(self): # Check if each variable value is in the generated output # self.assertIn("Role description", result) # self.assertIn("short value", result) - self.assertIn("gradient and context text", result) # self.assertIn("", result) # self.assertIn("", result) self.assertIn("Some constraint text", result) diff --git a/adalflow/tests/test_react_agent.py b/adalflow/tests/test_react_agent.py index 244a421f..6575054c 100644 --- a/adalflow/tests/test_react_agent.py +++ b/adalflow/tests/test_react_agent.py @@ -1,2 +1,229 @@ +import unittest +from unittest.mock import Mock, patch +from adalflow.core.func_tool import FunctionTool +from adalflow.core.types import FunctionExpression, GeneratorOutput +from adalflow.components.agent.react import ReActAgent, StepOutput +from adalflow.components.model_client.openai_client import OpenAIClient + + +# Mock tools for testing +def mock_add(a: int, b: int) -> int: + return a + b + + +def mock_multiply(a: int, b: int) -> int: + return a * b + + +def mock_simple_tool(input: str) -> str: + return f"Processed: {input}" + + +class TestReActAgent(unittest.TestCase): + """Test Agent with normal functions""" + + def setUp(self): + # Mock OpenAIClient + self.mock_model_client = Mock(spec=OpenAIClient) + + # Initialize ReActAgent with mocked tools and model client + self.tools = [ + FunctionTool(mock_add), + FunctionTool(mock_multiply), + FunctionTool(mock_simple_tool), + ] + self.react_agent = ReActAgent( + tools=self.tools, + max_steps=5, + add_llm_as_fallback=False, + model_client=self.mock_model_client, + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def test_react_agent_initialization(self): + self.assertEqual(self.react_agent.max_steps, 5) + self.assertTrue(not self.react_agent.add_llm_as_fallback) + self.assertEqual( + len(self.react_agent.tool_manager.tools), 4 + ) # 3 tools + finish + fallback + + @patch.object(ReActAgent, "planner", create=True) + def test_simple_query_execution(self, mock_planner): + # Simulate a valid JSON-serializable response from the planner + mock_planner.return_value = GeneratorOutput( + data=FunctionExpression.from_function( + thought="Finish the task directly.", + func=self.react_agent._finish, + answer="Simple answer", + ) + ) + + result = self.react_agent.call("What is 2 + 2?") + self.assertEqual(result.answer, "Simple answer") + + @patch.object(ReActAgent, "planner", create=True) + def test_complex_query_execution(self, mock_planner): + # Simulate multiple steps for a complex query, each planner will return a FunctionExpression + mock_planner.side_effect = [ + GeneratorOutput( + data=FunctionExpression.from_function( + thought="Divide the task into subqueries.", func=mock_add, a=2, b=2 + ) + ), + GeneratorOutput( + data=FunctionExpression.from_function( + thought="Multiply the results.", func=mock_multiply, a=4, b=3 + ) + ), + GeneratorOutput( + 
data=FunctionExpression.from_function( + thought="Finish the task directly.", + func=self.react_agent._finish, + answer=12, + ) + ), + ] + + # mock the agent to run the first step + step_output = self.react_agent._run_one_step( + step=1, step_history=[], prompt_kwargs={}, model_kwargs={} + ) + print(f"step_output 1: {step_output}") + self.assertEqual(len(step_output), 1) + self.assertTrue(isinstance(step_output[0], StepOutput)) + self.assertTrue(step_output[0].action) + self.assertTrue(isinstance(step_output[0].action, FunctionExpression)) + + result = self.react_agent.call("Add 2 and 3, then multiply by 4.") + print(f"result: {result}") + self.assertEqual(result.answer, 12) + + @patch.object(ReActAgent, "planner", create=True) + def test_error_handling(self, mock_planner): + # Simulate an error scenario + mock_planner.return_value = GeneratorOutput( + data={ + "thought": "Encountered an error.", + "function": {"name": "finish", "args": {"answer": "Error occurred"}}, + } + ) + # no action + + # check error raised + # with self.assertRaises(ValueError): + + result = self.react_agent.call("Simulate an error.") + print(f"result 2: {result}") + self.assertIn("Error occurred", result.answer) + + +from adalflow.optim.grad_component import GradComponent + + +class GradAdd(GradComponent): + def __init__(self): + super().__init__() + + def call(self, x, y): + return x + y + + def forward(self, x, y): + return f"{x + y} + forward" + + +class GradSub(GradComponent): + def __init__(self): + super().__init__() + + def call(self, x, y): + return x - y + + def forward(self, x, y): + return f"{x - y} + forward" + + +class TestReactAgentWithComponentASTool(unittest.TestCase): + @patch("adalflow.components.model_client.openai_client.OpenAIClient", autospec=True) + def setUp(self, MockOpenAIClient): + """Set up the ReActAgent with GradComponents as tools.""" + self.add_component = GradAdd() + self.sub_component = GradSub() + + self.tools = [ + FunctionTool(fn=self.add_component.__call__, component=self.add_component), + FunctionTool(fn=self.sub_component.__call__, component=self.sub_component), + ] + + self.mock_model_client = MockOpenAIClient.return_value + self.agent = ReActAgent( + tools=self.tools, + max_steps=5, + add_llm_as_fallback=False, + model_client=self.mock_model_client, + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def test_agent_with_eval_mode(self): + """Test the agent's behavior when GradComponents are in eval mode.""" + # Ensure components start in eval mode + self.assertFalse(self.add_component.training) + self.assertFalse(self.sub_component.training) + + # Use agent to call addition tool + result = self.agent.tool_manager.tools[0](3, 2) # GradAdd in eval mode + self.assertEqual(result.output, 5) + + # Use agent to call subtraction tool + result = self.agent.tool_manager.tools[1](5, 3) # GradSub in eval mode + self.assertEqual(result.output, 2) + + def test_agent_with_train_mode(self): + """Test the agent's behavior when GradComponents are in train mode.""" + # Set the agent to train mode, which should propagate to components + self.agent.train() + + self.assertTrue(self.add_component.training) + self.assertTrue(self.sub_component.training) + # as the component is not directly registered in the agent, but passed to the tool manager, it will not be in training mode + + # Use agent to call addition tool in train mode + result = self.agent.tool_manager.tools[0](3, 2) # GradAdd in train mode + self.assertEqual(result.output, "5 + forward") + + # Use agent to call subtraction tool in train 
mode + result = self.agent.tool_manager.tools[1](5, 3) # GradSub in train mode + self.assertEqual(result.output, "2 + forward") + + def test_agent_switch_modes(self): + """Test the agent's ability to switch between eval and train modes.""" + # Start in eval mode + self.assertFalse(self.add_component.training) + self.assertFalse(self.sub_component.training) + + # Switch to train mode + self.agent.train() + named_components = self.agent.named_components() + for name, component in named_components: + print(f"{name}: {component}") + print(f"add_component: {self.add_component}") + self.assertTrue(self.agent.tool_manager.training) + + # add component will have eval mode + self.assertTrue(self.add_component.training) + + # the tools from the tool manager will be in training mode + self.assertTrue(self.agent.tool_manager.tools[0].training) + self.assertTrue(self.agent.tool_manager.tools[1].training) + + # back to eval mode + self.agent.eval() + self.assertFalse(self.add_component.training) + self.assertFalse(self.sub_component.training) + + # tools from the tool manager will be in eval mode + self.assertFalse(self.agent.tool_manager.tools[0].training) + self.assertFalse(self.agent.tool_manager.tools[1].training) + + if __name__ == "__main__": - pass + unittest.main() diff --git a/adalflow/tests/test_tool.py b/adalflow/tests/test_tool.py index 441eef56..e2cb3485 100644 --- a/adalflow/tests/test_tool.py +++ b/adalflow/tests/test_tool.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from adalflow.core.func_tool import FunctionTool +from adalflow.core.tool_manager import ToolManager from adalflow.core.types import FunctionDefinition @@ -53,27 +54,150 @@ def test_function_tool_async(): tool.call(3, 4) -# def test_invalid_function_tool_initialization(): -# # Test initialization without any function should raise ValueError -# with pytest.raises(ValueError): -# tool = FunctionTool(metadata=metadata) +from adalflow.optim.grad_component import GradComponent -# def test_from_defaults_uses_function_docstring(): -# def sample_function(x, y, user: User = User(id=1, name="John")): -# """ -# Adds two numbers together and returns the sum. -# """ -# return x + y +class GradAdd(GradComponent): + def __init__(self): + super().__init__() + print(f"training: {self.training}") -# tool = FunctionTool(fn=sample_function) + def call(self, x, y): + return x + y -# expected_description = sample_function.__doc__.strip() -# actual_description = tool.metadata.description -# print(f"Expected: {expected_description}, Actual: {actual_description}") + def forward(self, x, y): + print(f"training: {self.training}") + return f"{x + y} + forward" -# # Check if the metadata description matches the function's docstring -# assert ( -# actual_description == expected_description -# ), f"The description should automatically be set to the function's docstring. 
Expected: {expected_description}, Actual: {actual_description}" +class GradSub(GradComponent): + def __init__(self): + super().__init__() + + def call(self, x, y): + return x - y + + def forward(self, x, y): + print(f"training: {self.training}") + return f"{x - y} + forward" + + +class TestComponent(GradComponent): + def __init__(self): + super().__init__() + + self.add = GradAdd() + self.sub = GradSub() + + print(f"sub_component: {self.sub.training}") + + print(f"add_component: {self.add.training}") + + def add_as_tool(x, y): + return self.add(x, y) + + self.tools = [ + FunctionTool(fn=add_as_tool, component=self.add), + FunctionTool(fn=self.sub.__call__, component=self.sub), + ] + + +add = GradAdd() +sub = GradSub() + + +class TestComponnetInstanceOutsideComponent(GradComponent): + def __init__(self): + super().__init__() + + print(f"sub_component: {sub.training}") + + print(f"add_component: {add.training}") + + def add_as_tool(x, y): + return add(x, y) + + self.tools = [ + FunctionTool(fn=add_as_tool, component=add), + FunctionTool(fn=sub.__call__, component=sub), + ] + + +class TestToolManagerComponent(GradComponent): + + def __init__(self): + super().__init__() + + print(f"sub_component: {sub.training}") + + print(f"add_component: {add.training}") + + def add_as_tool(x, y): + return add(x, y) + + self.tools = [ + FunctionTool(fn=add_as_tool, component=add), + FunctionTool(fn=sub.__call__, component=sub), + ] + + # manag by tool manager, and since the component is passed to tools_manager which is also a component, it will be in training mode + self.tools_manager = ToolManager(tools=self.tools) + + +def test_function_tool_with_grad_component(): + r"""When we set the training mode of the component, the subcomponents will change with it. + Once the subcomponent change, it will adapt to training model too. + """ + + test_com = TestComponent() + assert not test_com.training + # call the tools + output = test_com.tools[0](1, 2) + # ensure it is the call method that is called + assert output.output == 3 + test_com.train() + assert test_com.training + assert test_com.add.training + # ensure it is the forward method that is called + output = test_com.tools[0](1, 2) + assert output.output == "3 + forward" + + +def test_component_instance_outside_component(): + r"""When we set the training mode of the component, the subcomponents will change with it. + Once the subcomponent change, it will adapt to training model too. + """ + + test_com = TestComponnetInstanceOutsideComponent() + assert not test_com.training + # call the tools + output = test_com.tools[0](1, 2) + # ensure it is the call method that is called + assert output.output == 3 + test_com.train() + assert test_com.training + assert not add.training # the subcomponent is no longer in training mode + # ensure it is the forward method that is called + output = test_com.tools[0](1, 2) + assert output.output == 3 + + +def test_tool_manager_with_grad_component(): + r"""When we set the training mode of the component, the subcomponents will change with it. + Once the subcomponent change, it will adapt to training model too. 
+ """ + + test_com = TestToolManagerComponent() + assert not test_com.training + # call the tools + output = test_com.tools_manager.tools[0](1, 2) + # ensure it is the call method that is called + assert output.output == 3 + test_com.train() + assert test_com.training + assert ( + add.training + ) # the subcomponent will change as it is managed by the tool manager + # ensure it is the forward method that is called + output = test_com.tools_manager.tools[0](1, 2) + assert output.output == "3 + forward" diff --git a/benchmarks/hotpot_qa/_adal_train.py b/benchmarks/hotpot_qa/_adal_train.py deleted file mode 100644 index e397cf0f..00000000 --- a/benchmarks/hotpot_qa/_adal_train.py +++ /dev/null @@ -1,664 +0,0 @@ -"deprecated" -"""We will use dspy's retriever to keep that the same and only use our generator and optimizer""" - -import dspy -from typing import List, Union, Optional, Dict, Callable -from dataclasses import dataclass, field - -import adalflow as adal -from adalflow.optim.parameter import Parameter, ParameterType - -from adalflow.datasets.hotpot_qa import HotPotQA, HotPotQAData -from adalflow.datasets.types import Example - -from adalflow.core.retriever import Retriever - - -colbertv2_wiki17_abstracts = dspy.ColBERTv2( - url="http://20.102.90.50:2017/wiki17_abstracts" -) - -dspy.settings.configure(rm=colbertv2_wiki17_abstracts) - - -def load_datasets(): - - trainset = HotPotQA(split="train", size=20) # 20 - valset = HotPotQA(split="val", size=50) # 50 - testset = HotPotQA(split="test", size=50) # to keep the same as the dspy #50 - print(f"trainset, valset: {len(trainset)}, {len(valset)}, example: {trainset[0]}") - return trainset, valset, testset - - -# task pipeline -from typing import Any, Tuple - -from adalflow.core import Component, Generator - - -# dspy format -# Follow the following format. -# Context: may contain relevant facts -# Question: ${question} -# Reasoning: Let's think step by step in order to ${produce the query}. We ... -# Query: ${query} -@dataclass -class QueryRewritterData(adal.DataClass): - reasoning: str = field( - metadata={"desc": "The reasoning to produce the query"}, - ) - query: str = field( - metadata={"desc": "The query you produced"}, - ) - - __output_fields__ = ["reasoning", "query"] - - -@dataclass -class AnswerData(adal.DataClass): - reasoning: str = field( - metadata={"desc": "The reasoning to produce the answer"}, - ) - answer: str = field( - metadata={"desc": "The answer you produced"}, - ) - - __output_fields__ = ["reasoning", "answer"] - - -query_template = """ -Write a simple search query that will help answer a complex question. - -You will receive a context(may contain relevant facts) and a question. -Think step by step. - -{{output_format_str}} -{# Few shot demos #} -{% if few_shot_demos is not none %} -Here are some examples: -{{few_shot_demos}} -{% endif %} - - -Context: {{context}} -Question: {{question}} - -""" - -# Library gives a standard template for easy prompt -answer_template = """ -Answer questions with short factoid answers. - -You will receive context(may contain relevabt facts) and a question. -Think step by step. 
-{{output_format_str}} -{# Few shot demos #} -{% if few_shot_demos is not none %} -Here are some examples: -{{few_shot_demos}} -{% endif %} - - -Context: {{context}} -Question: {{question}} -""" - -from adalflow.core.component import fun_to_component -import re - - -@fun_to_component -def parse_string_query(text: str) -> str: - return re.search(r"Query: (.*)", text).group(1) - - -@fun_to_component -def parse_string_answer(text: str) -> str: - return re.search(r"Answer: (.*)", text).group(1) - - -from dataclasses import dataclass, field - - -@dataclass -class HotPotQADemoData(Example): - context: List[str] = field( - metadata={"desc": "The context to be used for answering the question"}, - default_factory=list, - ) - score: float = field( - metadata={"desc": "The score of the answer"}, - default=None, - ) - - -from benchmarks.hotpot_qa.dspy_train import validate_context_and_answer_and_hops - - -def convert_y_pred_to_dataclass(y_pred): - # y_pred in both eval and train mode - context: List[str] = ( - y_pred.input_args["prompt_kwargs"]["context"] - if hasattr(y_pred, "input_args") - else [] - ) - # context_str = "\n".join(context) - data = y_pred.data if hasattr(y_pred, "data") else y_pred - return DynamicDataClassFactory.from_dict( - class_name="HotPotQAData", - data={ - "answer": data, - "context": context, - }, - ) - - -def eval_fn(sample, y_pred, metadata): - if isinstance(sample, Parameter): - sample = sample.data - y_pred_obj = convert_y_pred_to_dataclass(y_pred) - return 1 if validate_context_and_answer_and_hops(sample, y_pred_obj) else 0 - - -from adalflow.core.types import RetrieverOutput, GeneratorOutput - - -# Demonstrating how to wrap other retriever to adalflow retriever and be applied in training pipeline -class DspyRetriever(Retriever): - def __init__(self, k=3): - super().__init__() - self.k = k - self.dspy_retriever = dspy.Retrieve(k=k) - - def call(self, input: str) -> List[RetrieverOutput]: - output = self.dspy_retriever(query_or_queries=input, k=self.k) - print(f"dsy_retriever output: {output}") - final_output: List[RetrieverOutput] = [] - documents = output.passages - - final_output.append( - RetrieverOutput( - query=input, - documents=documents, - doc_indices=[], - ) - ) - print(f"final_output: {final_output}") - return final_output - - -# example need to have question, -# pred needs to have query - -import adalflow as adal - - -# User customize an auto-grad operator -class MultiHopRetriever(adal.Retriever): - def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): - super().__init__() - - self.passages_per_hop = passages_per_hop - self.max_hops = max_hops - - self.data_parser = adal.DataClassParser( - data_class=QueryRewritterData, return_data_class=True, format_type="yaml" - ) - - # Grad Component - self.query_generator = Generator( - name="query_generator", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos_1", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ), - "output_format_str": self.data_parser.get_output_format_str(), - }, - template=query_template, - # output_processors=parse_string_query, - output_processors=self.data_parser, - use_cache=True, - # demo_data_class=HotPotQADemoData, - # demo_data_class_input_mapping={ - # "question": "question", - # # "context": "context", - # }, - # demo_data_class_output_mapping={"answer": lambda x: x.raw_response}, - ) - self.retrieve = 
DspyRetriever(k=passages_per_hop) - - @staticmethod - def context_to_str(context: List[str]) -> str: - return "\n".join(context) - - def call(self, *, question: str, id: str = None) -> Any: # Add id for tracing - # inference mode!!! - # output = self.forward(question, id=id) - - context = [] - self.max_hops = 1 - for hop in range(self.max_hops): - gen_out = self.query_generator( - prompt_kwargs={ - "context": self.context_to_str(context), - "question": question, - }, - id=id, - ) - query = None - # TODO: the bridge between the retriever to the generator and generator to the retriever needs to be more smooth - if isinstance(gen_out, GeneratorOutput): - query = ( # noqa: F841 - gen_out.data.query if gen_out.data and gen_out.data.query else None - ) - elif isinstance(gen_out, adal.Parameter): - gen_out.successor_map_fn = lambda x: ( - x.full_response.data.query - if x.full_response and x.full_response.data - else None - ) - print(f"gen_out: {gen_out}") - # query = ( - # gen_out.full_response.data.query - # if gen_out.full_response and gen_out.full_response.data - # else None - # ) - retrieve_out = self.retrieve(input=gen_out) - print(f"retrieve_out: {retrieve_out}") - # passages = [] - # if isinstance(retrieve_out, Parameter): - # passages = retrieve_out.data[0].documents - # else: - # passages = retrieve_out[0].documents - - # print(f"passages: {passages}") - - # context = deduplicate(context + passages) - - # # for hop in range(self.max_hops): - # last_context_param = Parameter( - # data=context, - # name=f"query_context_{id}_{0}", - # requires_opt=True, - # ) - # query = self.query_generator( - # prompt_kwargs={ - # "context": last_context_param, - # "question": question, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # print(f"output call: {output}") - # return output[0].documents - - # def forward(self, question: str, id: str = None) -> Parameter: - # question_param = question - # if not isinstance(question, Parameter): - # question_param = Parameter( - # data=question, - # name="question", - # role_desc="The question to be answered", - # requires_opt=False, - # ) - # context = [] - # self.max_hops = 1 - # # for hop in range(self.max_hops): - # last_context_param = Parameter( - # data=context, - # name=f"query_context_{id}_{0}", - # requires_opt=True, - # ) - # query = self.query_generator( - # prompt_kwargs={ - # "context": last_context_param, - # "question": question_param, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # passages = [] - # if isinstance(output, Parameter): - # passages = output.data[0].documents - # else: - # passages = output[0].documents - # # context = deduplicate(context + passages) # all these needs to gradable - # # output_param = Parameter( - # # data=passages, - # # alias=f"qa_context_{id}", - # # role_desc="The context to be used for answering the question", - # # requires_opt=True, - # # ) - # output.data = passages # reset the values to be used in the next - # if not isinstance(output, Parameter): - # raise ValueError(f"Output must be a Parameter, got {output}") - # return output - # # output_param.set_grad_fn( - # # BackwardContext( - # # backward_fn=self.backward, - # # response=output_param, - # # id=id, - # # prededecessors=prededecessors, - # # ) - # # ) - # # return output_param 
- - def backward(self, response: Parameter, id: Optional[str] = None): - print(f"MultiHopRetriever backward: {response}") - children_params = response.predecessors - # backward score to the demo parameter - for pred in children_params: - if pred.requires_opt: - # pred._score = float(response._score) - pred.set_score(response._score) - print( - f"backpropagate the score {response._score} to {pred.name}, is_teacher: {self.teacher_mode}" - ) - if pred.param_type == ParameterType.DEMOS: - # Accumulate the score to the demo - pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode - ) - print(f"Pred: {pred.name}, traces: {pred._traces}") - - -class HotPotQARAG( - Component -): # use component as not creating a new ops, but assemble existing ops - r"""Same system prompt as text-grad paper, but with our one message prompt template, which has better starting performance""" - - def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): - super().__init__() - - self.passages_per_hop = passages_per_hop - self.max_hops = max_hops - - self.multi_hop_retriever = MultiHopRetriever( - model_client=model_client, - model_kwargs=model_kwargs, - passages_per_hop=passages_per_hop, - max_hops=max_hops, - ) - # TODO: sometimes the cache will collide, so we get different evaluation - self.llm_counter = Generator( - name="QuestionAnswering", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ) - }, - template=answer_template, - output_processors=parse_string_answer, - use_cache=True, - demo_data_class=HotPotQADemoData, - demo_data_class_input_mapping={ - "question": "question", - "context": "context", - }, - demo_data_class_output_mapping={"answer": lambda x: x.raw_response}, - ) - - # TODO: the error will be a context - # a component wont handle training, forward or backward, just passing everything through - def call(self, question: str, id: str = None) -> Union[Parameter, str]: - - # normal component, will be called when in inference mode - - question_param = Parameter( - data=question, - name="question", - role_desc="The question to be answered", - requires_opt=False, - ) - context = [] # noqa: F841 - output = None - retrieved_context = self.multi_hop_retriever(question_param, id=id) - - # forming a backpropagation graph - # Make this step traceable too. 
- # for hop in range(self.max_hops): - # # make context a parameter to be able to trace - # query = self.query_generator( - # prompt_kwargs={ - # "context": Parameter( - # data=context, alias=f"query_context_{id}", requires_opt=True - # ), - # "question": question_param, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # passages = [] - # if isinstance(output, Parameter): - # passages = output.data[0].documents - # else: - # output[0].documents - # context = deduplicate(context + passages) - # print(f"context: {context}") - - output = self.llm_counter( - prompt_kwargs={ - "context": retrieved_context, - "question": question_param, - }, - id=id, - ) # already support both training (forward + call) - - if ( - not self.training - ): # if users want to customize the output, ensure to use if not self.training - - # convert the generator output to a normal data format - print(f"converting output: {output}") - - if output.data is None: - error_msg = ( - f"Error in processing the question: {question}, output: {output}" - ) - print(error_msg) - output = error_msg - else: - output = output.data - return output - - -from adalflow.optim.trainer.adal import AdalComponent -from adalflow.optim.trainer.trainer import Trainer -from adalflow.optim.few_shot.bootstrap_optimizer import BootstrapFewShot -from adalflow.eval.answer_match_acc import AnswerMatchAcc -from adalflow.optim.text_grad.text_loss_with_eval_fn import EvalFnToTextLoss -from adalflow.core.base_data_class import DynamicDataClassFactory - - -class HotPotQARAGAdal(AdalComponent): - # TODO: move teacher model or config in the base class so users dont feel customize too much - def __init__(self, task: Component, teacher_model_config: dict): - super().__init__() - self.task = task - self.teacher_model_config = teacher_model_config - - self.evaluator = AnswerMatchAcc("fuzzy_match") - self.eval_fn = self.evaluator.compute_single_item - # self.eval_fn = eval_fn - - def handle_one_task_sample( - self, sample: HotPotQAData - ) -> Any: # TODO: auto id, with index in call train examples - return self.task, {"question": sample.question, "id": sample.id} - - def handle_one_loss_sample( - self, sample: HotPotQAData, y_pred: Any - ) -> Tuple[Callable, Dict]: - return self.loss_fn.forward, { - "kwargs": { - "y": y_pred, - "y_gt": Parameter( - data=sample.answer, - role_desc="The ground truth(reference correct answer)", - name="y_gt", - requires_opt=False, - ), - } - } - - def configure_optimizers(self, *args, **kwargs): - - # TODO: simplify this, make it accept generator - parameters = [] - for name, param in self.task.named_parameters(): - param.name = name - parameters.append(param) - do = BootstrapFewShot(params=parameters) - return [do] - - def evaluate_one_sample( - self, sample: Any, y_pred: Any, metadata: Dict[str, Any] - ) -> Any: - - # we need "context" be passed as metadata - # print(f"sample: {sample}, y_pred: {y_pred}") - # convert pred to Dspy structure - - # y_obj = convert_y_pred_to_dataclass(y_pred) - # print(f"y_obj: {y_obj}") - # raise ValueError("Stop here") - if metadata: - return self.eval_fn(sample, y_pred, metadata) - return self.eval_fn(sample, y_pred) - - def configure_teacher_generator(self): - super().configure_teacher_generator(**self.teacher_model_config) - - def configure_loss_fn(self): - self.loss_fn = EvalFnToTextLoss( - eval_fn=self.eval_fn, - eval_fn_desc="ObjectCountingEvalFn, Output 
accuracy score: 1 for correct, 0 for incorrect", - backward_engine=None, - ) - - -def validate_dspy_demos( - demos_file="benchmarks/BHH_object_count/models/dspy/hotpotqa.json", -): - from adalflow.utils.file_io import load_json - - demos_json = load_json(demos_file) - - demos = demos_json["generate_answer"]["demos"] # noqa: F841 - - # task = HotPotQARAG( # noqa: F841 - # **gpt_3_model, - # passages_per_hop=3, - # max_hops=2, - # ) - # task.llm_counter.p - - -def test_multi_hop_retriever(): - - from use_cases.config import ( - gpt_3_model, - ) - - multi_hop_retriever = MultiHopRetriever( - **gpt_3_model, - passages_per_hop=3, - max_hops=2, - ) - # 1. use print - # print(multi_hop_retriever.query_generator) - # # 2. run one forward for query generator - question = "How many storeys are in the castle that David Gregory inherited?" - # context = [] - # context_str = multi_hop_retriever.context_to_str(context) - # print( - # multi_hop_retriever.query_generator( - # prompt_kwargs={"question": question, "context": context_str}, id="1" - # ) - # ) - # # verfify the prompt - # multi_hop_retriever.query_generator.print_prompt( - # **{"question": question, "context": context_str} - # ) - - # training mode - multi_hop_retriever.train() - - # 3. run one forward for retriever - print(multi_hop_retriever(question=question, id="1")) - - -def train(): - trainset, valset, testset = load_datasets() - - from use_cases.config import ( - gpt_3_model, - gpt_4o_model, - ) - - task = HotPotQARAG( - **gpt_3_model, - passages_per_hop=3, - max_hops=2, - ) - print(task) - question = "How long is the highway Whitehorse/Cousins Airport was built to support as of 2012?" - print(task(question)) - - # for name, param in task.named_parameters(): - # print(f"name: {name}, param: {param}") - - trainset, valset, testset = load_datasets() - - trainer = Trainer( - adaltask=HotPotQARAGAdal(task=task, teacher_model_config=gpt_4o_model), - max_steps=10, - raw_shots=0, - bootstrap_shots=4, - train_batch_size=4, - ckpt_path="hotpot_qa_rag", - strategy="random", - save_traces=True, - debug=True, # make it having debug mode - weighted_sampling=True, - ) - # fit include max steps - trainer.fit( - train_dataset=trainset, val_dataset=valset, test_dataset=testset, debug=True - ) - - -if __name__ == "__main__": - ### Try the minimum effort to test on any task - - # get_logger(level="DEBUG") - test_multi_hop_retriever() - - -# TODO: i forgot that i need demo_data_class -# TODO: i forgot that i need to set id -# Failed to generate demos but no error messages diff --git a/benchmarks/hotpot_qa/adal_exp/build.py b/benchmarks/hotpot_qa/adal_exp/build.py deleted file mode 100644 index 9f1d078c..00000000 --- a/benchmarks/hotpot_qa/adal_exp/build.py +++ /dev/null @@ -1,630 +0,0 @@ -"""We will use dspy's retriever to keep that the same and only use our generator and optimizer""" - -import dspy -import re -from typing import List, Union, Optional, Dict, Callable -from dataclasses import dataclass, field - -import adalflow as adal -from adalflow.optim.parameter import Parameter, ParameterType - -from adalflow.datasets.hotpot_qa import HotPotQA, HotPotQAData -from adalflow.datasets.types import Example - -from adalflow.core.retriever import Retriever -from adalflow.core.component import fun_to_component - - -colbertv2_wiki17_abstracts = dspy.ColBERTv2( - url="http://20.102.90.50:2017/wiki17_abstracts" -) - -dspy.settings.configure(rm=colbertv2_wiki17_abstracts) - - -def load_datasets(): - - trainset = HotPotQA(split="train", size=20) - valset = 
HotPotQA(split="val", size=50) - testset = HotPotQA(split="test", size=50) - print(f"trainset, valset: {len(trainset)}, {len(valset)}, example: {trainset[0]}") - return trainset, valset, testset - - -# task pipeline -from typing import Any, Tuple - -from adalflow.core import Component, Generator - - -# dspy format -# Follow the following format. -# Context: may contain relevant facts -# Question: ${question} -# Reasoning: Let's think step by step in order to ${produce the query}. We ... -# Query: ${query} -@dataclass -class QueryRewritterData(adal.DataClass): - reasoning: str = field( - metadata={"desc": "The reasoning to produce the query"}, - ) - query: str = field( - metadata={"desc": "The query you produced"}, - ) - - __output_fields__ = ["reasoning", "query"] - - -@dataclass -class AnswerData(adal.DataClass): - reasoning: str = field( - metadata={"desc": "The reasoning to produce the answer"}, - ) - answer: str = field( - metadata={"desc": "The answer you produced"}, - ) - - __output_fields__ = ["reasoning", "answer"] - - -query_template = """ -Write a simple search query that will help answer a complex question. - -You will receive a context(may contain relevant facts) and a question. -Think step by step. - -{{output_format_str}} -{# Few shot demos #} -{% if few_shot_demos is not none %} -Here are some examples: -{{few_shot_demos}} -{% endif %} - - -Context: {{context}} -Question: {{question}} - -""" - -# Library gives a standard template for easy prompt -answer_template = """ -Answer questions with short factoid answers. - -You will receive context(may contain relevabt facts) and a question. -Think step by step. -{{output_format_str}} -{# Few shot demos #} -{% if few_shot_demos is not none %} -Here are some examples: -{{few_shot_demos}} -{% endif %} - - -Context: {{context}} -Question: {{question}} -""" - - -# @fun_to_component -# def parse_string_query(text: str) -> str: -# return re.search(r"Query: (.*)", text).group(1) - - -@fun_to_component -def parse_string_answer(text: str) -> str: - return re.search(r"Answer: (.*)", text).group(1) - - -from dataclasses import dataclass, field - - -@dataclass -class HotPotQADemoData(Example): - context: List[str] = field( - metadata={"desc": "The context to be used for answering the question"}, - default_factory=list, - ) - score: float = field( - metadata={"desc": "The score of the answer"}, - default=None, - ) - - -from adalflow.core.types import RetrieverOutput, GeneratorOutput - - -# Demonstrating how to wrap other retriever to adalflow retriever and be applied in training pipeline -class DspyRetriever(Retriever): - def __init__(self, k=3): - super().__init__() - self.k = k - self.dspy_retriever = dspy.Retrieve(k=k) - - def call(self, input: str) -> List[RetrieverOutput]: - output = self.dspy_retriever(query_or_queries=input, k=self.k) - print(f"dsy_retriever output: {output}") - final_output: List[RetrieverOutput] = [] - documents = output.passages - - final_output.append( - RetrieverOutput( - query=input, - documents=documents, - doc_indices=[], - ) - ) - print(f"final_output: {final_output}") - return final_output - - -import adalflow as adal - - -# User customize an auto-grad operator -class MultiHopRetriever(adal.Retriever): - def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): - super().__init__() - - self.passages_per_hop = passages_per_hop - self.max_hops = max_hops - - self.data_parser = adal.DataClassParser( - data_class=QueryRewritterData, return_data_class=True, format_type="yaml" - ) - - # Grad 
Component - self.query_generator = Generator( - name="query_generator", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos_1", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ), - "output_format_str": self.data_parser.get_output_format_str(), - }, - template=query_template, - # output_processors=parse_string_query, - output_processors=self.data_parser, - use_cache=True, - # demo_data_class=HotPotQADemoData, - # demo_data_class_input_mapping={ - # "question": "question", - # # "context": "context", - # }, - # demo_data_class_output_mapping={"answer": lambda x: x.raw_response}, - ) - self.retrieve = DspyRetriever(k=passages_per_hop) - - @staticmethod - def context_to_str(context: List[str]) -> str: - return "\n".join(context) - - def call(self, *, question: str, id: str = None) -> Any: # Add id for tracing - # inference mode!!! - # output = self.forward(question, id=id) - - context = [] - self.max_hops = 1 - for hop in range(self.max_hops): - gen_out = self.query_generator( - prompt_kwargs={ - "context": self.context_to_str(context), - "question": question, - }, - id=id, - ) - query = None - # TODO: the bridge between the retriever to the generator and generator to the retriever needs to be more smooth - if isinstance(gen_out, GeneratorOutput): - query = ( # noqa: F841 - gen_out.data.query if gen_out.data and gen_out.data.query else None - ) - elif isinstance(gen_out, adal.Parameter): - gen_out.successor_map_fn = lambda x: ( - x.full_response.data.query - if x.full_response and x.full_response.data - else None - ) - print(f"gen_out: {gen_out}") - # query = ( - # gen_out.full_response.data.query - # if gen_out.full_response and gen_out.full_response.data - # else None - # ) - retrieve_out = self.retrieve(input=gen_out) - print(f"retrieve_out: {retrieve_out}") - # passages = [] - # if isinstance(retrieve_out, Parameter): - # passages = retrieve_out.data[0].documents - # else: - # passages = retrieve_out[0].documents - - # print(f"passages: {passages}") - - # context = deduplicate(context + passages) - - # # for hop in range(self.max_hops): - # last_context_param = Parameter( - # data=context, - # name=f"query_context_{id}_{0}", - # requires_opt=True, - # ) - # query = self.query_generator( - # prompt_kwargs={ - # "context": last_context_param, - # "question": question, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # print(f"output call: {output}") - # return output[0].documents - - # def forward(self, question: str, id: str = None) -> Parameter: - # question_param = question - # if not isinstance(question, Parameter): - # question_param = Parameter( - # data=question, - # name="question", - # role_desc="The question to be answered", - # requires_opt=False, - # ) - # context = [] - # self.max_hops = 1 - # # for hop in range(self.max_hops): - # last_context_param = Parameter( - # data=context, - # name=f"query_context_{id}_{0}", - # requires_opt=True, - # ) - # query = self.query_generator( - # prompt_kwargs={ - # "context": last_context_param, - # "question": question_param, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # passages = [] - # if 
isinstance(output, Parameter): - # passages = output.data[0].documents - # else: - # passages = output[0].documents - # # context = deduplicate(context + passages) # all these needs to gradable - # # output_param = Parameter( - # # data=passages, - # # alias=f"qa_context_{id}", - # # role_desc="The context to be used for answering the question", - # # requires_opt=True, - # # ) - # output.data = passages # reset the values to be used in the next - # if not isinstance(output, Parameter): - # raise ValueError(f"Output must be a Parameter, got {output}") - # return output - # # output_param.set_grad_fn( - # # BackwardContext( - # # backward_fn=self.backward, - # # response=output_param, - # # id=id, - # # prededecessors=prededecessors, - # # ) - # # ) - # # return output_param - - def backward(self, response: Parameter, id: Optional[str] = None): - print(f"MultiHopRetriever backward: {response}") - children_params = response.predecessors - # backward score to the demo parameter - for pred in children_params: - if pred.requires_opt: - # pred._score = float(response._score) - pred.set_score(response._score) - print( - f"backpropagate the score {response._score} to {pred.name}, is_teacher: {self.teacher_mode}" - ) - if pred.param_type == ParameterType.DEMOS: - # Accumulate the score to the demo - pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode - ) - print(f"Pred: {pred.name}, traces: {pred._traces}") - - -class HotPotQARAG( - Component -): # use component as not creating a new ops, but assemble existing ops - r"""Same system prompt as text-grad paper, but with our one message prompt template, which has better starting performance""" - - def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): - super().__init__() - - self.passages_per_hop = passages_per_hop - self.max_hops = max_hops - - self.multi_hop_retriever = MultiHopRetriever( - model_client=model_client, - model_kwargs=model_kwargs, - passages_per_hop=passages_per_hop, - max_hops=max_hops, - ) - # TODO: sometimes the cache will collide, so we get different evaluation - self.llm_counter = Generator( - name="QuestionAnswering", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ) - }, - template=answer_template, - output_processors=parse_string_answer, - use_cache=True, - demo_data_class=HotPotQADemoData, - demo_data_class_input_mapping={ - "question": "question", - "context": "context", - }, - demo_data_class_output_mapping={"answer": lambda x: x.raw_response}, - ) - - # TODO: the error will be a context - # a component wont handle training, forward or backward, just passing everything through - def call(self, question: str, id: str = None) -> Union[Parameter, str]: - - # normal component, will be called when in inference mode - - question_param = Parameter( - data=question, - name="question", - role_desc="The question to be answered", - requires_opt=False, - ) - context = [] # noqa: F841 - output = None - retrieved_context = self.multi_hop_retriever(question_param, id=id) - - # forming a backpropagation graph - # Make this step traceable too. 
- # for hop in range(self.max_hops): - # # make context a parameter to be able to trace - # query = self.query_generator( - # prompt_kwargs={ - # "context": Parameter( - # data=context, alias=f"query_context_{id}", requires_opt=True - # ), - # "question": question_param, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # passages = [] - # if isinstance(output, Parameter): - # passages = output.data[0].documents - # else: - # output[0].documents - # context = deduplicate(context + passages) - # print(f"context: {context}") - - output = self.llm_counter( - prompt_kwargs={ - "context": retrieved_context, - "question": question_param, - }, - id=id, - ) # already support both training (forward + call) - - if ( - not self.training - ): # if users want to customize the output, ensure to use if not self.training - - # convert the generator output to a normal data format - print(f"converting output: {output}") - - if output.data is None: - error_msg = ( - f"Error in processing the question: {question}, output: {output}" - ) - print(error_msg) - output = error_msg - else: - output = output.data - return output - - -from adalflow.optim.trainer.adal import AdalComponent -from adalflow.optim.trainer.trainer import Trainer -from adalflow.optim.few_shot.bootstrap_optimizer import BootstrapFewShot -from adalflow.eval.answer_match_acc import AnswerMatchAcc -from adalflow.optim.text_grad.text_loss_with_eval_fn import EvalFnToTextLoss - - -class HotPotQARAGAdal(AdalComponent): - # TODO: move teacher model or config in the base class so users dont feel customize too much - def __init__(self, task: Component, teacher_model_config: dict): - super().__init__() - self.task = task - self.teacher_model_config = teacher_model_config - - self.evaluator = AnswerMatchAcc("fuzzy_match") - self.eval_fn = self.evaluator.compute_single_item - # self.eval_fn = eval_fn - - def handle_one_task_sample( - self, sample: HotPotQAData - ) -> Any: # TODO: auto id, with index in call train examples - return self.task, {"question": sample.question, "id": sample.id} - - def handle_one_loss_sample( - self, sample: HotPotQAData, y_pred: Any - ) -> Tuple[Callable, Dict]: - return self.loss_fn.forward, { - "kwargs": { - "y": y_pred, - "y_gt": Parameter( - data=sample.answer, - role_desc="The ground truth(reference correct answer)", - name="y_gt", - requires_opt=False, - ), - } - } - - def configure_optimizers(self, *args, **kwargs): - - # TODO: simplify this, make it accept generator - parameters = [] - for name, param in self.task.named_parameters(): - param.name = name - parameters.append(param) - do = BootstrapFewShot(params=parameters) - return [do] - - def evaluate_one_sample( - self, sample: Any, y_pred: Any, metadata: Dict[str, Any] - ) -> Any: - - # we need "context" be passed as metadata - # print(f"sample: {sample}, y_pred: {y_pred}") - # convert pred to Dspy structure - - # y_obj = convert_y_pred_to_dataclass(y_pred) - # print(f"y_obj: {y_obj}") - # raise ValueError("Stop here") - if metadata: - return self.eval_fn(sample, y_pred, metadata) - return self.eval_fn(sample, y_pred) - - def configure_teacher_generator(self): - super().configure_teacher_generator(**self.teacher_model_config) - - def configure_loss_fn(self): - self.loss_fn = EvalFnToTextLoss( - eval_fn=self.eval_fn, - eval_fn_desc="ObjectCountingEvalFn, Output accuracy score: 1 for correct, 0 for incorrect", - backward_engine=None, - 
) - - -def validate_dspy_demos( - demos_file="benchmarks/BHH_object_count/models/dspy/hotpotqa.json", -): - from adalflow.utils.file_io import load_json - - demos_json = load_json(demos_file) - - demos = demos_json["generate_answer"]["demos"] # noqa: F841 - - # task = HotPotQARAG( # noqa: F841 - # **gpt_3_model, - # passages_per_hop=3, - # max_hops=2, - # ) - # task.llm_counter.p - - -def test_multi_hop_retriever(): - - from use_cases.config import ( - gpt_3_model, - ) - - multi_hop_retriever = MultiHopRetriever( - **gpt_3_model, - passages_per_hop=3, - max_hops=2, - ) - # 1. use print - # print(multi_hop_retriever.query_generator) - # # 2. run one forward for query generator - question = "How many storeys are in the castle that David Gregory inherited?" - # context = [] - # context_str = multi_hop_retriever.context_to_str(context) - # print( - # multi_hop_retriever.query_generator( - # prompt_kwargs={"question": question, "context": context_str}, id="1" - # ) - # ) - # # verfify the prompt - # multi_hop_retriever.query_generator.print_prompt( - # **{"question": question, "context": context_str} - # ) - - # training mode - multi_hop_retriever.train() - - # 3. run one forward for retriever - print(multi_hop_retriever(question=question, id="1")) - - -def train(): - trainset, valset, testset = load_datasets() - - from use_cases.config import ( - gpt_3_model, - gpt_4o_model, - ) - - task = HotPotQARAG( - **gpt_3_model, - passages_per_hop=3, - max_hops=2, - ) - print(task) - question = "How long is the highway Whitehorse/Cousins Airport was built to support as of 2012?" - print(task(question)) - - # for name, param in task.named_parameters(): - # print(f"name: {name}, param: {param}") - - trainset, valset, testset = load_datasets() - - trainer = Trainer( - adaltask=HotPotQARAGAdal(task=task, teacher_model_config=gpt_4o_model), - max_steps=10, - raw_shots=0, - bootstrap_shots=4, - train_batch_size=4, - ckpt_path="hotpot_qa_rag", - strategy="random", - save_traces=True, - debug=True, # make it having debug mode - weighted_sampling=True, - ) - # fit include max steps - trainer.fit( - train_dataset=trainset, val_dataset=valset, test_dataset=testset, debug=True - ) - - -if __name__ == "__main__": - ### Try the minimum effort to test on any task - - # get_logger(level="DEBUG") - test_multi_hop_retriever() - - -# TODO: i forgot that i need demo_data_class -# TODO: i forgot that i need to set id -# Failed to generate demos but no error messages diff --git a/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py b/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py index cebcfdf2..de3ecfcc 100644 --- a/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py @@ -1,7 +1,7 @@ """We will use dspy's retriever to keep that the same and only use our generator and optimizer""" import dspy -from typing import List +from typing import List, Optional from dataclasses import dataclass, field import adalflow as adal @@ -12,6 +12,9 @@ from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import DspyRetriever from adalflow.utils.logger import printc +from adalflow.components.agent.react import ReActAgent + +from adalflow.optim.grad_component import GradComponent2 colbertv2_wiki17_abstracts = dspy.ColBERTv2( url="http://20.102.90.50:2017/wiki17_abstracts" @@ -52,17 +55,34 @@ class QueryRewritterData(adal.DataClass): {% endif %} -Context: {{context}} Question: {{question}} +{% if last_query is not none %} +Last Query: {{last_query}} +{% endif %} +{% if context is not 
none %} +Context from last search query: {{context}} +{% endif %} """ +@dataclass +class QueriesOutput(adal.DataClass): + data: str = field( + metadata={"desc": "The joined queries"}, + ) + id: str = field( + metadata={"desc": "The id of the output"}, + ) + + class DeduplicateList(adal.GradComponent): def __init__(self): super().__init__() - def call(self, exisiting_list: List[str], new_list: List[str]) -> List[str]: + def call( + self, exisiting_list: List[str], new_list: List[str], id: str = None + ) -> List[str]: seen = set() return [x for x in exisiting_list + new_list if not (x in seen or seen.add(x))] @@ -73,12 +93,80 @@ def backward(self, *args, **kwargs): return super().backward(*args, **kwargs) -# User customize an auto-grad operator -# Need this to be a GradComponent +class CombineList(GradComponent2): + def __init__( + self, + name="CombineRetrieverOut", + desc="combines two lists and deduplicate with set", + ): + super().__init__(name=name, desc=desc) + + def call( + self, + context_1: adal.RetrieverOutput, + context_2: adal.RetrieverOutput, + id: str = None, + ) -> List[str]: + + seen = set() + lists_1 = context_1.documents + lists_2 = context_2.documents + combined = [x for x in lists_1 + lists_2 if not (x in seen or seen.add(x))] + + output = adal.RetrieverOutput( + id=id, + # query=f"query 1: {context_1.query}, query 2: {context_2.query}", + query=[context_1.query, context_2.query], + documents=combined, + doc_indices=[], + ) + return output + +class CombineQueries(GradComponent2): + def __init__( + self, + name="CombineTwoQueries using ','", + desc="combines two queries for evaluation", + ): + super().__init__(name=name, desc=desc) -# NOTE: deprecated -class MultiHopRetriever(adal.Retriever): + def call( + self, + q_1: str, + q_2: str, + id: str = None, + ) -> QueriesOutput: + + value = f"{q_1}, {q_2}" + + output = QueriesOutput(data=value, id=id) + + return output + + +query_generator_task_desc = """Write a simple search query that will help answer a complex question. + +You will receive a context(may contain relevant facts) and a question. +Think step by step.""" + + +task_desc_str = """ +You will receive an original question, last search query, and the retrieved context from the last search query. +Write the next search query to help retrieve all relevant context to answer the original question. +Think step by step.""" + +task_desc_str_system_finetuned = """ +Write a search query to identify key information step by step. Begin by extracting names or entities directly referenced in the question. Use retrieved data to iteratively refine subsequent queries, targeting specific attributes such as filmographies, roles, or numerical criteria (e.g., number of movies or TV shows). Adjust the query dynamically based on gaps or ambiguities in retrieved results. +""" + +task_desc_system_finedtuned_separately = [ + "Write a search query that extracts the key entity or fact required to begin answering the question. Focus on identifying specific names, titles, or roles directly referenced in the question. The query should aim to retrieve precise and relevant details (e.g., the name of a person, cast members of a movie, or associated facts) to refine understanding of the question.", + "Based on the retrieved results, refine the search query to target detailed information that resolves the question. Use retrieved entities or partial answers to adjust the query dynamically. 
If gaps or ambiguities remain, incorporate criteria from the original question (e.g., specific numbers, attributes, or context) to improve precision and relevance.", +] + + +class MultiHopRetrieverCycle(adal.Retriever): def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): super().__init__() @@ -89,146 +177,190 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): data_class=QueryRewritterData, return_data_class=True, format_type="json" ) - # Grad Component - self.query_generators: List[adal.Generator] = [] - for i in range(self.max_hops): - self.query_generators.append( - adal.Generator( - name=f"query_generator_{i}", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos_1", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ), - "task_desc_str": Parameter( - name="task_desc_str", - data="""Write a simple search query that will help answer a complex question. + # only one generator which will be used in a loop, called max_hops times + self.query_generator: adal.Generator = adal.Generator( + name="query_generator", + model_client=model_client, + model_kwargs=model_kwargs, + prompt_kwargs={ + # "few_shot_demos": Parameter( + # name="few_shot_demos", + # data=None, + # role_desc="To provide few shot demos to the language model", + # requires_opt=True, + # param_type=ParameterType.DEMOS, + # ), + "task_desc_str": Parameter( + name="task_desc_str", + data=query_generator_task_desc, + # data=task_desc_str_system_finetuned, + # data=task_desc_system_finedtuned_separately[0], + role_desc="Task description for the language model. Used together with \ + the following template: \ + Question: {{question}} \ +{% if last_query is not none %} \ +Last Query: {{last_query}}\ +{% endif %}\ +{% if context is not none %}\ +Context from last search query: {{context}}\ +{% endif %}", + requires_opt=True, + param_type=ParameterType.PROMPT, + ), + "output_format_str": self.data_parser.get_output_format_str(), + }, + template=query_template, + output_processors=self.data_parser, + use_cache=True, + ) -You will receive a context(may contain relevant facts) and a question. 
-Think step by step.""", - role_desc="Task description for the language model", - requires_opt=True, - param_type=ParameterType.PROMPT, - ), - "output_format_str": self.data_parser.get_output_format_str(), - }, - template=query_template, - output_processors=self.data_parser, - use_cache=True, - ) - ) self.retriever = DspyRetriever(top_k=passages_per_hop) self.deduplicater = DeduplicateList() + self.combine_list = CombineList() @staticmethod def context_to_str(context: List[str]) -> str: return "\n".join(context) - @staticmethod - def deduplicate(seq: list[str]) -> list[str]: - """ - Source: https://stackoverflow.com/a/480227/1493011 - """ - - seen = set() - return [x for x in seq if not (x in seen or seen.add(x))] - - def call(self, *, question: str, id: str = None) -> adal.RetrieverOutput: - context = [] - print(f"question: {question}") - for i in range(self.max_hops): - gen_out = self.query_generators[i]( - prompt_kwargs={ - "context": self.context_to_str(context), - "question": question, - }, - id=id, - ) - - query = gen_out.data.query if gen_out.data and gen_out.data.query else None + def call(self, *, input: str, id: str = None) -> List[adal.RetrieverOutput]: + # assemble the foundamental building blocks + printc(f"question: {input}", "yellow") + out = self.forward(input=input, id=id) - print(f"query {i}: {query}") + if not isinstance(out, adal.Parameter): + raise ValueError("The output should be a parameter") - retrieve_out = self.retriever.call(input=query) - passages = retrieve_out[0].documents - context = self.deduplicate(context + passages) - out = [adal.RetrieverOutput(documents=context, query=query, doc_indices=[])] - return out + return out.data # or full response its up to users - def forward(self, *, question: str, id: str = None) -> adal.Parameter: + def forward(self, *, input: str, id: str = None) -> adal.Parameter: # assemble the foundamental building blocks context = [] - print(f"question: {question}") # 1. 
make question a parameter as generator does not have it yet # can create the parameter at the leaf, but not the intermediate nodes question_param = adal.Parameter( name="question", - data=question, + data=input, role_desc="The question to be answered", - requires_opt=True, - param_type=ParameterType.INPUT, - ) - context_param = adal.Parameter( - name="context", - data=context, - role_desc="The context to be used for the query", - requires_opt=True, + requires_opt=False, param_type=ParameterType.INPUT, ) - context_param.add_successor_map_fn( - successor=self.query_generators[0], - map_fn=lambda x: self.context_to_str(x.data), - ) + contexts = [] + last_query = None for i in range(self.max_hops): + # printc(f"hop: {i}", "yellow") - gen_out = self.query_generators[i].forward( + gen_out = self.query_generator.forward( prompt_kwargs={ - "context": context_param, + "context": context, "question": question_param, + "last_query": last_query, + # "task_desc_str": task_desc_system_finedtuned_separately[ + # i + # ], # replace this at runtime }, id=id, ) - + # prompt_kwargs = { + # "context": context, + # "question": question_param, + # "last_query": last_query, + # } + # prompt = self.query_generator.get_prompt(**prompt_kwargs) + # printc(f"prompt: {prompt}", "yellow") + + # printc(f"query {i}: {gen_out.data.data.query}", "yellow") + # extract the query from the generator output success_map_fn = lambda x: ( # noqa E731 - x.full_response.data.query - if x.full_response - and x.full_response.data - and x.full_response.data.query - else None + x.data.data.query + if x.data and x.data.data and x.data.data.query + else (x.data.raw_response if x.data and x.data.raw_response else None) ) - print(f"query {i}: {success_map_fn(gen_out)}") + # print(f"query {i}: {success_map_fn(gen_out)}") gen_out.add_successor_map_fn( successor=self.retriever, map_fn=success_map_fn ) + # printc(f"before retrieve_out: {success_map_fn(gen_out)}", "yellow") - retrieve_out = self.retriever.forward(input=gen_out) + # retrieve the passages + retrieve_out: adal.Parameter = self.retriever.forward(input=gen_out, id=id) + # printc(f"retrieve_out: {retrieve_out}", "yellow") - def retrieve_out_map_fn(x: adal.Parameter): - return x.data[0].documents if x.data and x.data[0].documents else [] + retrieve_out.data_in_prompt = lambda x: { + "query": x.data.query, + "documents": x.data.documents, + } + if i + 1 < self.max_hops: + last_query = gen_out + + last_query.add_successor_map_fn( + successor=self.query_generator, map_fn=success_map_fn + ) - print(f"retrieve_out: {retrieve_out}") + def retrieve_out_map_fn(x: adal.Parameter): + return x.data.documents if x.data and x.data.documents else [] + # add the map function to the retrieve_out retrieve_out.add_successor_map_fn( successor=self.deduplicater, map_fn=retrieve_out_map_fn ) + context = retrieve_out + if i + 1 < self.max_hops: + context.add_successor_map_fn( + successor=self.query_generator, map_fn=retrieve_out_map_fn + ) - context_param = self.deduplicater.forward( - exisiting_list=context_param, new_list=retrieve_out - ) + contexts.append(context) + + contexts[0].add_successor_map_fn( + successor=self.combine_list, map_fn=lambda x: x.data + ) + contexts[1].add_successor_map_fn( + successor=self.combine_list, map_fn=lambda x: x.data + ) + + context_sum = self.combine_list.forward(contexts[0], contexts[1]) + return context_sum - context_param.param_type = ParameterType.RETRIEVER_OUTPUT - return context_param +# task_desc_str = """Write a simple search query that will help answer a complex 
question. +# You will receive a context(may contain relevant facts) and a question. +# Think step by step.""" -class MultiHopRetriever2(adal.Retriever): + +trained_task_desc_strs = [ + "You are tasked with formulating precise search queries using the original question, last search query, and its retrieved context. Prioritize identifying, emphasizing, and explicitly including all crucial entities, relationships, and geographical details mentioned in the question. Ensure comprehensive retrieval by focusing on key elements such as specific individuals (e.g., 'Kyrie Irving'), roles, or contextual details required for accuracy. Demonstrate reasoning by cross-referencing multiple sources and provide clear examples where necessary. Adapt queries to capture all nuances effectively for improved relevance and accuracy. Think step by step.", + "You will receive an original question, the last search query, and the retrieved context from that search. Write the next search query to ensure comprehensive retrieval of all relevant context needed to answer the original question. Emphasize identifying, precisely including, and verifying specific key entities, historical events, and factual names directly linked to the question within the context. Explicitly use the context to confirm and match critical entities to improve recall and ensure consistency with the targeted entities. Avoid irrelevant inclusions or false positives by cross-referencing data and verifying alignment accurately. Think step by step.", +] + +trained_task_desc_strs = [ + "You will receive an original question, last search query, and the retrieved context from the last search query. Identify key entities, explicitly named individuals, and specific versions (e.g., specific film versions) in the original question to ensure comprehensive and focused retrieval. Craft a refined search query to help retrieve relevant context, prioritizing connections and biographical details needed. Think step by step.", + "You will receive an original question, last search query, and the retrieved context from the last search query. Analyze both the question and context to craft the next search query. Focus on all pertinent entities, especially notable individuals, mentioned in the question and context to ensure comprehensive coverage. Think step by step.", +] + +few_shot_demos = [ + "reasoning: The question is asking for the individual who defeated Sander Levin in\n a specific election, the Michigan gubernatorial election of 1970. I need to determine\n who his opponent was and who won that election. Hence, I should focus the search\n on the Michigan gubernatorial election of 1970, Sander Levin, and the name of the\n winner.\nquery: Michigan gubernatorial election 1970 winner Sander Levin\n\nquestion: What is the name of this American law firm headquartered in Little Rock,\n Arkansas, which was co-founded by Robert Crittenden?\nanswer: Rose Law Firm", + "reasoning: The context provides information about Kirk Humphreys, the chairman of\n The Humphreys Company, and his birth date as September 13, 1950. 
It also mentions\n that he lost in a primary to former Congressman Tom Coburn, who is a medical doctor.\n To determine who is older, we need to find the birth date of Tom Coburn.\nquery: Tom Coburn birth date\n\nquestion: In which century was football introduced to this region represented by FC\n Espanya de Barcelona?\nanswer: 19th century", +] + +manual_task_desc_strs = [ + "You will receive an question that requires 2 retrieveal steps to have enough context to answer. \ + You are the first step, write a simple search query to retrieve the first part of the context. \ + Think step by step.", + "You will receive an original question, last search query, and the retrieved context from the last search query. Write the next search query to help retrieve all relevant context to answer the original question. Think step by step.", +] + + +# task_desc_str = """ You are a query assistant that helps search all relevant context to answer a multi-hop question. + +# You will a question, and existing context(may contain relevant facts along with its sub-questions). +# Write a new simple search query to help retrieve the relevant context to answer the question. +# Think step by step.""" + + +class MultiHopRetriever(adal.Component): def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): super().__init__() @@ -239,11 +371,10 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): data_class=QueryRewritterData, return_data_class=True, format_type="json" ) - # Grad Component - # self.query_generators: List[adal.Generator] = [] self.query_generators: adal.ComponentList[adal.Generator] = adal.ComponentList() self.retrievers: List[Retriever] = [] self.deduplicaters: List[adal.GradComponent] = [] + for i in range(self.max_hops): self.query_generators.append( adal.Generator( @@ -251,20 +382,27 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): model_client=model_client, model_kwargs=model_kwargs, prompt_kwargs={ - "few_shot_demos": Parameter( - name=f"few_shot_demos_{i}", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ), + # "few_shot_demos": Parameter( + # name=f"few_shot_demos_{i}", + # # data=few_shot_demos[i], + # data=None, + # role_desc="To provide few shot demos to the language model", + # requires_opt=True, + # param_type=ParameterType.DEMOS, + # ), "task_desc_str": Parameter( name="task_desc_str", - data="""Write a simple search query that will help answer a complex question. - -You will receive a context(may contain relevant facts) and a question. 
-Think step by step.""", - role_desc="Task description for the language model", + data=task_desc_str, + # data=manual_task_desc_strs[i], + role_desc=f"""Task description for the {i+1}th language model.""" + + "Used together with the following template: \ +Question: {{question}} \ +{% if last_query is not none %} \ +Last Query: {{last_query}}\ +{% endif %}\ +{% if context is not none %}\ +Context from last search query: {{context}}\ +{% endif %}", requires_opt=True, param_type=ParameterType.PROMPT, ), @@ -278,6 +416,9 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): self.retrievers.append(DspyRetriever(top_k=passages_per_hop)) self.deduplicaters.append(DeduplicateList()) + self.combine_list = CombineList() + self.combine_queries = CombineQueries() + @staticmethod def context_to_str(context: List[str]) -> str: return "\n".join(context) @@ -291,51 +432,75 @@ def deduplicate(seq: list[str]) -> list[str]: seen = set() return [x for x in seq if not (x in seen or seen.add(x))] - # def call(self, *, question: str, id: str = None) -> adal.RetrieverOutput: - # context = [] - # print(f"question: {question}") - # for i in range(self.max_hops): - # gen_out = self.query_generators[i]( - # prompt_kwargs={ - # "context": self.context_to_str(context), - # "question": question, - # }, - # id=id, - # ) - - # query = gen_out.data.query if gen_out.data and gen_out.data.query else None - - # print(f"query {i}: {query}") - - # retrieve_out = self.retrievers[i].call(input=query) - # passages = retrieve_out[0].documents - # context = self.deduplicate(context + passages) - # out = [adal.RetrieverOutput(documents=context, query=query, doc_indices=[])] - # return out - - # TODO: simplify and avoid the need where users need to write two methods (call and forward) - def call(self, *, input: str, id: str = None) -> List[adal.RetrieverOutput]: - # assemble the foundamental building blocks - printc(f"question: {input}", "yellow") - out = self.forward(input=input, id=id) + def call(self, *, input: str, id: str = None) -> adal.RetrieverOutput: + context = [] + queries: List[str] = [] + last_query = None + for i in range(self.max_hops): + gen_out = self.query_generators[i]( + prompt_kwargs={ + "context": context, + "question": input, + "last_query": last_query, + }, + id=id, + ) - if not isinstance(out, adal.Parameter): - raise ValueError("The output should be a parameter") + query = gen_out.data.query if gen_out.data and gen_out.data.query else input - return out.data # or full response its up to users + retrieve_out = self.retrievers[i](input=query, id=id) + + passages = retrieve_out.documents + context = self.deduplicate(context + passages) + queries.append(query) + last_query = query + out = adal.RetrieverOutput( + documents=context, query=queries, doc_indices=[], id=id + ) + return out + + def call2(self, *, input: str, id: str = None) -> str: + context = [] + queries: List[str] = [] + last_query = None + for i in range(self.max_hops): + gen_out = self.query_generators[i]( + prompt_kwargs={ + "context": context, + "question": input, + "last_query": last_query, + }, + id=id, + ) + + query = gen_out.data.query if gen_out.data and gen_out.data.query else input + + retrieve_out = self.retrievers[i](input=query, id=id) + + passages = retrieve_out.documents + context = self.deduplicate(context + passages) + queries.append(query) + last_query = query + out = ", ".join(queries) + query_output = QueriesOutput(data=out, id=id) + return query_output def forward(self, *, input: str, id: str = None) 
-> adal.Parameter: # assemble the foundamental building blocks printc(f"question: {input}", "yellow") - context = [] + # context = [] queries: List[str] = [] - for i in range(self.max_hops): + context = [] + last_query = None + contexts: List[Parameter] = [] - gen_out = self.query_generators[i].forward( + for i in range(self.max_hops): + gen_out: Parameter = self.query_generators[i].forward( prompt_kwargs={ - "context": context, # can be a list or a parameter + "context": context, + "last_query": last_query, "question": adal.Parameter( name="question", data=input, @@ -348,17 +513,11 @@ def forward(self, *, input: str, id: str = None) -> adal.Parameter: ) success_map_fn = lambda x: ( # noqa E731 - x.full_response.data.query - if x.full_response - and x.full_response.data - and x.full_response.data.query - else ( - x.full_response.raw_response - if x.full_response and x.full_response.raw_response - else None - ) + x.data.data.query + if x.data and x.data.data and x.data.data.query + else (x.data.raw_response if x.data and x.data.raw_response else None) ) - print(f"query {i}: {success_map_fn(gen_out)}") + # printc(f"query {i}: {success_map_fn(gen_out)}") queries.append(success_map_fn(gen_out)) @@ -372,41 +531,153 @@ def forward(self, *, input: str, id: str = None) -> adal.Parameter: retrieve_out = self.retrievers[i].forward(input=gen_out, id=id) def retrieve_out_map_fn(x: adal.Parameter): - return x.data[0].documents if x.data and x.data[0].documents else [] + return x.data.documents if x.data and x.data.documents else [] # print(f"retrieve_out: {retrieve_out}") - retrieve_out.add_successor_map_fn( - successor=self.deduplicaters[i], map_fn=retrieve_out_map_fn + # retrieve_out.add_successor_map_fn( + # successor=self.deduplicaters[i], map_fn=retrieve_out_map_fn + # ) + + # context = self.deduplicaters[i].forward( + # exisiting_list=context, new_list=retrieve_out + # ) + retrieve_out.data_in_prompt = lambda x: { + "query": x.data.query, + "documents": x.data.documents, + } + context = retrieve_out + if i + 1 < self.max_hops: + context.add_successor_map_fn( + successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn + ) + last_query = success_map_fn(gen_out) + contexts.append(retrieve_out) + # if i + 1 < self.max_hops: + # retrieve_out.add_successor_map_fn( + # successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn + # ) + + # last_query = success_map_fn(gen_out) + # printc(f"retrieve_out, last_query: {last_query}") + + contexts[0].add_successor_map_fn( + successor=self.combine_list, map_fn=lambda x: x.data + ) + contexts[1].add_successor_map_fn( + successor=self.combine_list, map_fn=lambda x: x.data + ) + contexts_sum = self.combine_list.forward( + context_1=contexts[0], context_2=contexts[1] + ) + contexts_sum.data_in_prompt = lambda x: { + "query": x.data.query, + "documents": x.data.documents, + } + + return contexts_sum + + # TODO: might need to support multiple output parameters + def forward2(self, *, input: str, id: str = None) -> List[adal.Parameter]: + r"""Experiment multiple output parameters for multiple evaluation.""" + # assemble the foundamental building blocks + printc(f"question: {input}", "yellow") + + queries: List[adal.Parameter] = [] + + context = [] + last_query = None + contexts: List[Parameter] = [] + + for i in range(self.max_hops): + gen_out: Parameter = self.query_generators[i].forward( + prompt_kwargs={ + "context": context, + "last_query": last_query, + "question": adal.Parameter( + name="question", + data=input, + role_desc="The question to be 
answered", + requires_opt=False, + param_type=ParameterType.INPUT, + ), + }, + id=id, ) - context = self.deduplicaters[i].forward( - exisiting_list=context, new_list=retrieve_out + success_map_fn = lambda x: ( # noqa E731 + x.data.data.query + if x.data and x.data.data and x.data.data.query + else (x.data.raw_response if x.data and x.data.raw_response else None) ) + # printc(f"query {i}: {success_map_fn(gen_out)}") - context.param_type = ParameterType.RETRIEVER_OUTPUT + # queries.append(success_map_fn(gen_out)) + queries.append(gen_out) - def context_to_retrover_output(x): - return [ - adal.RetrieverOutput( - documents=x.data, query=[input] + queries, doc_indices=[] - ) - ] + gen_out.add_successor_map_fn( + successor=self.retrievers[i], map_fn=success_map_fn + ) - context.data = context_to_retrover_output(context) + if success_map_fn(gen_out) is None: + raise ValueError(f"The query is None, please check the generator {i}") + + retrieve_out = self.retrievers[i].forward(input=gen_out, id=id) - printc(f"MultiHopRetriever2 grad fn: {context.grad_fn}", "yellow") + def retrieve_out_map_fn(x: adal.Parameter): + return x.data.documents if x.data and x.data.documents else [] - return context + # print(f"retrieve_out: {retrieve_out}") - def backward(self, *args, **kwargs): + # retrieve_out.add_successor_map_fn( + # successor=self.deduplicaters[i], map_fn=retrieve_out_map_fn + # ) + context = retrieve_out + if i + 1 < self.max_hops: + context.add_successor_map_fn( + successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn + ) - printc(f"MultiHopRetriever2 backward: {args}", "yellow") - super().backward(*args, **kwargs) - return + # context = self.deduplicaters[i].forward( + # exisiting_list=context, new_list=retrieve_out + # ) + contexts.append(retrieve_out) + if i + 1 < self.max_hops: + retrieve_out.add_successor_map_fn( + successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn + ) + + last_query = success_map_fn(gen_out) + # printc(f"retrieve_out, last_query: {last_query}") + + # contexts[0].add_successor_map_fn( + # successor=self.combine_list, map_fn=lambda x: x.data + # ) + # contexts[1].add_successor_map_fn( + # successor=self.combine_list, map_fn=lambda x: x.data + # ) + # contexts_sum = self.combine_list.forward( + # context_1=contexts[0], context_2=contexts[1] + # ) + # contexts_sum.data_in_prompt = lambda x: { + # "query": x.data.query, + # "documents": x.data.documents, + # } + # setattr(contexts_sum, "queries", [q.data.data.query for q in queries]) + queries[0].add_successor_map_fn( + successor=self.combine_queries, map_fn=lambda x: x.data.data.query + ) + queries[1].add_successor_map_fn( + successor=self.combine_queries, map_fn=lambda x: x.data.data.query + ) + combined_queries = self.combine_queries.forward(q_1=queries[0], q_2=queries[1]) + printc(f"queries: {combined_queries.data}", "yellow") + return combined_queries -from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import VanillaRAG +from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import ( + VanillaRAG, +) class MultiHopRAG(VanillaRAG): @@ -418,12 +689,138 @@ def __init__( model_client=model_client, model_kwargs=model_kwargs, ) - self.retriever = MultiHopRetriever2( + self.retriever = MultiHopRetriever( model_client=model_client, model_kwargs=model_kwargs, passages_per_hop=passages_per_hop, max_hops=max_hops, ) + # update the parameters to untainable + # for name, param in self.llm.named_parameters(): + # param.requires_opt = False + # printc(f"param: {name} requires_opt: {param.requires_opt}", "yellow") 
+ + +class MultiHopRAGCycle(VanillaRAG): + def __init__( + self, passages_per_hop=3, max_hops=2, model_client=None, model_kwargs=None + ): + super().__init__( + passages_per_hop=passages_per_hop, + model_client=model_client, + model_kwargs=model_kwargs, + ) + self.retriever = MultiHopRetrieverCycle( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=passages_per_hop, + max_hops=max_hops, + ) + + +# TODO: agent needs storage for the context instead of all in the step history. +class AgenticRAG(adal.GradComponent): + def __init__(self, model_client, model_kwargs): + super().__init__() + + self.dspy_retriever = DspyRetriever(top_k=2) + # self.llm_parser = adal.DataClassParser( + # data_class=AnswerData, return_data_class=True, format_type="json" + # ) + # self.llm = adal.Generator( + # model_client=model_client, + # model_kwargs=model_kwargs, + # template=answer_template, + # prompt_kwargs={ + # "task_desc_str": adal.Parameter( + # data=task_desc_str, + # role_desc="Task description for the language model", + # param_type=adal.ParameterType.PROMPT, + # requires_opt=True, + # ), + # "few_shot_demos": adal.Parameter( + # data=None, + # requires_opt=None, + # role_desc="To provide few shot demos to the language model", + # param_type=adal.ParameterType.DEMOS, + # ), + # "output_format_str": self.llm_parser.get_output_format_str(), + # }, + # output_processors=self.llm_parser, + # ) + + # self.context = [] + + def dspy_retriever_as_tool( + input: str, + # context_variables: Dict, + id: Optional[str] = None, + ) -> List[str]: + r"""Retrieves the top 2 passages from using input as the query. + Ensure you get all the context to answer the original question. + """ + output = self.dspy_retriever(input=input, id=id) + parsed_output = output + if isinstance(output, adal.Parameter): + parsed_output = output.data + return output + documents = parsed_output[0].documents + # if context_variables: + # context_variables["context"].extend(documents) + return documents + + # def generator_as_tool( + # input: str, + # context_variables: Dict, + # id: Optional[str] = None, + # ) -> str: + # r"""Generates the answer to the question(input) and the context from the context_variables(Dict). + # Example: generator_as_tool(original question, context_variables=context_variables) + + # YOU MUST call generator_as_tool once to produce the final answer. 
+ # """ + # context = context_variables["context"] + # # print(f"context: {context}") + # output = self.llm( + # prompt_kwargs={"question": input, "context": context}, id=id + # ) + # return output + + from adalflow.core.func_tool import FunctionTool + + tools = [ + FunctionTool(self.dspy_retriever.__call__, component=self.dspy_retriever), + # FunctionTool(generator_as_tool, component=self.llm), + ] # NOTE: agent is not doing well to call component methods at this moment + + tools = [ + FunctionTool(dspy_retriever_as_tool, component=self.dspy_retriever), + # FunctionTool(generator_as_tool, component=self.llm), + ] + + self.agent = ReActAgent( + max_steps=4, + add_llm_as_fallback=True, + tools=tools, + model_client=model_client, + model_kwargs=model_kwargs, + context_variables={"context": []}, + ) + + def forward(self, *args, **kwargs) -> Parameter: + return self.bicall(*args, **kwargs) + + def call(self, *args, **kwargs): + return self.bicall(*args, **kwargs) + + def bicall(self, input: str, id: str = None) -> str: + out = self.agent(input=input, id=id) + if isinstance(out, adal.Parameter): + return out + return out # .observation ReactOutput + # if isinstance(out, adal.Parameter): + # return out.data[-1].observation + # return out[-1].observation def test_multi_hop_retriever(): @@ -438,17 +835,71 @@ def test_multi_hop_retriever(): max_hops=2, ) + question = "How many storeys are in the castle that David Gregory inherited?" + print(f"multi_hop_retriever: {multi_hop_retriever}") + return + # eval mode + output = multi_hop_retriever.call(input=question, id="1") + print(output) + + # train mode + multi_hop_retriever.train() + output = multi_hop_retriever.forward(input=question, id="1") + print(output) + output.draw_graph() + + +def test_multi_hop_retriever_cycle(): + + from use_cases.config import ( + gpt_3_model, + ) + + multi_hop_retriever = MultiHopRetrieverCycle( + **gpt_3_model, + passages_per_hop=3, + max_hops=2, + ) + question = "How many storeys are in the castle that David Gregory inherited?" # eval mode - output = multi_hop_retriever.call(question=question, id="1") + output = multi_hop_retriever.call(input=question, id="1") print(output) # train mode multi_hop_retriever.train() - output = multi_hop_retriever.forward(question=question, id="1") + output = multi_hop_retriever.forward(input=question, id="1") print(output) output.draw_graph() + output.draw_output_subgraph() + output.draw_component_subgraph() + + +def test_agent_rag(): + + from use_cases.config import ( + gpt_3_model, + ) + + task = AgenticRAG( + **gpt_3_model, + ) + print(task) + + question = "How many storeys are in the castle that David Gregory inherited?" 
+ + task.train() + task(input=question, id="1") + + # output = + # print(output) + # output.draw_graph() + # output.draw_output_subgraph() + # output.draw_component_subgraph() + + # task.eval() + # output = task(input=question, id="1") def test_multi_hop_retriever2(): @@ -457,7 +908,7 @@ def test_multi_hop_retriever2(): gpt_3_model, ) - multi_hop_retriever = MultiHopRetriever2( + multi_hop_retriever = MultiHopRetriever( **gpt_3_model, passages_per_hop=3, max_hops=2, @@ -529,6 +980,9 @@ def test_multi_hop_rag(): ### Try the minimum effort to test on any task # get_logger(level="DEBUG") - # test_multi_hop_retriever() + test_multi_hop_retriever() # test_multi_hop_retriever2() - test_multi_hop_rag() + + # test_multi_hop_retriever_cycle() + # test_multi_hop_rag() + # test_agent_rag() diff --git a/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py b/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py index 3eae0598..5fb078bd 100644 --- a/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py @@ -6,7 +6,7 @@ import adalflow as adal -from adalflow.datasets.hotpot_qa import HotPotQA +from benchmarks.hotpot_qa.config import load_datasets from adalflow.core.retriever import Retriever from adalflow.core.types import RetrieverOutput @@ -20,13 +20,13 @@ dspy.settings.configure(rm=colbertv2_wiki17_abstracts) -def load_datasets(): +# def load_datasets(): - trainset = HotPotQA(split="train", size=20) - valset = HotPotQA(split="val", size=50) - testset = HotPotQA(split="test", size=50) - print(f"trainset, valset: {len(trainset)}, {len(valset)}, example: {trainset[0]}") - return trainset, valset, testset +# trainset = HotPotQA(split="train", size=20) +# valset = HotPotQA(split="val", size=50) +# testset = HotPotQA(split="test", size=50) +# print(f"trainset, valset: {len(trainset)}, {len(valset)}, example: {trainset[0]}") +# return trainset, valset, testset # task pipeline @@ -102,6 +102,8 @@ class AnswerData(adal.DataClass): # Demonstrating how to wrap other retriever to adalflow retriever and be applied in training pipeline # as a subclass of retriever which is a subclass of GradComponent, we dont need to do additional implementation # data processing has already done + + class DspyRetriever(Retriever): def __init__(self, top_k: int = 3): super().__init__() @@ -110,7 +112,7 @@ def __init__(self, top_k: int = 3): def call( self, input: str, top_k: Optional[int] = None, id: str = None - ) -> List[RetrieverOutput]: + ) -> RetrieverOutput: k = top_k or self.top_k @@ -119,25 +121,34 @@ def call( output = self.dspy_retriever(query_or_queries=input, k=k) # print(f"dsy_retriever output: {output}") - final_output: List[RetrieverOutput] = [] documents = output.passages - final_output.append( - RetrieverOutput( - query=input, - documents=documents, - doc_indices=[], - ) + return RetrieverOutput( + query=input, + documents=documents, + doc_indices=[], ) - # print(f"final_output: {final_output}") - return final_output task_desc_str = r"""Answer questions with short factoid answers. -You will receive context(may contain relevant facts) and a question. +You will receive context(contain relevant facts). Think step by step.""" +task_desc_str_system_finetuned = "Generate a concise, factually accurate answer by synthesizing information from the provided context. If multiple sources are available, prioritize resolving ambiguities and cross-referencing data for consistency. 
Ensure the final answer directly addresses the question while considering specific numerical or descriptive criteria mentioned in the input." + +# task_desc_str = r"""Answer questions with verbatim short factoid responses. + +# You will receive context. Extract only the most relevant fact for a precise answer. +# """ + +demo_str = r"""reasoning: \"Dragon Data, the producer of Dragon 32/64, was based in Port Talbot, Wales,\\\n \\ while TK82C was a product of a Brazilian company, Microdigital Eletr\\xF4nica Ltda.\"\nanswer: 'No'\n\nreasoning: The context specifies that the live action sequel '102 Dalmatians' was\n directed by Kevin Lima.\nanswer: Kevin Lima\n\nreasoning: The context specifically mentions that in the 1970 Michigan gubernatorial\n election, Republican William Milliken defeated Democrat Sander Levin.\nanswer: William Milliken\n\nreasoning: The context states that 'Lost Songs from the Lost Years' is a compilation\n by Cloud Cult, which is an experimental indie rock band from Duluth, Minnesota.\nanswer: Minnesota +""" + +# task_desc_str = r"""Answer the question with given context. +# The question requires you to answer one subquestion first, and then find the next potential subquestion and until you find the final answer. +# """ + class VanillaRAG(adal.GradComponent): def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None): @@ -154,18 +165,36 @@ def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None): model_kwargs=model_kwargs, prompt_kwargs={ "task_desc_str": adal.Parameter( + # data=task_desc_str_system_finetuned, data=task_desc_str, - role_desc="Task description for the language model", + role_desc="""Task description for the language model,\ + used with the following template: \ + {{task_desc_str}} \ + {{output_format_str}}\ + +Context: {{context}} +Question: {{question}} +""", param_type=adal.ParameterType.PROMPT, requires_opt=True, + instruction_to_backward_engine="You need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", + instruction_to_optimizer="ou need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", + # + "Given existing context, ensure the task instructions can maximize the performance.", ), - "few_shot_demos": adal.Parameter( - data=None, - requires_opt=True, - role_desc="To provide few shot demos to the language model", - param_type=adal.ParameterType.DEMOS, - ), + # "few_shot_demos": adal.Parameter( + # # data=demo_str, + # data=None, + # requires_opt=True, + # role_desc="To provide few shot demos to the language model", + # param_type=adal.ParameterType.DEMOS, + # ), "output_format_str": self.llm_parser.get_output_format_str(), + # "output_format_str": adal.Parameter( + # data=self.llm_parser.get_output_format_str(), + # requires_opt=True, + # param_type=adal.ParameterType.PROMPT, + # role_desc="The output format string to ensure no failed json parsing", + # ), }, template=answer_template, output_processors=self.llm_parser, @@ -188,7 +217,7 @@ def call(self, question: str, id: str = None) -> adal.GeneratorOutput: retriever_out = self.retriever.call(input=question, id=id) successor_map_fn = lambda x: ( # noqa E731 - "\n\n".join(x[0].documents) if x and x[0] and x[0].documents else "" + "\n\n".join(x.documents) if x and x.documents else "" ) retrieved_context = successor_map_fn(retriever_out) @@ -201,29 +230,16 @@ def call(self, question: str, id: str = None) -> adal.GeneratorOutput: 
prompt_kwargs=prompt_kwargs, id=id, ) - # self.llm.print_prompt(**prompt_kwargs) - # print(f"retrieved_context: {retrieved_context}") - # print(f"retriever_out: {retriever_out}") - return output - # def call(self, *, question: str, id: str = None) -> adal.GeneratorOutput: - # self.train() - # out = self.forward(question=question, id=id) - # if not isinstance(out, adal.Parameter): - # raise ValueError( - # "This output should be a Parameter, please check the forward function" - # ) - # self.eval() - # return out.data + return output - # TODO: add id in the retriever output def forward(self, question: str, id: str = None) -> adal.Parameter: if not self.training: raise ValueError("This component is not supposed to be called in eval mode") retriever_out = self.retriever.forward(input=question, id=id) successor_map_fn = lambda x: ( # noqa E731 - "\n\n".join(x.data[0].documents) - if x.data and x.data[0] and x.data[0].documents + "\n\n".join(x.data.documents) + if x.data and x.data and x.data.documents else "" ) retriever_out.add_successor_map_fn(successor=self.llm, map_fn=successor_map_fn) @@ -242,16 +258,17 @@ def bicall( retriever_out = self.retriever(input=question) if isinstance(retriever_out, adal.Parameter): successor_map_fn = lambda x: ( # noqa E731 - "\n\n".join(x.data[0].documents) - if x.data and x.data[0] and x.data[0].documents + "\n\n".join(x.data.documents) + if x.data and x.data and x.data.documents else "" ) retriever_out.add_successor_map_fn( successor=self.llm, map_fn=successor_map_fn ) + # retriever_out.requires_opt = False else: successor_map_fn = lambda x: ( # noqa E731 - "\n\n".join(x[0].documents) if x and x[0] and x[0].documents else "" + "\n\n".join(x.documents) if x and x.documents else "" ) retrieved_context = successor_map_fn(retriever_out) prompt_kwargs = { @@ -262,8 +279,77 @@ def bicall( return output +class Vanilla(adal.Component): + def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None): + super().__init__() + + self.passages_per_hop = passages_per_hop + + # self.retriever = DspyRetriever(top_k=passages_per_hop) + self.llm_parser = adal.DataClassParser( + data_class=AnswerData, return_data_class=True, format_type="json" + ) + self.llm = Generator( + model_client=model_client, + model_kwargs=model_kwargs, + prompt_kwargs={ + "task_desc_str": adal.Parameter( + data=task_desc_str, + role_desc="Task description for the language model", + param_type=adal.ParameterType.PROMPT, + requires_opt=True, + instruction_to_backward_engine="You need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", + instruction_to_optimizer="ou need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", + # + "Given existing context, ensure the task instructions can maximize the performance.", + ), + "few_shot_demos": adal.Parameter( + data=None, + requires_opt=True, + role_desc="To provide few shot demos to the language model", + param_type=adal.ParameterType.DEMOS, + ), + "output_format_str": self.llm_parser.get_output_format_str(), + }, + template=answer_template, + output_processors=self.llm_parser, + use_cache=True, + ) + + def call( + self, question: str, context: List[str], id: str = None + ) -> adal.GeneratorOutput: + if self.training: + raise ValueError( + "This component is not supposed to be called in training mode" + ) + + prompt_kwargs = { + "context": context, + "question": question, + } + + output = self.llm.call( + 
prompt_kwargs=prompt_kwargs, + id=id, + ) + + return output + + # TODO: add id in the retriever output + def forward( + self, question: str, context: List[str], id: str = None + ) -> adal.Parameter: + if not self.training: + raise ValueError("This component is not supposed to be called in eval mode") + + generator_out = self.llm.forward( + prompt_kwargs={"question": question, "context": context}, id=id + ) + return generator_out + + def test_retriever(): - question = "How many storeys are in the castle that David Gregory inherited?" + question = "Were Scott Derrickson and Ed Wood of the same nationality?" retriever = DspyRetriever(top_k=3) retriever_out = retriever(input=question) print(f"retriever_out: {retriever_out}") @@ -271,10 +357,6 @@ def test_retriever(): def test_vailla_rag(): - from use_cases.config import ( - gpt_3_model, - ) - task = VanillaRAG( **gpt_3_model, passages_per_hop=3, @@ -301,6 +383,29 @@ def test_vailla_rag(): # print(f"generator_out: {generator_out}") +from use_cases.config import ( + gpt_3_model, +) + + +def test_vanilla(): + task = Vanilla( + **gpt_3_model, + passages_per_hop=3, + ) + task.eval() + data_train, data_val, data_test = load_datasets() + data = data_train[0] + + output = task.call(question=data.question, context=data.context, id="1") + print(f"output: {output}, answer: {data.answer}") + + task.train() + output = task.forward(question=data.question, context=data.context, id="1") + print(f"output: {output.data}, answer: {data.answer}") + + if __name__ == "__main__": # test_retriever() - test_vailla_rag() + test_vanilla() + # test_vailla_rag() diff --git a/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py b/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py new file mode 100644 index 00000000..07f86674 --- /dev/null +++ b/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py @@ -0,0 +1,252 @@ +from typing import Any, Callable, Dict, Tuple + +import adalflow as adal +from adalflow.eval.answer_match_acc import AnswerMatchAcc +from adalflow.datasets.types import HotPotQAData + +from benchmarks.hotpot_qa.config import load_datasets +from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import AgenticRAG +from use_cases.config import gpt_3_model, gpt_4o_model +from adalflow.utils import printc + + +# TODO: look more into the loss function +# TODO: test LLM judge too. 
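# --- Editor's note: illustrative sketch, not part of the diff. ---
# The training scripts below switch the metric from fuzzy_match to exact_match.
# Going by the eval_fn_desc strings used throughout this diff, the two reduce
# to roughly the comparisons below (AnswerMatchAcc itself may normalize
# differently); this also shows why "Yes" vs "yes" mismatches motivated the
# lower-casing mentioned in the diagnose notes.
def exact_match_sketch(y: str, y_gt: str) -> float:
    return 1.0 if str(y_gt) == str(y) else 0.0


def fuzzy_match_sketch(y: str, y_gt: str) -> float:
    return 1.0 if str(y) in str(y_gt) else 0.0


# e.g. exact_match_sketch("Yes", "yes") == 0.0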
+ +from adalflow.components.agent.react import ReActOutput + + +class AgenticRAGAdal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = AgenticRAG( + model_client=model_client, + model_kwargs=model_kwargs, + ) + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + loss_fn = adal.EvalFnToTextLoss( + eval_fn=eval_fn, eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0" + ) + # eval_fn = f1_score # 0.38 (hand crafted the finish, exat match 0.25) + + # loss_fn = adal.EvalFnToTextLoss( + # eval_fn=eval_fn, eval_fn_desc="Computes the overlaps between y and y_gt" + # ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + + # tell the trainer how to call the task + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward, {"input": sample.question, "id": sample.id} + else: + # print("eval mode") + return self.task.call, {"input": sample.question, "id": sample.id} + + # TODO: use two map fn to make the cde even simpler + + # eval mode: get the generator output, directly engage with the eval_fn + def prepare_eval(self, sample: HotPotQAData, y_pred: ReActOutput) -> float: + y_label = "" + if y_pred is not None and y_pred.answer: + y_label = y_pred.answer + + printc( + f"eval y_label: {y_label}, y_gt: {sample.answer}, self.eval_fn: {self.eval_fn(y_label, sample.answer)}" + ) + + return self.eval_fn, {"y": y_label, "y_gt": sample.answer} + + # train mode: get the loss and get the data from the full_response + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + # prepare gt parameter + y_gt = adal.Parameter( + name="y_gt", + data=sample.answer, + eval_input=sample.answer, + requires_opt=False, + ) + + # pred's full_response is the output of the task pipeline which is GeneratorOutput + # pred.eval_input = ( + # pred.data[-1].observation if pred.data and pred.data[-1] else "" + # ) + printc(f"pred data: {pred.data}") + pred.eval_input = pred.data.answer if pred.data else "" + # pred.eval_input = ( + # pred.data[-1].observation if pred.data and pred.data[-1] else "" + # ) + # printc(f"loss eval_input: {pred.eval_input}") + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + "gt": y_gt.eval_input, + "input": {"question": sample.question}, + } + + +# Note: diagnose is quite helpful, it helps you to quickly check if the evalfunction is the right metrics +# i checked the eval which does fuzzy match, and found some yes and Yes are not matched, then converted both strings to lower and +# the performances have gone up from 0.15 to 0.4 +def train_diagnose( + model_client: adal.ModelClient, + model_kwargs: Dict, +) -> Dict: + + trainset, valset, testset = load_datasets() + + adal_component = AgenticRAGAdal( + model_client, + model_kwargs, + backward_engine_model_config=gpt_4o_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_3_model, + ) + trainset = trainset[:5] + trainer = adal.Trainer(adaltask=adal_component) + trainer.diagnose(dataset=trainset, split="train") + # trainer.diagnose(dataset=valset, split="val") + # trainer.diagnose(dataset=testset, 
split="test") + + +from adalflow.core.generator import BackwardPassSetup + + +def train( + train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle + raw_shots: int = 0, + bootstrap_shots: int = 4, + max_steps=1, + num_workers=4, + strategy="constrained", + optimization_order="sequential", + debug=False, + resume_from_ckpt=None, + exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, +): + adal_component = AgenticRAGAdal( + **gpt_3_model, + teacher_model_config=gpt_4o_model, + text_optimizer_model_config=gpt_4o_model, # gpt3.5 is not enough to be used as a good optimizer, it struggles for long contenxt + backward_engine_model_config=gpt_4o_model, + ) + print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + trainer = adal.Trainer( + train_batch_size=train_batch_size, + adaltask=adal_component, + strategy=strategy, + max_steps=max_steps, + num_workers=num_workers, + raw_shots=raw_shots, + bootstrap_shots=bootstrap_shots, + debug=debug, + weighted_sampling=True, + optimization_order=optimization_order, + exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + sequential_order=["text", "demo"], + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, + ) + trainer.set_random_seed(seed) + print(trainer) + + train_dataset, val_dataset, test_dataset = load_datasets() + train_dataset = train_dataset[:4] + val_dataset = val_dataset[:4] + test_dataset = test_dataset[:4] + ckpt, _ = trainer.fit( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + resume_from_ckpt=resume_from_ckpt, + ) + return ckpt + + +if __name__ == "__main__": + from use_cases.config import gpt_3_model + + log = adal.get_logger(level="DEBUG", enable_console=False) + + adal.setup_env() + import json + + import random + + random.seed(2025) + + adal.setup_env() + + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + # task = MultiHopRAGAdal(**gpt_3_model) + # print(task) + + # train_diagnose(**gpt_3_model) + # exit() + + ckpt = train( + debug=True, + max_steps=12, + seed=2025, + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_4_dca7e_run_1.json", + ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") + + # 0.68 on val without training, 0.74on the second step. 0.84 test + # /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_2_029cb_run_1.json + # 0.7, 0.72 /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_2_b7523_run_1.json + # 208.085706949234s, 2 steps, maximum 4 steps allow for an agent. 
+ # 0.72->0.74, 4 steps, 366s, /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_4_dca7e_run_1.json [Already faster, still lots to optimize] + + # 1246s, 12 steps, 0.8 val, /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_defe7_run_1.json + # 2149s, both gradients, 0.68 -> 0.78 /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_8a24a_run_1.json + # /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_cdcb5_run_1.json 1728 s, 0.8 + # /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_735a7_run_1.json 0.58 -> 0.68 (separate gradients) "pass": 17, + # "fail": 35 diff --git a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py index d80e6336..389787dc 100644 --- a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py @@ -3,8 +3,9 @@ import adalflow as adal from adalflow.eval.answer_match_acc import AnswerMatchAcc from adalflow.datasets.types import HotPotQAData +from benchmarks.hotpot_qa.config import load_datasets -from benchmarks.hotpot_qa._adal_train import load_datasets +# from benchmarks.hotpot_qa._adal_train import load_datasets from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import MultiHopRAG from use_cases.config import gpt_3_model, gpt_4o_model @@ -23,12 +24,12 @@ def __init__( task = MultiHopRAG( model_client=model_client, model_kwargs=model_kwargs, - passages_per_hop=3, + passages_per_hop=2, # better with only two passages, ablation study 0.49 vs 0.52 max_hops=2, ) - eval_fn = AnswerMatchAcc(type="fuzzy_match").compute_single_item + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item loss_fn = adal.EvalFnToTextLoss( - eval_fn=eval_fn, eval_fn_desc="fuzzy_match: 1 if str(y) in str(y_gt) else 0" + eval_fn=eval_fn, eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0" ) super().__init__( task=task, @@ -67,13 +68,19 @@ def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): # pred's full_response is the output of the task pipeline which is GeneratorOutput pred.eval_input = ( - pred.full_response.data.answer - if pred.full_response - and pred.full_response.data - and pred.full_response.data.answer + pred.data.data.answer + if pred.data and pred.data.data and pred.data.data.answer else "" ) - return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}} + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "input": {"question": sample.question}, + "id": sample.id, + "gt": sample.answer, + } + + +from adalflow.core.generator import BackwardPassSetup # Note: diagnose is quite helpful, it helps you to quickly check if the evalfunction is the right metrics @@ -101,23 +108,32 @@ def train_diagnose( def train( train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle - raw_shots: int = 0, - bootstrap_shots: int = 4, + raw_shots: int = 2, + bootstrap_shots: int = 2, max_steps=1, - num_workers=4, + num_workers=10, strategy="constrained", optimization_order="sequential", debug=False, resume_from_ckpt=None, exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, ): adal_component = MultiHopRAGAdal( **gpt_3_model, - teacher_model_config=gpt_3_model, + teacher_model_config=gpt_4o_model, text_optimizer_model_config=gpt_4o_model, # gpt3.5 is not enough to be used as a good optimizer, it struggles for long contenxt 
backward_engine_model_config=gpt_4o_model, ) - print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + # print(adal_component) trainer = adal.Trainer( train_batch_size=train_batch_size, adaltask=adal_component, @@ -127,40 +143,82 @@ def train( raw_shots=raw_shots, bootstrap_shots=bootstrap_shots, debug=debug, - weighted_sampling=True, + weighted_sampling=False, optimization_order=optimization_order, exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, sequential_order=["text", "demo"], + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, ) + trainer.set_random_seed(seed) print(trainer) train_dataset, val_dataset, test_dataset = load_datasets() - trainer.fit( + ckpt, _ = trainer.fit( train_dataset=train_dataset, val_dataset=val_dataset, test_dataset=test_dataset, resume_from_ckpt=resume_from_ckpt, ) + return ckpt if __name__ == "__main__": from use_cases.config import gpt_3_model - log = adal.get_logger(level="DEBUG", enable_console=False) + # log = adal.get_logger(level="DEBUG", enable_console=False) adal.setup_env() + import json + + import random + + random.seed(2025) + + adal.setup_env() + + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + # task = MultiHopRAGAdal(**gpt_3_model) # print(task) # train_diagnose(**gpt_3_model) # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion - train( - debug=False, + ckpt = train( + debug=True, max_steps=12, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_fde51_run_1.json", # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ValinaRAGAdal/random_max_steps_12_7c091_run_1.json", ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") # notes for debug: if have nontype, delete all model cache and try again # raise ValueError(ValueError: score must be provided for each demo, @@ -181,3 +239,4 @@ def train( # feedback while seeing the gt + y # only negative feedback /Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_f5506_run_1.json 0.62 -> 0.7 # /Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_b4aa5_run_1.json 0.74 pass rate 8 32 + # random cycle rag: /Users/liyin/.adalflow/ckpt/MultiHopRAGCycleAdal/random_max_steps_12_82bd2_run_1.json 0.64 diff --git a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag_cycle.py b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag_cycle.py new file mode 100644 index 00000000..1c20d18a --- /dev/null +++ b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag_cycle.py @@ -0,0 +1,234 @@ +from typing import Any, Callable, Dict, Tuple + +import adalflow as adal +from 
adalflow.eval.answer_match_acc import AnswerMatchAcc +from adalflow.datasets.types import HotPotQAData + +from benchmarks.hotpot_qa.config import load_datasets +from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import MultiHopRAGCycle +from use_cases.config import gpt_3_model, gpt_4o_model + + +# TODO: look more into the loss function +# TODO: test LLM judge too. +class MultiHopRAGCycleAdal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = MultiHopRAGCycle( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=2, + max_hops=2, + ) + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + loss_fn = adal.EvalFnToTextLoss( + eval_fn=eval_fn, eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0" + ) + # eval_fn = AnswerMatchAcc(type="fuzzy_match").compute_single_item + # loss_fn = adal.EvalFnToTextLoss( + # eval_fn=eval_fn, + # eval_fn_desc="fuzzy_match: 1 if str(y_gt) in str(y) in else 0", + # ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + + # tell the trainer how to call the task + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward, {"question": sample.question, "id": sample.id} + else: + return self.task.call, {"question": sample.question, "id": sample.id} + + # TODO: use two map fn to make the cde even simpler + + # eval mode: get the generator output, directly engage with the eval_fn + def prepare_eval(self, sample: HotPotQAData, y_pred: adal.GeneratorOutput) -> float: + y_label = "" + if y_pred and y_pred.data and y_pred.data.answer: + y_label = y_pred.data.answer + return self.eval_fn, {"y": y_label, "y_gt": sample.answer} + + # train mode: get the loss and get the data from the full_response + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + # prepare gt parameter + y_gt = adal.Parameter( + name="y_gt", + data=sample.answer, + eval_input=sample.answer, + requires_opt=False, + ) + + # pred's full_response is the output of the task pipeline which is GeneratorOutput + pred.eval_input = ( + pred.data.data.answer + if pred.data and pred.data.data and pred.data.data.answer + else "" + ) + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + "input": {"question": sample.question}, + } + + +# Note: diagnose is quite helpful, it helps you to quickly check if the evalfunction is the right metrics +# i checked the eval which does fuzzy match, and found some yes and Yes are not matched, then converted both strings to lower and +# the performances have gone up from 0.15 to 0.4 +def train_diagnose( + model_client: adal.ModelClient, + model_kwargs: Dict, +) -> Dict: + + trainset, valset, testset = load_datasets() + + adal_component = MultiHopRAGCycleAdal( + model_client, + model_kwargs, + backward_engine_model_config=gpt_4o_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_3_model, + ) + trainer = adal.Trainer(adaltask=adal_component) + trainer.diagnose(dataset=trainset, split="train") + trainer.diagnose(dataset=valset, split="val") + trainer.diagnose(dataset=testset, split="test") + + +from 
adalflow.core.generator import BackwardPassSetup + + +def train( + train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle + raw_shots: int = 0, + bootstrap_shots: int = 4, + max_steps=1, + num_workers=4, + strategy="random", + optimization_order="sequential", + debug=False, + resume_from_ckpt=None, + exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, +): + adal_component = MultiHopRAGCycleAdal( + **gpt_3_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_4o_model, # gpt3.5 is not enough to be used as a good optimizer, it struggles for long contenxt + backward_engine_model_config=gpt_4o_model, + ) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + print(adal_component) + trainer = adal.Trainer( + train_batch_size=train_batch_size, + adaltask=adal_component, + strategy=strategy, + max_steps=max_steps, + num_workers=num_workers, + raw_shots=raw_shots, + bootstrap_shots=bootstrap_shots, + debug=debug, + weighted_sampling=True, + optimization_order=optimization_order, + exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + sequential_order=["text", "demo"], + backward_pass_setup=backward_pass_setup, + ) + print(trainer) + trainer.set_random_seed(seed) + + train_dataset, val_dataset, test_dataset = load_datasets() + + # replace the train dataset for debug + # if debug: + # train_dataset = train_dataset[:2] + # data: HotPotQAData = train_dataset[0] + # data.question = "Brown State Fishing Lake is in a country that has a population of how many inhabitants?" + # data.answer = "9,984" + # print(f"train_dataset: {train_dataset}") + + ckpt, _ = trainer.fit( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + resume_from_ckpt=resume_from_ckpt, + ) + return ckpt + + +if __name__ == "__main__": + from use_cases.config import gpt_3_model + import json + + import random + + random.seed(2025) + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + # log = adal.get_logger( + # level="DEBUG", enable_console=False, filename="multi_hop_rag_cycle.log" + # ) + + adal.setup_env() + + # task = MultiHopRAGAdal(**gpt_3_model) + # print(task) + + # train_diagnose(**gpt_3_model) + # exit() + + # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion + ckpt = train( + debug=False, + max_steps=12, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/Documents/test/LightRAG/.adalflow/ckpt/MultiHopRAGCycleAdal/constrained_max_steps_12_69e07_run_1.json", + ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") + + # the best 0.74 + # 
/Users/liyin/.adalflow/ckpt/MultiHopRAGCycleAdal/constrained_max_steps_12_75fb6_run_1.json 0.7 no positive gradients + # /Users/liyin/.adalflow/ckpt/MultiHopRAGCycleAdal/constrained_max_steps_12_0976c_run_1.json 0.7 diff --git a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_retriever.py b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_retriever.py new file mode 100644 index 00000000..c0edded3 --- /dev/null +++ b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_retriever.py @@ -0,0 +1,314 @@ +from typing import Any, Callable, Dict, Tuple, List + +import adalflow as adal +from adalflow.eval.retriever_recall import RetrieverEvaluator +from adalflow.eval.answer_match_acc import AnswerMatchAcc +from adalflow.datasets.types import HotPotQAData +from benchmarks.hotpot_qa.config import load_datasets + +from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import ( + MultiHopRetriever, +) +from use_cases.config import gpt_3_model, gpt_4o_model +from adalflow.utils import printc + + +def retriever_recall(y: List[str], y_gt: List[str]) -> float: + return RetrieverEvaluator().compute_single_item(y, y_gt)["recall"] + + +def retriever_precision(y: List[str], y_gt: List[str]) -> float: + return RetrieverEvaluator().compute_single_item(y, y_gt)["precision"] + + +def retriever_query_f1(y: str, y_gt: str) -> float: + evaluator = AnswerMatchAcc(type="f1_score") + score = evaluator.compute_single_item(y, y_gt) + + return score + + +class MultiHopRetrieverAdal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = MultiHopRetriever( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=2, + max_hops=2, + ) + eval_fn = retriever_recall + loss_fn = adal.EvalFnToTextLoss( + eval_fn=eval_fn, + eval_fn_desc="recall: len(y_gt.intersection(y)) / len(y_gt)", + ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward, {"input": sample.question, "id": sample.id} + else: + return self.task.call, {"input": sample.question, "id": sample.id} + + def prepare_eval(self, sample: HotPotQAData, y_pred: adal.RetrieverOutput) -> float: + if isinstance(y_pred, adal.Parameter): + raise ValueError("y_pred is not a RetrieverOutput") + documents = y_pred.documents + y_pred_titles = [] + for doc in documents: + title, content = doc.split("|") + y_pred_titles.append(title) + + return self.eval_fn, { + "y": y_pred_titles, + "y_gt": list(sample.gold_titles), + } + + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + y_gt = adal.Parameter( + name="y_gt", + data=sample.gold_titles, + eval_input=list(sample.gold_titles), + requires_opt=False, + ) + + pred_titles = [] + for doc in pred.data.documents: + title, content = doc.split("|") + pred_titles.append(title) + + pred.eval_input = pred_titles + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + "gt": y_gt.data, + } + + +# 1. 
test the eval and the loss use different metrics +class MultiHopRetriever2Adal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = MultiHopRetriever( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=2, + max_hops=2, + ) + eval_fn = retriever_query_f1 + loss_fn = adal.EvalFnToTextLoss( + eval_fn=eval_fn, + eval_fn_desc="precision: overlap of words between gt and prediction (queries). Only evaluate the generated queries from the generator. The multiple queries are joiend together by ',' to evaluate over the overlap on words.", + ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + self.eval_retriever_recall = retriever_recall + + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward2, {"input": sample.question, "id": sample.id} + else: + return self.task.call2, {"input": sample.question, "id": sample.id} + + def prepare_eval(self, sample: HotPotQAData, y_pred: any) -> float: + if isinstance(y_pred, adal.Parameter): + raise ValueError("y_pred is not a RetrieverOutput") + + y_gt = ", ".join(sample.gold_titles) + # for doc in documents: + # title, content = doc.split("|") + # y_pred_titles.append(title) + + printc(f"y_gt: {y_gt}, pred: {y_pred}") + + return self.eval_fn, { + "y": y_pred.data, + "y_gt": y_gt, + } + + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + + y_gt = adal.Parameter( + name="y_gt", + data=sample.gold_titles, + eval_input=", ".join(sample.gold_titles), + requires_opt=False, + ) + + pred.eval_input = pred.data.data + + printc(f"y_gt 1: {sample.gold_titles}, pred 1: {pred.eval_input}") + + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + "gt": y_gt.data, + } + + +from adalflow.core.generator import BackwardPassSetup + + +def train_diagnose( + model_client: adal.ModelClient, + model_kwargs: Dict, +) -> Dict: + + trainset, valset, testset = load_datasets() + + adal_component = MultiHopRetrieverAdal( + model_client, + model_kwargs, + backward_engine_model_config=gpt_4o_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_3_model, + ) + trainer = adal.Trainer(adaltask=adal_component) + # trainer.diagnose(dataset=trainset, split="train") # 0.69 recall + # trainer.diagnose(dataset=valset, split="val") # 0.675 recall + trainer.diagnose(dataset=testset, split="test") # 0.71 (0.665) + + +def train( + train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle + raw_shots: int = 1, + bootstrap_shots: int = 1, + max_steps=1, + num_workers=10, + strategy="constrained", + optimization_order="sequential", + debug=False, + resume_from_ckpt=None, + exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, +): + adal_component = MultiHopRetrieverAdal( + **gpt_3_model, + teacher_model_config=gpt_4o_model, + text_optimizer_model_config=gpt_4o_model, # gpt3.5 is not enough to be used as a good optimizer, it struggles for long contenxt + backward_engine_model_config=gpt_4o_model, + ) + backward_pass_setup = None + if 
tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + # print(adal_component) + trainer = adal.Trainer( + train_batch_size=train_batch_size, + adaltask=adal_component, + strategy=strategy, + max_steps=max_steps, + num_workers=num_workers, + raw_shots=raw_shots, + bootstrap_shots=bootstrap_shots, + debug=debug, + weighted_sampling=False, + optimization_order=optimization_order, + exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + sequential_order=["text", "demo"], + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, + ) + trainer.set_random_seed(seed) + print(trainer) + + train_dataset, val_dataset, test_dataset = load_datasets() + # val_dataset = val_dataset[:20] + ckpt, _ = trainer.fit( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + resume_from_ckpt=resume_from_ckpt, + ) + return ckpt + + +if __name__ == "__main__": + from use_cases.config import gpt_3_model + + # log = adal.get_logger(level="DEBUG", enable_console=False) + + adal.setup_env() + + import json + + import random + + random.seed(2025) + + adal.setup_env() + + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + # task = MultiHopRAGAdal(**gpt_3_model) + # print(task) + + # train_diagnose(**gpt_3_model) + # exit() + + # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion + ckpt = train( + debug=True, + max_steps=12, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + exclude_input_fields_from_bootstrap_demos=True, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/MultiHopRetrieverAdal/constrained_max_steps_12_945bd_run_1.json", + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/MultiHopRetrieverAdal/constrained_max_steps_12_d7043_run_1.json", + ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") + + # diff --git a/benchmarks/hotpot_qa/adal_exp/train_vanilla.py b/benchmarks/hotpot_qa/adal_exp/train_vanilla.py index fc14e161..14f5e92e 100644 --- a/benchmarks/hotpot_qa/adal_exp/train_vanilla.py +++ b/benchmarks/hotpot_qa/adal_exp/train_vanilla.py @@ -4,14 +4,16 @@ from adalflow.eval.answer_match_acc import AnswerMatchAcc from adalflow.datasets.types import HotPotQAData -from benchmarks.hotpot_qa._adal_train import load_datasets -from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import VanillaRAG -from use_cases.config import gpt_3_model, gpt_4o_model +from benchmarks.hotpot_qa.config import load_datasets +from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import Vanilla +from use_cases.config import gpt_3_model, gpt_4o_model, gpt_3_1106_model + +from adalflow.utils import printc # TODO: look more into the loss function # TODO: test LLM judge too. 
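# --- Editor's note: illustrative sketch, not part of the diff. ---
# MultiHopRetrieverAdal.prepare_eval/prepare_loss above split each retrieved
# document on "|" to recover its title and score the titles against
# sample.gold_titles. Per the eval_fn_desc, recall reduces to the sketch below
# (RetrieverEvaluator may differ in detail; this only shows the intent).
from typing import List


def titles_from_documents_sketch(documents: List[str]) -> List[str]:
    # each document is expected to look like "<title>|<content>"
    return [doc.split("|")[0] for doc in documents]


def recall_sketch(y_titles: List[str], gold_titles: List[str]) -> float:
    gt = set(gold_titles)
    return len(gt.intersection(set(y_titles))) / len(gt) if gt else 0.0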
-class VallinaRAGAdal(adal.AdalComponent): +class VallinaAdal(adal.AdalComponent): def __init__( self, model_client: adal.ModelClient, @@ -20,14 +22,14 @@ def __init__( teacher_model_config: Dict | None = None, text_optimizer_model_config: Dict | None = None, ): - task = VanillaRAG( + task = Vanilla( model_client=model_client, model_kwargs=model_kwargs, passages_per_hop=3, ) - eval_fn = AnswerMatchAcc(type="fuzzy_match").compute_single_item + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item loss_fn = adal.EvalFnToTextLoss( - eval_fn=eval_fn, eval_fn_desc="fuzzy_match: 1 if str(y) in str(y_gt) else 0" + eval_fn=eval_fn, eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0" ) super().__init__( task=task, @@ -41,9 +43,17 @@ def __init__( # tell the trainer how to call the task def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: if self.task.training: - return self.task.forward, {"question": sample.question, "id": sample.id} + return self.task.forward, { + "question": sample.question, + "context": sample.context, + "id": sample.id, + } else: - return self.task.call, {"question": sample.question, "id": sample.id} + return self.task.call, { + "question": sample.question, + "context": sample.context, + "id": sample.id, + } # TODO: use two map fn to make the cde even simpler @@ -51,7 +61,8 @@ def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: def prepare_eval(self, sample: HotPotQAData, y_pred: adal.GeneratorOutput) -> float: y_label = "" if y_pred and y_pred.data and y_pred.data.answer: - y_label = y_pred.data.answer + y_label = y_pred.data.answer # .lower() + printc(f"y_label: {y_label}, y_gt: {sample.answer}") return self.eval_fn, {"y": y_label, "y_gt": sample.answer} # train mode: get the loss and get the data from the full_response @@ -66,13 +77,11 @@ def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): # pred's full_response is the output of the task pipeline which is GeneratorOutput pred.eval_input = ( - pred.full_response.data.answer - if pred.full_response - and pred.full_response.data - and pred.full_response.data.answer + pred.data.data.answer + if pred.data and pred.data.data and pred.data.data.answer else "" ) - return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}} + return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}, "id": sample.id} # Note: diagnose is quite helpful, it helps you to quickly check if the evalfunction is the right metrics @@ -85,7 +94,7 @@ def train_diagnose( trainset, valset, testset = load_datasets() - adal_component = VallinaRAGAdal( + adal_component = VallinaAdal( model_client, model_kwargs, backward_engine_model_config=gpt_4o_model, @@ -98,6 +107,9 @@ def train_diagnose( # trainer.diagnose(dataset=testset, split="test") +from adalflow.core.generator import BackwardPassSetup + + def train( train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle raw_shots: int = 0, @@ -109,14 +121,23 @@ def train( debug=False, resume_from_ckpt=None, exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, ): - adal_component = VallinaRAGAdal( - **gpt_3_model, + adal_component = VallinaAdal( + **gpt_3_1106_model, teacher_model_config=gpt_4o_model, text_optimizer_model_config=gpt_4o_model, backward_engine_model_config=gpt_4o_model, ) print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + 
compute_grad_for_errors_only=False, + ) trainer = adal.Trainer( train_batch_size=train_batch_size, adaltask=adal_component, @@ -129,23 +150,54 @@ def train( weighted_sampling=True, optimization_order=optimization_order, exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, ) + trainer.set_random_seed(seed) print(trainer) train_dataset, val_dataset, test_dataset = load_datasets() - trainer.fit( + ckpt, _ = trainer.fit( train_dataset=train_dataset, val_dataset=val_dataset, + # test_dataset=val_dataset[0:4], test_dataset=test_dataset, resume_from_ckpt=resume_from_ckpt, ) + # diagnose the test set + # trainer.diagnose(dataset=test_dataset, split="test", resume_from_ckpt=ckpt) + return ckpt if __name__ == "__main__": from use_cases.config import gpt_3_model + import json + + import random + + random.seed(2025) + adal.setup_env() + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + # task = VallinaRAGAdal(**gpt_3_model) # print(task) @@ -153,11 +205,22 @@ def train( # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion # TODO: test debug mode - train( + ckpt = train( debug=False, - max_steps=12, - # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ValinaRAGAdal/random_max_steps_12_7c091_run_1.json", + max_steps=1, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + resume_from_ckpt="/Users/liyin/.adalflow/ckpt/VallinaAdal/random_max_steps_24_1511c_run_1.json", ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") # random_max_steps_12_ecf16_run_9.json, demo only, val 0.6 to 0.68, test: 0.58-0.61 # random_max_steps_12_7c091_run_1.json, prompt + demo, 0.58 -0.62, test: 0.55 - 0.58 # resume from random_max_steps_12_7c091_run_1.json diff --git a/benchmarks/hotpot_qa/adal_exp/train_vanilla_rag.py b/benchmarks/hotpot_qa/adal_exp/train_vanilla_rag.py new file mode 100644 index 00000000..0ba32d1a --- /dev/null +++ b/benchmarks/hotpot_qa/adal_exp/train_vanilla_rag.py @@ -0,0 +1,219 @@ +from typing import Any, Callable, Dict, Tuple + +import adalflow as adal +from adalflow.eval.answer_match_acc import AnswerMatchAcc +from adalflow.datasets.types import HotPotQAData + +from benchmarks.hotpot_qa.config import load_datasets +from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import VanillaRAG +from use_cases.config import gpt_3_model, gpt_4o_model, gpt_3_1106_model + +from adalflow.utils import printc + + +# TODO: look more into the loss function +# TODO: test LLM judge too. 
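# Illustrative sketch, not part of the patch: the eval contract behind
# AnswerMatchAcc(type="exact_match").compute_single_item, as stated by the
# eval_fn_desc strings in these AdalComponents -- a per-item callable returning
# 1.0 or 0.0. The function below is a stand-in for illustration only.
def exact_match_single(y: str, y_gt: str) -> float:
    return 1.0 if str(y_gt) == str(y) else 0.0


# Note that this scoring is case-sensitive ("Yes" != "yes"); the prediction's
# .lower() call is left commented out in prepare_eval.
assert exact_match_single("yes", "yes") == 1.0
assert exact_match_single("Yes", "yes") == 0.0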
+class VallinaRAGAdal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = VanillaRAG( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=3, + ) + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + loss_fn = adal.EvalFnToTextLoss( + eval_fn=eval_fn, eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0" + ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + + # tell the trainer how to call the task + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward, {"question": sample.question, "id": sample.id} + else: + return self.task.call, {"question": sample.question, "id": sample.id} + + # TODO: use two map fn to make the cde even simpler + + # eval mode: get the generator output, directly engage with the eval_fn + def prepare_eval(self, sample: HotPotQAData, y_pred: adal.GeneratorOutput) -> float: + y_label = "" + if y_pred and y_pred.data and y_pred.data.answer: + y_label = y_pred.data.answer # .lower() + printc(f"y_label: {y_label}, y_gt: {sample.answer}") + return self.eval_fn, {"y": y_label, "y_gt": sample.answer} + + # train mode: get the loss and get the data from the full_response + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + # prepare gt parameter + y_gt = adal.Parameter( + name="y_gt", + data=sample.answer, + eval_input=sample.answer, + requires_opt=False, + ) + + # pred's full_response is the output of the task pipeline which is GeneratorOutput + pred.eval_input = ( + pred.data.data.answer + if pred.data and pred.data.data and pred.data.data.answer + else "" + ) + return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}, "id": sample.id} + + +# Note: diagnose is quite helpful, it helps you to quickly check if the evalfunction is the right metrics +# i checked the eval which does fuzzy match, and found some yes and Yes are not matched, then converted both strings to lower and +# the performances have gone up from 0.15 to 0.4 +def train_diagnose( + model_client: adal.ModelClient, + model_kwargs: Dict, +) -> Dict: + + trainset, valset, testset = load_datasets() + + adal_component = VallinaRAGAdal( + model_client, + model_kwargs, + backward_engine_model_config=gpt_4o_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_3_model, + ) + trainer = adal.Trainer(adaltask=adal_component) + trainer.diagnose(dataset=trainset, split="train") + # trainer.diagnose(dataset=valset, split="val") + # trainer.diagnose(dataset=testset, split="test") + + +from adalflow.core.generator import BackwardPassSetup + + +def train( + train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle + raw_shots: int = 0, + bootstrap_shots: int = 4, + max_steps=1, + num_workers=4, + strategy="constrained", + optimization_order="sequential", + debug=False, + resume_from_ckpt=None, + exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, +): + adal_component = VallinaRAGAdal( + **gpt_3_1106_model, + teacher_model_config=gpt_4o_model, + 
text_optimizer_model_config=gpt_4o_model, + backward_engine_model_config=gpt_4o_model, + ) + print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + trainer = adal.Trainer( + train_batch_size=train_batch_size, + adaltask=adal_component, + strategy=strategy, + max_steps=max_steps, + num_workers=num_workers, + raw_shots=raw_shots, + bootstrap_shots=bootstrap_shots, + debug=debug, + weighted_sampling=False, + optimization_order=optimization_order, + exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, + ) + trainer.set_random_seed(seed) + print(trainer) + + train_dataset, val_dataset, test_dataset = load_datasets() + ckpt, _ = trainer.fit( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + resume_from_ckpt=resume_from_ckpt, + ) + return ckpt + + +if __name__ == "__main__": + from use_cases.config import gpt_3_model + + import json + + import random + + random.seed(2025) + + adal.setup_env() + + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + # task = VallinaRAGAdal(**gpt_3_model) + # print(task) + + # train_diagnose(**gpt_3_model) + + # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion + # TODO: test debug mode + ckpt = train( + debug=False, + max_steps=12, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/VallinaRAGAdal/constrained_max_steps_12_5a4b4_run_1.json", + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ValinaRAGAdal/random_max_steps_12_7c091_run_1.json", + ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") + # random_max_steps_12_ecf16_run_9.json, demo only, val 0.6 to 0.68, test: 0.58-0.61 + # random_max_steps_12_7c091_run_1.json, prompt + demo, 0.58 -0.62, test: 0.55 - 0.58 + # resume from random_max_steps_12_7c091_run_1.json + + # demo only, no input, 4 shots, 0.58-> 0.62, VallinaRAGAdal/constrained_max_steps_12_b0a37_run_1.json + # this is the same as dspy's 20shots, because dspy does not use the weighted sampling diff --git a/benchmarks/hotpot_qa/config.py b/benchmarks/hotpot_qa/config.py index ebdfbb01..a7de7e86 100644 --- a/benchmarks/hotpot_qa/config.py +++ b/benchmarks/hotpot_qa/config.py @@ -1,2 +1,13 @@ dspy_save_path = "benchmarks/BHH_object_count/models/dspy" adal_save_path = "benchmarks/BHH_object_count/models/adal" + +from adalflow.datasets.hotpot_qa import HotPotQA + + +def load_datasets(): + + trainset = HotPotQA(split="train", size=100) # 20 + valset = HotPotQA(split="val", size=100) # 50 + testset = HotPotQA(split="test", size=200) # to keep the same as the dspy #50 + 
print(f"trainset, valset: {len(trainset)}, {len(valset)}, example: {trainset[0]}") + return trainset, valset, testset diff --git a/test_graph.py b/test_graph.py new file mode 100644 index 00000000..e1577916 --- /dev/null +++ b/test_graph.py @@ -0,0 +1,272 @@ +# node_graph_visualizer.py + +import os +from pyvis.network import Network +import streamlit as st +import networkx as nx +from jinja2 import Template + + +# Node class definition +class Node: + def __init__( + self, + id, + name, + role_desc, + data, + data_id, + previous_data, + requires_opt, + param_type, + gradients, + ): + self.id = id + self.name = name + self.role_desc = role_desc + self.data = data + self.data_id = data_id + self.previous_data = previous_data + self.requires_opt = requires_opt + self.param_type = param_type + self.gradients = gradients + + def get_gradients_names(self): + return self.gradients.split(", ") if self.gradients else [] + + +# Function to generate individual HTML pages for each node +def generate_node_html(node, output_dir="node_pages"): + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + filename = f"{output_dir}/{node.name}.html" + # from_param = Parameter("from", "From Parameter") + # dummy_gradients = Gradient("dummy", "Dummy Gradient") + with open(filename, "w") as file: + file.write( + f""" + + + + + + {node.name} + + +

        <h1>Details for Node: {node.name}</h1>
        <p><b>ID:</b> {node.id}</p>
        <p><b>Role:</b> {node.role_desc}</p>
        <p><b>Data:</b> {node.data}</p>
        <p><b>Data ID:</b> {node.data_id}</p>
        <p><b>Previous Value:</b> {node.previous_data}</p>
        <p><b>Requires Optimization:</b> {node.requires_opt}</p>
        <p><b>Type:</b> {node.param_type}</p>
        <p><b>Gradients:</b> {', '.join(node.get_gradients_names())}</p>

+ + + """ + ) + print(f"Generated HTML for node: {node.name} at {filename}") + + +# Function to create the main graph with clickable links to individual node pages +def create_graph_with_links( + nodes, edges, output_dir="node_pages", main_file="graph.html" +): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + net = Network(height="750px", width="100%", directed=True) + net.toggle_physics(True) + net.template = Template( + """ + + + + + + + + +
+ + + + """ + ) + + for node in nodes: + # Generate individual HTML pages for each node + generate_node_html(node, output_dir) + + # Add node to the main graph with link to its HTML page + net.add_node( + node.id, + label=node.name, + title=f"Open Details", + shape="dot", + size=15, + url=f"{output_dir}/{node.name}.html", # Add the URL here + ) + + for edge in edges: + net.add_edge(edge[0].id, edge[1].id) + + net.show(main_file) + print(f"Generated main graph HTML at {main_file}") + + +# Function to create a Streamlit app for interactive graph exploration +def create_interactive_streamlit_app(nodes, edges): + G = nx.DiGraph() + for node in nodes: + G.add_node(node.id, node_obj=node) + G.add_edges_from([(edge[0].id, edge[1].id) for edge in edges]) + + st.title("Interactive Graph Visualization") + st.sidebar.title("Node Selector") + selected_node_name = st.sidebar.selectbox( + "Select a node", [node.name for node in nodes] + ) + + net = Network(height="500px", width="100%", directed=True) + net.template = Template( + """ + + + + + + + + +
+ + + + """ + ) + + for node in nodes: + net.add_node(node.id, label=node.name) + for edge in edges: + net.add_edge(edge[0].id, edge[1].id) + + net.save_graph("graph.html") + st.components.v1.html(open("graph.html", "r").read(), height=550) + + if selected_node_name: + selected_node = next(node for node in nodes if node.name == selected_node_name) + st.subheader(f"Details for Node: {selected_node.name}") + st.write(f"**ID**: {selected_node.id}") + st.write(f"**Role**: {selected_node.role_desc}") + st.write(f"**Data**: {selected_node.data}") + st.write(f"**Data ID**: {selected_node.data_id}") + st.write(f"**Previous Value**: {selected_node.previous_data}") + st.write(f"**Requires Optimization**: {selected_node.requires_opt}") + st.write(f"**Type**: {selected_node.param_type}") + st.write(f"**Gradients**: {', '.join(selected_node.get_gradients_names())}") + + +if __name__ == "__main__": + # Dummy data + dummy_nodes = [ + Node(1, "Node1", "Input", "Value1", "D1", "Prev1", "Yes", "Type1", "Grad1"), + Node(2, "Node2", "Process", "Value2", "D2", "Prev2", "No", "Type2", "Grad2"), + Node(3, "Node3", "Output", "Value3", "D3", "Prev3", "Yes", "Type3", "Grad3"), + ] + + dummy_edges = [(dummy_nodes[0], dummy_nodes[1]), (dummy_nodes[1], dummy_nodes[2])] + + # Test HTML generation + create_graph_with_links(dummy_nodes, dummy_edges) + + # Uncomment the following line to test the Streamlit app + # create_interactive_streamlit_app(dummy_nodes, dummy_edges) diff --git a/text_grad_2_0_recompute.py b/text_grad_2_0_recompute.py new file mode 100644 index 00000000..bd52905b --- /dev/null +++ b/text_grad_2_0_recompute.py @@ -0,0 +1,127 @@ +import json +import os +import math + + +def recompute_metrics_and_update_summary(result_file): + try: + # Load the results file + with open(result_file, "r") as f: + ckpt_values = json.load(f) + + # Initialize variables for metrics computation + highest_test_score = 0 + mean_test_score = 0 + standard_deviation = 0 + past_highest_scores = [] + past_highest_test_scores = [] + + average_pass_rate_list = [] + average_pass_prompts_list = [] + average_total_prompts_list = [] + + highest_val_score = 0 + + # Process each experiment + for experiment, data in ckpt_values.items(): + if "summary" in experiment: + continue # Skip summary entries + + ckpt_path = data + + if os.path.exists(ckpt_path): + with open(ckpt_path, "r") as ckpt_file: + experiment_data = json.load(ckpt_file) + + val_scores = experiment_data.get("val_scores", []) + test_scores = experiment_data.get("test_scores", []) + _high_test_score = max(val_scores, default=0) + _high_val_score = max(test_scores, default=0) + + past_highest_scores.append(_high_test_score) + past_highest_test_scores.append(_high_val_score) + + if _high_test_score > highest_test_score: + highest_test_score = _high_test_score + + if _high_val_score > highest_val_score: + highest_val_score = _high_val_score + + effective_measures = experiment_data.get("effective_measure", {}) + + if effective_measures: + pass_num = effective_measures["valset"].get("pass", 0) + total_val_prompts = effective_measures["valset"].get( + "pass", 0 + ) + effective_measures["valset"].get("fail", 0) + else: + total_val_prompts = len(val_scores) - 1 + pass_num = len(set(val_scores)) + + average_pass_rate = ( + pass_num / total_val_prompts if total_val_prompts > 0 else 0 + ) + average_pass_rate_list.append(average_pass_rate) + average_pass_prompts_list.append(pass_num) + average_total_prompts_list.append(total_val_prompts) + + # Compute final metrics + if past_highest_scores: + 
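# Across runs: take the arithmetic mean of the per-run best scores, then the population standard deviation (square root of the mean squared deviation, i.e. dividing by N rather than N - 1).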
mean_test_score = sum(past_highest_scores) / len(past_highest_scores) + standard_deviation = math.sqrt( + sum((x - mean_test_score) ** 2 for x in past_highest_scores) + / len(past_highest_scores) + ) + + average_pass_rate = ( + sum(average_pass_rate_list) / len(average_pass_rate_list) + if average_pass_rate_list + else 0 + ) + average_pass_prompts = ( + sum(average_pass_prompts_list) / len(average_pass_prompts_list) + if average_pass_prompts_list + else 0 + ) + average_total_prompts = ( + sum(average_total_prompts_list) / len(average_total_prompts_list) + if average_total_prompts_list + else 0 + ) + + # Update the summary in ckpt_values + summary_key = "summary" + ckpt_values[summary_key] = { + "highest_test_score": highest_test_score, + "mean_test_score": mean_test_score, + "standard_deviation": standard_deviation, + "average_pass_rate": average_pass_rate, + "average_pass_prompts": average_pass_prompts, + "average_total_prompts": average_total_prompts, + "past_highest_scores": past_highest_scores, + "past_highest_test_scores": past_highest_test_scores, + "highest_val_score": highest_val_score, + } + + # Save updated ckpt_values back to the file + with open(result_file, "w") as f: + json.dump(ckpt_values, f, indent=4) + + return ckpt_values[summary_key] + + except Exception as e: + print(f"Error while recomputing metrics: {e}") + return None + + +# Usage +if __name__ == "__main__": + result_file = "results.json" # Replace with your actual result file + result_file = "text_grad_2_results_4_runs_1872c441-0db2-4640-9cf6-8ef910744a93.json" + result_file = "text_grad_2_results_4_runs_02b9f463-aa21-4485-9899-07ac2542ddac.json" # only use fullset + summary = recompute_metrics_and_update_summary(result_file) + + if summary: + print("Updated Summary:") + for key, value in summary.items(): + print(f"{key}: {value}") diff --git a/tutorials/react_note.py b/tutorials/react_note.py index 58dc93e9..072b318b 100644 --- a/tutorials/react_note.py +++ b/tutorials/react_note.py @@ -68,6 +68,50 @@ def test_react_agent(model_client: ModelClient, model_kwargs: dict): print("") +""" +To have an agent. +input, prompt, template, step_history -> generator +-> stepoutput -> step_history -> generator -> stepoutput -> step_history +-> generator -> stepoutput -> step_history -> generator -> stepoutput -> step_history +""" + + +def test_react_agent_train(model_client: ModelClient, model_kwargs: dict): + tools = [multiply, add, divide] + queries = [ + "What is the capital of France? 
and what is 465 times 321 then add 95297 and then divide by 13.2?", + "Give me 5 words rhyming with cool, and make a 4-sentence poem using them", + ] + # define a generator without tools for comparison + + # generator = Generator( + # model_client=model_client, + # model_kwargs=model_kwargs, + # ) + + react = ReActAgent( + max_steps=6, + add_llm_as_fallback=True, + tools=tools, + model_client=model_client, + model_kwargs=model_kwargs, + ) + # print(react) + react.train() + + for query in queries: + print(f"Query: {query}") + agent_response = react.forward(query) + agent_response.draw_graph() + agent_response.draw_output_subgraph() + # print(f"Agent response: {agent_response}") + + break + # llm_response = generator.call(prompt_kwargs={"input_str": query}) + # print(f"LLM response: {llm_response}") + print("") + + def test_react_agent_use_examples(model_client: ModelClient, model_kwargs: dict): tools = [multiply, add, divide] queries = [ @@ -106,12 +150,12 @@ def test_react_agent_use_examples(model_client: ModelClient, model_kwargs: dict) if __name__ == "__main__": - from adalflow.utils import get_logger - get_logger(level="DEBUG") + # get_logger(level="DEBUG") - test_react_agent(ModelClientType.GROQ(), llama3_model_kwargs) - test_react_agent(ModelClientType.OPENAI(), gpt_model_kwargs) - print("Done") + # test_react_agent(ModelClientType.GROQ(), llama3_model_kwargs) + test_react_agent_train(ModelClientType.OPENAI(), gpt_model_kwargs) + # test_react_agent(ModelClientType.OPENAI(), gpt_model_kwargs) + # print("Done") - test_react_agent_use_examples(ModelClientType.GROQ(), llama3_model_kwargs) + # test_react_agent_use_examples(ModelClientType.GROQ(), llama3_model_kwargs) diff --git a/use_cases/agent/react_agent.ipynb b/use_cases/agent/react_agent.ipynb deleted file mode 100644 index a93cb89e..00000000 --- a/use_cases/agent/react_agent.ipynb +++ /dev/null @@ -1,1387 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ReAct Agent Use Case" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Q&A Chatbot\n", - "In this tutorial, we will implement ``adalflow ReAct`` to build a Q&A chatbot on [HotpotQA](https://arxiv.org/pdf/1809.09600) dataset. \n", - "\n", - "To learn more about ``adalflow ReAct``, please refer to our developer notes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# 2. HotpotQA Dataset\n", - "We are using [HotpotQA](https://arxiv.org/pdf/1809.09600). It is a Wikipedia-based multi-hop question and answer dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/alleria/Documents/sylphAI/lightrag_package/LightRAG/.venv/lib/python3.11/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by promote_options='default'.\n", - " table = cls._concat_blocks(blocks, axis=0)\n" - ] - } - ], - "source": [ - "# load the dataset\n", - "from datasets import load_dataset\n", - "\n", - "dataset = load_dataset(path=\"hotpot_qa\", name=\"fullwiki\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len of eval: 7405\n", - "example: {'id': '5a8b57f25542995d1e6f1371', 'question': 'Were Scott Derrickson and Ed Wood of the same nationality?', 'answer': 'yes', 'type': 'comparison', 'level': 'hard', 'supporting_facts': {'title': ['Scott Derrickson', 'Ed Wood'], 'sent_id': [0, 0]}, 'context': {'title': ['Adam Collis', 'Ed Wood (film)', 'Tyler Bates', 'Doctor Strange (2016 film)', 'Hellraiser: Inferno', 'Sinister (film)', 'Deliver Us from Evil (2014 film)', 'Woodson, Arkansas', 'Conrad Brooks', 'The Exorcism of Emily Rose'], 'sentences': [['Adam Collis is an American filmmaker and actor.', ' He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.', ' He also studied cinema at the University of Southern California from 1991 to 1997.', ' Collis first work was the assistant director for the Scott Derrickson\\'s short \"Love in the Ruins\" (1995).', ' In 1998, he played \"Crankshaft\" in Eric Koyanagi\\'s \"Hundred Percent\".'], ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.', \" The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.\", ' Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.'], ['Tyler Bates (born June 5, 1965) is an American musician, music producer, and composer for films, television, and video games.', ' Much of his work is in the action and horror film genres, with films like \"Dawn of the Dead, 300, Sucker Punch,\" and \"John Wick.\"', ' He has collaborated with directors like Zack Snyder, Rob Zombie, Neil Marshall, William Friedkin, Scott Derrickson, and James Gunn.', ' With Gunn, he has scored every one of the director\\'s films; including \"Guardians of the Galaxy\", which became one of the highest grossing domestic movies of 2014, and its 2017 sequel.', ' In addition, he is also the lead guitarist of the American rock band Marilyn Manson, and produced its albums \"The Pale Emperor\" and \"Heaven Upside Down\".'], ['Doctor Strange is a 2016 American superhero film based on the Marvel Comics character of the same name, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures.', ' It is the fourteenth film of the Marvel Cinematic Universe (MCU).', ' The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. 
Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.', ' In \"Doctor Strange\", surgeon Strange learns the mystic arts after a career-ending car accident.'], ['Hellraiser: Inferno (also known as Hellraiser V: Inferno) is a 2000 American horror film.', ' It is the fifth installment in the \"Hellraiser\" series and the first \"Hellraiser\" film to go straight-to-DVD.', ' It was directed by Scott Derrickson and released on October 3, 2000.', \" The film concerns a corrupt detective who discovers Lemarchand's box at a crime scene.\", \" The film's reviews were mixed.\"], ['Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. Robert Cargill.', ' It stars Ethan Hawke as fictional true-crime writer Ellison Oswalt who discovers a box of home movies in his attic that puts his family in danger.'], ['Deliver Us from Evil is a 2014 American supernatural horror film directed by Scott Derrickson and produced by Jerry Bruckheimer.', ' The film is officially based on a 2001 non-fiction book entitled \"Beware the Night\" by Ralph Sarchie and Lisa Collier Cool, and its marketing campaign highlighted that it was \"inspired by actual accounts\".', ' The film stars Eric Bana, Édgar Ramírez, Sean Harris, Olivia Munn, and Joel McHale in the main roles and was released on July 2, 2014.'], ['Woodson is a census-designated place (CDP) in Pulaski County, Arkansas, in the United States.', ' Its population was 403 at the 2010 census.', ' It is part of the Little Rock–North Little Rock–Conway Metropolitan Statistical Area.', ' Woodson and its accompanying Woodson Lake and Wood Hollow are the namesake for Ed Wood Sr., a prominent plantation owner, trader, and businessman at the turn of the 20th century.', ' Woodson is adjacent to the Wood Plantation, the largest of the plantations own by Ed Wood Sr.'], ['Conrad Brooks (born Conrad Biedrzycki on January 3, 1931 in Baltimore, Maryland) is an American actor.', ' He moved to Hollywood, California in 1948 to pursue a career in acting.', ' He got his start in movies appearing in Ed Wood films such as \"Plan 9 from Outer Space\", \"Glen or Glenda\", and \"Jail Bait.\"', ' He took a break from acting during the 1960s and 1970s but due to the ongoing interest in the films of Ed Wood, he reemerged in the 1980s and has become a prolific actor.', ' He also has since gone on to write, produce and direct several films.'], ['The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.', ' The film is loosely based on the story of Anneliese Michel and follows a self-proclaimed agnostic who acts as defense counsel (Linney) representing a parish priest (Wilkinson), accused by the state of negligent homicide after he performed an exorcism.']]}}\n", - "attributes in each sample: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context']\n" - ] - } - ], - "source": [ - "# check the data sample\n", - "test_sample = dataset[\"validation\"][0]\n", - "print(f\"len of eval: {len(dataset['validation'])}\")\n", - "print(f\"example: {test_sample}\")\n", - "print(f\"attributes in each sample: {list(test_sample.keys())}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "question: Were 
Scott Derrickson and Ed Wood of the same nationality?\n", - "answer: yes\n" - ] - } - ], - "source": [ - "# Each sample contains a question and a corresponding answer.\n", - "print(f\"question: {test_sample.get('question')}\")\n", - "print(f\"answer: {test_sample.get('answer')}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Set up\n", - "Please make sure you have set the model client APIs before running the agent. Now import the necessary packages." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import dotenv\n", - "from adalflow.components.model_client import OpenAIClient\n", - "from adalflow.components.agent.react_agent import ReActAgent\n", - "from adalflow.core.tool_helper import FunctionTool\n", - "\n", - "import time\n", - "\n", - "# load evironment, please set the relative path to your .env file that includes the api key\n", - "dotenv.load_dotenv(dotenv_path=\"../../.env\", override=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. Create Agent\n", - "To create an gent, we need to define the basic components.\n", - "\n", - "## Tools\n", - "Firstly, we need to specify what functions the agent will need to answer the question. In this case, we are answering the Wikipedia-based questions, we will allow the agent to **search** Wikipedia api. The [ReAct Paper](https://arxiv.org/pdf/2210.03629) includes a **lookup** function that serves as Ctrl+F functionality on the browser.\n", - "\n", - "As ``adalflow ReAct`` has a built in ``finish`` function, we don't need to define by ourselves." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from bs4 import BeautifulSoup\n", - "import re\n", - "import string\n", - "\n", - "\n", - "# copy code from the paper\n", - "def clean_str(p):\n", - " return p.encode().decode(\"unicode-escape\").encode(\"latin1\").decode(\"utf-8\")\n", - "\n", - "\n", - "# normalization copied from the paper's code\n", - "def normalize_answer(s):\n", - " def remove_articles(text):\n", - " return re.sub(r\"\\b(a|an|the)\\b\", \" \", text)\n", - "\n", - " def white_space_fix(text):\n", - " return \" \".join(text.split())\n", - "\n", - " def remove_punc(text):\n", - " exclude = set(string.punctuation)\n", - " return \"\".join(ch for ch in text if ch not in exclude)\n", - "\n", - " def lower(text):\n", - " return text.lower()\n", - "\n", - " return white_space_fix(remove_articles(remove_punc(lower(s))))\n", - "\n", - "\n", - "def search(entity: str) -> str:\n", - " \"\"\"\n", - " searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.\n", - " \"\"\"\n", - " # Format the entity for URL encoding\n", - " entity_formatted = entity.replace(\" \", \"+\")\n", - " url = f\"https://en.wikipedia.org/w/index.php?search={entity_formatted}\"\n", - "\n", - " # Fetch the page\n", - " response = requests.get(url)\n", - " soup = BeautifulSoup(response.text, \"html.parser\")\n", - "\n", - " # Check if the exact page was found or suggest similar items\n", - " # when
is detected, it means the entity page is not found on wikipedia\n", - " result_divs = soup.find_all(\"div\", {\"class\": \"mw-search-result-heading\"})\n", - "\n", - " if (\n", - " result_divs\n", - " ): # this means the searched entity page is not in wikipedia, wikipedia will show a list of similar entities\n", - " # get Similar results\n", - " similar_titles = [div.a.get_text() for div in result_divs]\n", - " return f\"Could not find exact page for '{entity}'. Similar topics: {similar_titles[:5]}\" # return the top 5 similar titles\n", - " else:\n", - " # the paper uses page to represent content in

\n", - " # Extract xontent\n", - " page_list = [\n", - " p.get_text().strip() for p in soup.find_all(\"p\") + soup.find_all(\"ul\")\n", - " ]\n", - " # TODO: Recursive search, if find any concept that needs more search then call search again\n", - " # if any(\"may refer to:\" in p for p in page_list):\n", - " # search(entity)\n", - "\n", - " # restructure & clean the page content following the paper's logic\n", - " page = \"\"\n", - " for p in page_list:\n", - " if len(p.split(\" \")) > 2:\n", - " page += clean_str(p)\n", - " if not p.endswith(\"\\n\"):\n", - " page += \"\\n\"\n", - " paragraphs = page.split(\"\\n\")\n", - " paragraphs = [p.strip() for p in paragraphs if p.strip()]\n", - "\n", - " sentences = []\n", - " for p in paragraphs:\n", - " sentences += p.split(\". \")\n", - " sentences = [s.strip() + \".\" for s in sentences if s.strip()]\n", - "\n", - " # return the first 5 sentences\n", - " if sentences:\n", - " return (\n", - " \" \".join(sentences[:5]) if len(sentences) >= 5 else \" \".join(sentences)\n", - " )\n", - " else:\n", - " return \"No content found on this page.\"\n", - "\n", - " # TODO: clean the paragraphs and return the searched content\n", - "\n", - "\n", - "def lookup(text: str, keyword: str) -> str:\n", - " \"\"\"\n", - " returns the sentences containing keyword in the current passage.\n", - " \"\"\"\n", - " sentences = text.split(\".\")\n", - " matching_sentences = [\n", - " sentence.strip() + \".\"\n", - " for sentence in sentences\n", - " if keyword.lower() in sentence.lower()\n", - " ]\n", - " if not matching_sentences:\n", - " return \"No sentences found with the keyword.\"\n", - " else:\n", - " return \" \".join(\n", - " matching_sentences\n", - " ) # Join all matching sentences into a single string" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "# set up tools for the agent\n", - "tools = [FunctionTool.from_defaults(fn=search), FunctionTool.from_defaults(fn=lookup)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Examples\n", - "The next thing to add is examples. Few shot prompt engineering is a common practice to improve the model performance.\n", - "\n", - "Let's use the paper's examples. The paper has 6 examples altogether." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "examples = [\n", - " \"\"\"Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\n", - "Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\n", - "Action 1: search(\"Colorado orogeny\")\n", - "Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\n", - "Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.\n", - "Action 2: lookup(\"eastern sector\")\n", - "Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\n", - "Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. 
So I need to search High Plains and find its elevation range.\n", - "Action 3: search(\"High Plains\")\n", - "Observation 3: High Plains refers to one of two distinct land regions:\n", - "Thought 4: I need to instead search High Plains (United States).\n", - "Action 4: search(\"High Plains (United States)\")\n", - "Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\n", - "Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\n", - "Action 5: finish(\"1,800 to 7,000 ft\")\"\"\",\n", - " \"\"\"Question: Musician and satirist Allie Goertz wrote a song about the \"The Simpsons\" character Milhouse, who Matt Groening named after who?\n", - "Thought 1: The question simplifies to \"The Simpsons\" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.\n", - "Action 1: search(\"Milhouse\")\n", - "Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.\n", - "Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up \"named after\".\n", - "Action 2: lookup(\"named after\")\n", - "Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. \n", - "Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.\n", - "Action 3: finish(\"Richard Nixon\")\"\"\",\n", - " \"\"\"Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?\n", - "Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.\n", - "Action 1: search(\"Adam Clayton Powell\")\n", - "Observation 1: Could not find [\"Adam Clayton Powell\"]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].\n", - "Thought 2: To find the documentary, I can search Adam Clayton Powell (film).\n", - "Action 2: search(\"Adam Clayton Powell (film)\")\n", - "Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.\n", - "The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.\n", - "Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.\n", - "Action 3: finish(\"The Saimaa Gesture\")\"\"\",\n", - " \"\"\"Question: What profession does Nicholas Ray and Elia Kazan have in common?\n", - "Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\n", - "Action 1: search(\"Nicholas Ray\")\n", - "Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\n", - "Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. 
I need to search Elia Kazan next and find his professions.\n", - "Action 2: search(\"Elia Kazan\")\n", - "Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\n", - "Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\n", - "Action 3: finish(\"director, screenwriter, actor\")\"\"\",\n", - " \"\"\"Question: Which magazine was started first Arthur's Magazine or First for Women?\n", - "Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.\n", - "Action 1: search(\"Arthur's Magazine\")\n", - "Observation 1: Arthur's Magazine (1844-€“1846) was an American literary periodical published in Philadelphia in the 19th century. \n", - "Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.\n", - "Action 2: search(\"First for Women\")\n", - "Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. \n", - "Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.\n", - "Action 3: finish(\"Arthur's Magazine\")\"\"\",\n", - " \"\"\"Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?\n", - "Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.\n", - "Action 1: search(\"Pavel Urysohn\")\n", - "Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.\n", - "Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.\n", - "Action 2: search(\"Leonid Levin\")\n", - "Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. \n", - "Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. \n", - "Action 3: finish(\"yes\")\"\"\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "# preset up the examples as prompt_kwargs, the examples will be included in the system prompt\n", - "\n", - "preset_prompt_kwargs = {\"examples\": examples}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model\n", - "\n", - "Next, we can choose the model to call. In this example we will use OpenAIClient ``gpt-3.5-turbo`` model. We will set the ``temperature`` at 0.0 to make the response as consistent as possible." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "gpt_model_kwargs = {\n", - " \"model\": \"gpt-3.5-turbo\",\n", - " \"temperature\": 0.0,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Agent\n", - "Combining the previous components, we can define the agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ReActAgent(\n", - " tools=[FunctionTool(search), FunctionTool(lookup), FunctionTool(llm_tool), FunctionTool(finish)], max_steps=3, model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, \n", - " (prompt): Prompt(\n", - " template: \n", - " {# role/task description #}\n", - " You task is to answer user's query with minimum steps and maximum accuracy using the tools provided.\n", - " {# REACT instructions #}\n", - " Each step you will read the previous Thought, Action, and Observation(execution result of the action)steps and then provide the next Thought and Action.\n", - " \n", - " You only have access to the following tools:\n", - " {# tools #}\n", - " {% for tool in tools %}\n", - " {{ loop.index }}. ToolName: {{ tool.metadata.name }}\n", - " Tool Description: {{ tool.metadata.description }}\n", - " Tool Parameters: {{ tool.metadata.fn_schema_str }} {#tool args can be misleading, especially if we already have type hints and docstring in the function#}\n", - " {% endfor %}\n", - " {# output is always more robust to use json than string #}\n", - " ---\n", - " Your output must be in valid JSON format(raw Python string format) with two keys:\n", - " {\n", - " \"thought\": \"\",\n", - " \"action\": \"ToolName(, )\"\n", - " }\n", - " - Must double quote the JSON str.\n", - " - Inside of the JSON str, Must use escape double quote and escape backslash for string.\n", - " For example:\n", - " \"action\": \"finish(\\\"John's.\\\")\"\n", - " ---\n", - " {# Specifications TODO: preference between the usage of llm tool vs the other tool #}\n", - " Process:\n", - " - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery.\n", - " - Call one available tool at a time to solve each subquery/subquestion. \\\n", - " - At step 'finish', join all subqueries answers and finish the task.\n", - " Remember:\n", - " - Action must call one of the above tools with Took Name. It can not be empty.\n", - " - Read the Tool Description and ensure your args and kwarg follow what each tool expects in types. e.g. (a=1, b=2) if it is keyword argument or (1, 2) if it is positional.\n", - " - You will always end with 'finish' action to finish the task. 
The answer can be the final answer or failure message.\n", - " - When the initial query is simple, use minimum steps to answer the query.\n", - " {#Examples can be here#}\n", - " {# Check if there are any examples #}\n", - " {% if examples %}\n", - " \n", - " {% for example in examples %}\n", - " {{ example }}\n", - " {% endfor %}\n", - " \n", - " {% endif %}\n", - " <>\n", - " -----------------\n", - " {# History #}\n", - " {% for history in step_history %}\n", - " Step {{history.step}}:\n", - " {\n", - " \"thought\": \"{{history.thought}}\",\n", - " \"action\": \"{{history.action}}\",\n", - " }\n", - " \"observation\": \"{{history.observation}}\"\n", - " {% endfor %}\n", - " {% if input_str %}\n", - " User query:\n", - " {{ input_str }}\n", - " {% endif %}\n", - " , preset_prompt_kwargs: {'examples': ['Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\\nThought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\\nAction 1: search(\"Colorado orogeny\")\\nObservation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\\nThought 2: It does not mention the eastern sector. So I need to look up eastern sector.\\nAction 2: lookup(\"eastern sector\")\\nObservation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\\nThought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\\nAction 3: search(\"High Plains\")\\nObservation 3: High Plains refers to one of two distinct land regions:\\nThought 4: I need to instead search High Plains (United States).\\nAction 4: search(\"High Plains (United States)\")\\nObservation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\\nThought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\\nAction 5: finish(\"1,800 to 7,000 ft\")', 'Question: Musician and satirist Allie Goertz wrote a song about the \"The Simpsons\" character Milhouse, who Matt Groening named after who?\\nThought 1: The question simplifies to \"The Simpsons\" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.\\nAction 1: search(\"Milhouse\")\\nObservation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.\\nThought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up \"named after\".\\nAction 2: lookup(\"named after\")\\nObservation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. \\nThought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.\\nAction 3: finish(\"Richard Nixon\")', 'Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?\\nThought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.\\nAction 1: search(\"Adam Clayton Powell\")\\nObservation 1: Could not find [\"Adam Clayton Powell\"]. 
Similar: [\\'Adam Clayton Powell III\\', \\'Seventh Avenue (Manhattan)\\', \\'Adam Clayton Powell Jr. State Office Building\\', \\'Isabel Washington Powell\\', \\'Adam Powell\\', \\'Adam Clayton Powell (film)\\', \\'Giancarlo Esposito\\'].\\nThought 2: To find the documentary, I can search Adam Clayton Powell (film).\\nAction 2: search(\"Adam Clayton Powell (film)\")\\nObservation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.\\nThe film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.\\nThought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.\\nAction 3: finish(\"The Saimaa Gesture\")', 'Question: What profession does Nicholas Ray and Elia Kazan have in common?\\nThought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\\nAction 1: search(\"Nicholas Ray\")\\nObservation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\\nThought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.\\nAction 2: search(\"Elia Kazan\")\\nObservation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\\nThought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\\nAction 3: finish(\"director, screenwriter, actor\")', 'Question: Which magazine was started first Arthur\\'s Magazine or First for Women?\\nThought 1: I need to search Arthur\\'s Magazine and First for Women, and find which was started first.\\nAction 1: search(\"Arthur\\'s Magazine\")\\nObservation 1: Arthur\\'s Magazine (1844-\\x80\\x931846) was an American literary periodical published in Philadelphia in the 19th century. \\nThought 2: Arthur\\'s Magazine was started in 1844. I need to search First for Women next.\\nAction 2: search(\"First for Women\")\\nObservation 2: First for Women is a woman\\'s magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. \\nThought 3: First for Women was started in 1989. 1844 (Arthur\\'s Magazine) < 1989 (First for Women), so Arthur\\'s Magazine was started first.\\nAction 3: finish(\"Arthur\\'s Magazine\")', 'Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?\\nThought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.\\nAction 1: search(\"Pavel Urysohn\")\\nObservation 1: Pavel Samuilovich Urysohn (February 3, 1898 â\\x80\\x93 August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.\\nThought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.\\nAction 2: search(\"Leonid Levin\")\\nObservation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. \\nThought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. 
\\nAction 3: finish(\"yes\")'], 'tools': [FunctionTool(search), FunctionTool(lookup), FunctionTool(llm_tool), FunctionTool(finish)]}, prompt_variables: ['examples', 'step_history', 'input_str', 'tools']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " (output_processors): JsonParser()\n", - " (additional_llm_tool): Generator(\n", - " model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, \n", - " (prompt): Prompt(\n", - " template: \n", - " {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}\n", - " \n", - " {% endif %}\n", - " {# task desc #}\n", - " {% if task_desc_str %}\n", - " {{task_desc_str}}\n", - " {% endif %}\n", - " {# output format #}\n", - " {% if output_format_str %}\n", - " \n", - " {{output_format_str}}\n", - " \n", - " {% endif %}\n", - " {# tools #}\n", - " {% if tools_str %}\n", - " \n", - " {{tools_str}}\n", - " \n", - " {% endif %}\n", - " {# example #}\n", - " {% if examples_str %}\n", - " \n", - " {{examples_str}}\n", - " \n", - " {% endif %}\n", - " {# chat history #}\n", - " {% if chat_history_str %}\n", - " \n", - " {{chat_history_str}}\n", - " \n", - " {% endif %}\n", - " {#contex#}\n", - " {% if context_str %}\n", - " \n", - " {{context_str}}\n", - " \n", - " {% endif %}\n", - " {# steps #}\n", - " {% if steps_str %}\n", - " \n", - " {{steps_str}}\n", - " \n", - " {% endif %}\n", - " {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}\n", - " \n", - " {% endif %}\n", - " {% if input_str %}\n", - " \n", - " {{input_str}}\n", - " \n", - " {% endif %}\n", - " {% if output_str %}\n", - " \n", - " {{output_str}}\n", - " \n", - " {% endif %}\n", - " You:\n", - " , prompt_variables: ['context_str', 'input_str', 'steps_str', 'output_format_str', 'output_str', 'chat_history_str', 'tools_str', 'examples_str', 'task_desc_str']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " )\n", - ")" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# max_steps refers to how many thought-action round we allow the model to perform\n", - "# to save resources, let's use 3 here\n", - "agent = ReActAgent(\n", - " tools=tools,\n", - " max_steps=3,\n", - " model_client=OpenAIClient(),\n", - " model_kwargs=gpt_model_kwargs,\n", - " preset_prompt_kwargs=preset_prompt_kwargs,\n", - ")\n", - "agent" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import importlib\n", - "import adalflow\n", - "\n", - "importlib.reload(adalflow)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 5. Q & A\n", - "Next we can use the agent to answer our questions. Let's run 5 examples. We will use the validation data." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset({\n", - " features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],\n", - " num_rows: 7405\n", - "})" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "val_dataset = dataset[\"validation\"]\n", - "val_dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "``LightRAG`` provides a ``printc`` function. 
You can utilize it to show colored console output for angent." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36m2024-06-15 23:13:47 - [react_agent.py:330:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:48 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search Scott Derrickson and Ed Wood, find their nationalities, then determine if they are the same.\n", - "Action 1: search(\"Scott Derrickson\")\n", - "Observation 1: Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:50 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: Scott Derrickson is an American filmmaker. I need to search Ed Wood next to find his nationality.\n", - "Action 2: search(\"Ed Wood\")\n", - "Observation 2: Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, screenwriter, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:51 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: Scott Derrickson is an American filmmaker and Ed Wood is also an American filmmaker. They are of the same nationality.\n", - "Action 3: finish(\"yes\")\n", - "Observation 3: yes\u001b[0m\n", - "\u001b[36m2024-06-15 23:13:51 - [react_agent.py:345:call] - answer: yes\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search Scott Derrickson and Ed Wood, find their nationalities, then determine if they are the same.', action='search(\"Scott Derrickson\")', fun_name='search', fun_args=['Scott Derrickson'], fun_kwargs={}, observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. 
He graduated from Biola University with a B.A.'), StepOutput(step=2, thought='Scott Derrickson is an American filmmaker. I need to search Ed Wood next to find his nationality.', action='search(\"Ed Wood\")', fun_name='search', fun_args=['Ed Wood'], fun_kwargs={}, observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, screenwriter, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\"), StepOutput(step=3, thought='Scott Derrickson is an American filmmaker and Ed Wood is also an American filmmaker. They are of the same nationality.', action='finish(\"yes\")', fun_name='finish', fun_args=['yes'], fun_kwargs={}, observation='yes')]\n", - "\u001b[33m2024-06-15 23:13:51 - [2706144185.py:12:] - question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: yes\u001b[0m\n", - "\u001b[36m2024-06-15 23:13:51 - [react_agent.py:330:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:53 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to find the woman who portrayed Corliss Archer in the film Kiss and Tell, then search for the government position she held.\n", - "Action 1: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 1: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:55 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\n", - "Action 2: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 2: Could not find exact page for 'Corliss Archer Kiss and Tell film'. 
Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\",\n", - " \"action\": \"search(\\\"Corliss Archer Kiss and Tell film\\\")\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:13:57 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\n", - "Action 3: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 3: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "\u001b[36m2024-06-15 23:13:57 - [react_agent.py:345:call] - answer: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to find the woman who portrayed Corliss Archer in the film Kiss and Tell, then search for the government position she held.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=2, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=3, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. 
Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\")]\n", - "\u001b[33m2024-06-15 23:13:57 - [2706144185.py:12:] - question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: could not find exact page for corliss archer kiss and tell film similar topics kiss for corliss corliss archer kiss and tell 1945 film kiss and tell play meet corliss archer\u001b[0m\n", - "\u001b[36m2024-06-15 23:13:57 - [react_agent.py:330:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:59 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for a science fantasy young adult series told in first person, with companion books narrating the stories of enslaved worlds and alien species.\n", - "Action 1: search(\"science fantasy young adult series first person companion books enslaved worlds alien species\")\n", - "Observation 1: Could not find exact page for 'science fantasy young adult series first person companion books enslaved worlds alien species'. Similar topics: ['Animorphs', 'Last Legionary', 'Barsoom', 'List of The Outer Limits (1995 TV series) episodes', 'LGBT themes in speculative fiction']\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:00 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The search results suggest 'Animorphs' as a similar topic, I should search for 'Animorphs' to see if it matches the criteria.\n", - "Action 2: search('Animorphs')\n", - "Observation 2: Animorphs is a science fantasy series of youth books written by Katherine Applegate and her husband Michael Grant,[2] writing together under the name K. A. Applegate,[3] and published by Scholastic.[4] It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. Horror, war, imperialism, dehumanization, sanity, morality, innocence, leadership, freedom, family, and growing up are the core themes of the series.. Published between June 1996 and May 2001, the series consists of 54 books and includes ten companion books, eight of which fit into the series' continuity (the Animorphs Chronicles and Megamorphs books) and two that are gamebooks not fitting into the continuity (the Alternamorphs books)..\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:01 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The series that matches the criteria is 'Animorphs'. I should provide this as the answer.\n", - "Action 3: finish(\"Animorphs\")\n", - "Observation 3: Animorphs\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:01 - [react_agent.py:345:call] - answer: Animorphs\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for a science fantasy young adult series told in first person, with companion books narrating the stories of enslaved worlds and alien species.', action='search(\"science fantasy young adult series first person companion books enslaved worlds alien species\")', fun_name='search', fun_args=['science fantasy young adult series first person companion books enslaved worlds alien species'], fun_kwargs={}, observation=\"Could not find exact page for 'science fantasy young adult series first person companion books enslaved worlds alien species'. 
Similar topics: ['Animorphs', 'Last Legionary', 'Barsoom', 'List of The Outer Limits (1995 TV series) episodes', 'LGBT themes in speculative fiction']\"), StepOutput(step=2, thought=\"The search results suggest 'Animorphs' as a similar topic, I should search for 'Animorphs' to see if it matches the criteria.\", action=\"search('Animorphs')\", fun_name='search', fun_args=['Animorphs'], fun_kwargs={}, observation=\"Animorphs is a science fantasy series of youth books written by Katherine Applegate and her husband Michael Grant,[2] writing together under the name K. A. Applegate,[3] and published by Scholastic.[4] It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. Horror, war, imperialism, dehumanization, sanity, morality, innocence, leadership, freedom, family, and growing up are the core themes of the series.. Published between June 1996 and May 2001, the series consists of 54 books and includes ten companion books, eight of which fit into the series' continuity (the Animorphs Chronicles and Megamorphs books) and two that are gamebooks not fitting into the continuity (the Alternamorphs books)..\"), StepOutput(step=3, thought=\"The series that matches the criteria is 'Animorphs'. I should provide this as the answer.\", action='finish(\"Animorphs\")', fun_name='finish', fun_args=['Animorphs'], fun_kwargs={}, observation='Animorphs')]\n", - "\u001b[33m2024-06-15 23:14:01 - [2706144185.py:12:] - question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: animorphs\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:01 - [react_agent.py:330:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:03 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the locations of Laleli Mosque and Esma Sultan Mansion to determine if they are in the same neighborhood.\n", - "Action 1: search(\"Laleli Mosque\")\n", - "Observation 1: The Laleli Mosque (Turkish: Laleli Camii, lit. 'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. 
The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:04 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the location of Esma Sultan Mansion to determine if it is in the same neighborhood as Laleli Mosque.\n", - "Action 2: search(\"Esma Sultan Mansion\")\n", - "Observation 2: The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. 
Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:05 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to determine if Laleli Mosque and Esma Sultan Mansion are in the same neighborhood by comparing their locations.\n", - "Action 3: finish(\"No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\")\n", - "Observation 3: No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:05 - [react_agent.py:345:call] - answer: No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the locations of Laleli Mosque and Esma Sultan Mansion to determine if they are in the same neighborhood.', action='search(\"Laleli Mosque\")', fun_name='search', fun_args=['Laleli Mosque'], fun_kwargs={}, observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\"), StepOutput(step=2, thought='I need to search for the location of Esma Sultan Mansion to determine if it is in the same neighborhood as Laleli Mosque.', action='search(\"Esma Sultan Mansion\")', fun_name='search', fun_args=['Esma Sultan Mansion'], fun_kwargs={}, observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].'), StepOutput(step=3, thought='I need to determine if Laleli Mosque and Esma Sultan Mansion are in the same neighborhood by comparing their locations.', action='finish(\"No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\")', fun_name='finish', fun_args=['No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.'], fun_kwargs={}, observation='No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.')]\n", - "\u001b[33m2024-06-15 23:14:05 - [2706144185.py:12:] - question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque is located in laleli fatih istanbul turkey while esma sultan mansion is in ortaköy neighborhood of istanbul turkey\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:06 - [react_agent.py:330:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:07 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the director of the romantic comedy 'Big Stone Gap' and then find the New York city they are based in.\n", - "Action 1: search('Big Stone Gap director')\n", - "Observation 1: Could not find exact page for 'Big Stone Gap director'. 
Similar topics: ['Big Stone Gap (film)', 'Adriana Trigiani', 'Union High School (Big Stone Gap, Virginia)', 'Star Wood', 'Dagmara Domińczyk']\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:09 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\n", - "Action 2: search('Big Stone Gap film location')\n", - "Observation 2: Could not find exact page for 'Big Stone Gap film location'. Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\",\n", - " \"action\": \"search('Big Stone Gap film location')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:14:11 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\n", - "Action 3: search('Big Stone Gap film location')\n", - "Observation 3: Could not find exact page for 'Big Stone Gap film location'. Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:11 - [react_agent.py:345:call] - answer: Could not find exact page for 'Big Stone Gap film location'. Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought=\"I need to search for the director of the romantic comedy 'Big Stone Gap' and then find the New York city they are based in.\", action=\"search('Big Stone Gap director')\", fun_name='search', fun_args=['Big Stone Gap director'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap director'. Similar topics: ['Big Stone Gap (film)', 'Adriana Trigiani', 'Union High School (Big Stone Gap, Virginia)', 'Star Wood', 'Dagmara Domińczyk']\"), StepOutput(step=2, thought=\"I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\", action=\"search('Big Stone Gap film location')\", fun_name='search', fun_args=['Big Stone Gap film location'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film location'. Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\"), StepOutput(step=3, thought=\"I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\", action=\"search('Big Stone Gap film location')\", fun_name='search', fun_args=['Big Stone Gap film location'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film location'. 
Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\")]\n", - "\u001b[33m2024-06-15 23:14:11 - [2706144185.py:12:] - question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: could not find exact page for big stone gap film location similar topics big stone gap virginia adriana trigiani dagmara domińczyk big lebowski beach film\u001b[0m\n" - ] - } - ], - "source": [ - "from adalflow.utils.logger import printc\n", - "\n", - "num_questions = 5\n", - "for i in range(num_questions):\n", - " question = val_dataset[i][\"question\"]\n", - " gt_answer = normalize_answer(\n", - " val_dataset[i][\"answer\"]\n", - " ) # normalize the ground truth answer\n", - "\n", - " # get the agent's response\n", - " pred_answer = agent(question)\n", - " pred_answer = normalize_answer(pred_answer)\n", - "\n", - " printc(\n", - " f\"question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", - " color=\"yellow\",\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 6. Evaluation\n", - "\n", - "Now you will see that we get the ``exact correct answer`` for some questions:\n", - "\n", - "question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``yes``\n", - "\n", - "question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: ``animorphs``, pred answer: ``animorphs``\n", - "\n", - "Sometimes the agent answers correctly but not in the same format as the ground truth, e.g. ground truth: ``no``, pred answer: ``no, they are not the same``. This is something we can tolerate.\n", - "\n", - "But how do we know whether the agent is doing well, or whether our tools, examples, and prompt implementations work well? We need to evaluate it.\n", - "\n", - "1. Exact Match (EM)\n", - "Exact Match is the metric used in the paper: a prediction counts as correct only when the normalized agent response is exactly the same as the normalized ground truth answer. The paper reports an EM of around 30% for the ReAct agent (gpt-3).\n", - "\n", - "2. Fuzzy Match (FM)\n", - "EM is often too strict for question answering because LLMs tend to produce lengthy responses. So we also use fuzzy match: if the ground truth answer is contained in the agent response, we count it as correct. FM is not always reliable.\n", - "E.g. question: Harry Potter and Dumbledore, who is older? ground truth: ``dumbledore``, pred answer: ``harry potter is older than dumbledore.``\n", - "The model mentions the ground truth but still gives the wrong answer, so FM should only serve as a reference.\n", - "\n", - "Let's use the ``LightRAG eval`` module to evaluate on 10 questions, following the paper's practice of setting ``max_steps`` to `7`.\n",
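- "\n", - "To make the two metrics concrete, below is a minimal sketch of what they compute. It reuses the ``normalize_answer`` helper defined earlier in this notebook; the function names are only illustrative and are not necessarily how the ``LightRAG eval`` module implements them:\n", - "\n", - "```python\n", - "def exact_match(pred: str, gt: str) -> bool:\n", - "    # correct only when the normalized prediction equals the normalized ground truth\n", - "    return normalize_answer(pred) == normalize_answer(gt)\n", - "\n", - "\n", - "def fuzzy_match(pred: str, gt: str) -> bool:\n", - "    # correct when the normalized ground truth appears anywhere in the normalized prediction\n", - "    return normalize_answer(gt) in normalize_answer(pred)\n", - "```"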
- ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36m2024-06-15 23:15:11 - [react_agent.py:330:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:13 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search Scott Derrickson and Ed Wood, find their nationalities, then determine if they are the same.\n", - "Action 1: search(\"Scott Derrickson\")\n", - "Observation 1: Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:15 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: Scott Derrickson is an American filmmaker. I need to search Ed Wood next to find his nationality.\n", - "Action 2: search(\"Ed Wood\")\n", - "Observation 2: Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, screenwriter, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:15 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: Scott Derrickson is an American filmmaker and Ed Wood is also American. So, they are of the same nationality.\n", - "Action 3: finish(\"yes\")\n", - "Observation 3: yes\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:15 - [react_agent.py:345:call] - answer: yes\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search Scott Derrickson and Ed Wood, find their nationalities, then determine if they are the same.', action='search(\"Scott Derrickson\")', fun_name='search', fun_args=['Scott Derrickson'], fun_kwargs={}, observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.'), StepOutput(step=2, thought='Scott Derrickson is an American filmmaker. 
I need to search Ed Wood next to find his nationality.', action='search(\"Ed Wood\")', fun_name='search', fun_args=['Ed Wood'], fun_kwargs={}, observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, screenwriter, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\"), StepOutput(step=3, thought='Scott Derrickson is an American filmmaker and Ed Wood is also American. So, they are of the same nationality.', action='finish(\"yes\")', fun_name='finish', fun_args=['yes'], fun_kwargs={}, observation='yes')]\n", - "\u001b[33m2024-06-15 23:15:15 - [3641068398.py:26:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: yes\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:15 - [react_agent.py:330:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:17 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to find the woman who portrayed Corliss Archer in the film Kiss and Tell, then search for the government position she held.\n", - "Action 1: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 1: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:19 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\n", - "Action 2: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 2: Could not find exact page for 'Corliss Archer Kiss and Tell film'. 
Similar topics: ['Corliss Archer', 'A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\",\n", - " \"action\": \"search(\\\"Corliss Archer Kiss and Tell film\\\")\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:21 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\n", - "Action 3: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 3: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:23 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.\n", - "Action 4: search(\"Corliss Archer film Kiss and Tell cast\")\n", - "Observation 4: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:25 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.\n", - "Action 5: search(\"Corliss Archer film Kiss and Tell cast\")\n", - "Observation 5: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:27 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.\n", - "Action 6: search(\"Corliss Archer film Kiss and Tell cast\")\n", - "Observation 6: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:29 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.\n", - "Action 7: search(\"Corliss Archer film Kiss and Tell cast\")\n", - "Observation 7: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:29 - [react_agent.py:345:call] - answer: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. 
Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to find the woman who portrayed Corliss Archer in the film Kiss and Tell, then search for the government position she held.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=2, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['Corliss Archer', 'A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=3, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=4, thought='I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.', action='search(\"Corliss Archer film Kiss and Tell cast\")', fun_name='search', fun_args=['Corliss Archer film Kiss and Tell cast'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\"), StepOutput(step=5, thought='I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.', action='search(\"Corliss Archer film Kiss and Tell cast\")', fun_name='search', fun_args=['Corliss Archer film Kiss and Tell cast'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\"), StepOutput(step=6, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.', action='search(\"Corliss Archer film Kiss and Tell cast\")', fun_name='search', fun_args=['Corliss Archer film Kiss and Tell cast'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. 
Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\"), StepOutput(step=7, thought='I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.', action='search(\"Corliss Archer film Kiss and Tell cast\")', fun_name='search', fun_args=['Corliss Archer film Kiss and Tell cast'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\")]\n", - "\u001b[33m2024-06-15 23:15:29 - [3641068398.py:26:] - No. 2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: could not find exact page for corliss archer film kiss and tell cast similar topics kiss for corliss kiss and tell 1945 film kiss and tell play janet waldo kathryn card\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:29 - [react_agent.py:330:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:32 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for a science fantasy young adult series told in first person and involving companion books about enslaved worlds and alien species.\n", - "Action 1: search(\"science fantasy young adult series first person companion books enslaved worlds alien species\")\n", - "Observation 1: Could not find exact page for 'science fantasy young adult series first person companion books enslaved worlds alien species'. Similar topics: ['Animorphs', 'Last Legionary', 'Barsoom', 'List of The Outer Limits (1995 TV series) episodes', 'LGBT themes in speculative fiction']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:34 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The search did not return an exact match. I should search for 'Animorphs' as it involves young adult science fantasy, first-person narration, enslaved worlds, and alien species.\n", - "Action 2: search('Animorphs')\n", - "Observation 2: Animorphs is a science fantasy series of youth books written by Katherine Applegate and her husband Michael Grant,[2] writing together under the name K. A. Applegate,[3] and published by Scholastic.[4] It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. Horror, war, imperialism, dehumanization, sanity, morality, innocence, leadership, freedom, family, and growing up are the core themes of the series.. Published between June 1996 and May 2001, the series consists of 54 books and includes ten companion books, eight of which fit into the series' continuity (the Animorphs Chronicles and Megamorphs books) and two that are gamebooks not fitting into the continuity (the Alternamorphs books)..\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:35 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The series that matches the description is Animorphs. 
I should provide this as the answer.\n", - "Action 3: finish(\"Animorphs\")\n", - "Observation 3: Animorphs\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:35 - [react_agent.py:345:call] - answer: Animorphs\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for a science fantasy young adult series told in first person and involving companion books about enslaved worlds and alien species.', action='search(\"science fantasy young adult series first person companion books enslaved worlds alien species\")', fun_name='search', fun_args=['science fantasy young adult series first person companion books enslaved worlds alien species'], fun_kwargs={}, observation=\"Could not find exact page for 'science fantasy young adult series first person companion books enslaved worlds alien species'. Similar topics: ['Animorphs', 'Last Legionary', 'Barsoom', 'List of The Outer Limits (1995 TV series) episodes', 'LGBT themes in speculative fiction']\"), StepOutput(step=2, thought=\"The search did not return an exact match. I should search for 'Animorphs' as it involves young adult science fantasy, first-person narration, enslaved worlds, and alien species.\", action=\"search('Animorphs')\", fun_name='search', fun_args=['Animorphs'], fun_kwargs={}, observation=\"Animorphs is a science fantasy series of youth books written by Katherine Applegate and her husband Michael Grant,[2] writing together under the name K. A. Applegate,[3] and published by Scholastic.[4] It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. Horror, war, imperialism, dehumanization, sanity, morality, innocence, leadership, freedom, family, and growing up are the core themes of the series.. Published between June 1996 and May 2001, the series consists of 54 books and includes ten companion books, eight of which fit into the series' continuity (the Animorphs Chronicles and Megamorphs books) and two that are gamebooks not fitting into the continuity (the Alternamorphs books)..\"), StepOutput(step=3, thought='The series that matches the description is Animorphs. I should provide this as the answer.', action='finish(\"Animorphs\")', fun_name='finish', fun_args=['Animorphs'], fun_kwargs={}, observation='Animorphs')]\n", - "\u001b[33m2024-06-15 23:15:35 - [3641068398.py:26:] - No. 3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: animorphs\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:35 - [react_agent.py:330:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:36 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the locations of Laleli Mosque and Esma Sultan Mansion to determine if they are in the same neighborhood.\n", - "Action 1: search(\"Laleli Mosque\")\n", - "Observation 1: The Laleli Mosque (Turkish: Laleli Camii, lit. 'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. 
The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:37 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the location of Esma Sultan Mansion to determine if it is in the same neighborhood as Laleli Mosque.\n", - "Action 2: search(\"Esma Sultan Mansion\")\n", - "Observation 2: The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. 
Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:39 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to determine if Laleli Mosque and Esma Sultan Mansion are in the same neighborhood by comparing their locations.\n", - "Action 3: finish(\"No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\")\n", - "Observation 3: No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:39 - [react_agent.py:345:call] - answer: No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the locations of Laleli Mosque and Esma Sultan Mansion to determine if they are in the same neighborhood.', action='search(\"Laleli Mosque\")', fun_name='search', fun_args=['Laleli Mosque'], fun_kwargs={}, observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\"), StepOutput(step=2, thought='I need to search for the location of Esma Sultan Mansion to determine if it is in the same neighborhood as Laleli Mosque.', action='search(\"Esma Sultan Mansion\")', fun_name='search', fun_args=['Esma Sultan Mansion'], fun_kwargs={}, observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].'), StepOutput(step=3, thought='I need to determine if Laleli Mosque and Esma Sultan Mansion are in the same neighborhood by comparing their locations.', action='finish(\"No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\")', fun_name='finish', fun_args=['No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.'], fun_kwargs={}, observation='No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.')]\n", - "\u001b[33m2024-06-15 23:15:39 - [3641068398.py:26:] - No. 4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque is located in laleli fatih istanbul turkey while esma sultan mansion is in ortaköy neighborhood of istanbul turkey\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:39 - [react_agent.py:330:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:43 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the director of the romantic comedy 'Big Stone Gap' and then find the New York city they are based in.\n", - "Action 1: search('Big Stone Gap film director')\n", - "Observation 1: Could not find exact page for 'Big Stone Gap film director'. Similar topics: ['Big Stone Gap (film)', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'C. C. 
Swiney']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:45 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 2: search('Big Stone Gap film')\n", - "Observation 2: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:47 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 3: search('Big Stone Gap film')\n", - "Observation 3: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:49 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 4: search('Big Stone Gap film')\n", - "Observation 4: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:51 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 5: search('Big Stone Gap film')\n", - "Observation 5: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:53 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 6: search('Big Stone Gap film')\n", - "Observation 6: Could not find exact page for 'Big Stone Gap film'. 
Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:54 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 7: search('Big Stone Gap film')\n", - "Observation 7: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:54 - [react_agent.py:345:call] - answer: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought=\"I need to search for the director of the romantic comedy 'Big Stone Gap' and then find the New York city they are based in.\", action=\"search('Big Stone Gap film director')\", fun_name='search', fun_args=['Big Stone Gap film director'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film director'. Similar topics: ['Big Stone Gap (film)', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'C. C. Swiney']\"), StepOutput(step=2, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=3, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=4, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=5, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. 
Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=6, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=7, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\")]\n", - "\u001b[33m2024-06-15 23:15:54 - [3641068398.py:26:] - No. 5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: could not find exact page for big stone gap film similar topics big stone gap film big stone gap virginia adriana trigiani dagmara domińczyk darién gap\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:54 - [react_agent.py:330:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:57 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the debut album 2014 S/S of a South Korean boy group and find out who formed the group.\n", - "Action 1: search(\"2014 S/S South Korean boy group debut album\")\n", - "Observation 1: Could not find exact page for '2014 S/S South Korean boy group debut album'. Similar topics: ['2014 S/S', 'S.Coups', 'S (South Korean band)', 'List of South Korean boy bands', 'Kara (South Korean group)']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:58 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: To find out who formed the group, I need to search for the South Korean boy group's debut album 2014 S/S and look for information on the group's formation.\n", - "Action 2: search('S (South Korean band)')\n", - "Observation 2: S (에스) is a South Korean project group consisting of three members: Kangta, Lee Ji-hoon and Shin Hye-sung. The group debuted in 2003, under the SM Entertainment label. After 11 years, they released and promoted another mini-album in 2014.. In 2003, S released their first album Fr.In.Cl, which stands for Friends in Classic. 
In 2014, after more than a decade in hiatus, the group released their second mini-album Autumn Breeze on October 27.\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"To find out who formed the South Korean boy group that debuted with the album 2014 S/S, I need to search for the album's details and look for information on the group's formation.\",\n", - " \"action\": \"search('2014 S/S (album)')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:16:00 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: To find out who formed the South Korean boy group that debuted with the album 2014 S/S, I need to search for the album's details and look for information on the group's formation.\n", - "Action 3: search('2014 S/S (album)')\n", - "Observation 3: 2014 S/S is the debut album of South Korean group Winner. It was released on August 12, 2014, by the group's record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album's songs.[1]. The members produced the majority of the music for this album themselves, with the help of other producers such as Choice 37, B.I., Airplay, and others.[2] The album was highlighted for incorporating elements generally absent from K-pop releases, including hints of acoustic and alternative rock.[3][4]. The title track \"Empty\" became a hit in South Korea, topping the Gaon Chart and Billboard's K-Pop Hot 100 chart.[5][6] Internationally, 2014 S/S took the top spot in Billboard's World Album Chart.[7].\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to search for Winner's formation details.\",\n", - " \"action\": \"search('Winner (South Korean band)')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:16:02 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to search for Winner's formation details.\n", - "Action 4: search('Winner (South Korean band)')\n", - "Observation 4: Winner (Korean: 위너; romanized: wineo; stylized in all caps) is a South Korean boy band consisting of four members: Jinu, Hoony, Mino and Seungyoon. The band was formed in 2013 by YG Entertainment through Mnet's survival program WIN: Who Is Next leading to their official debut on August 17, 2014, with studio album 2014 S/S. Originally a five-piece band, Taehyun departed in November 2016 for his mental health and well-being. Upon their debut, The Korea Herald cited their immediate success as \"unprecedented\" for a new group, thus dubbed as \"Monster rookies\". 
Winner is often cited as a \"self-producing\" band with all members actively taking on roles from producing and choreographing to designing and marketing.\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:03 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to provide the name of the group that formed Winner.\n", - "Action 5: finish(\"YG Entertainment\")\n", - "Observation 5: YG Entertainment\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:03 - [react_agent.py:345:call] - answer: YG Entertainment\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the debut album 2014 S/S of a South Korean boy group and find out who formed the group.', action='search(\"2014 S/S South Korean boy group debut album\")', fun_name='search', fun_args=['2014 S/S South Korean boy group debut album'], fun_kwargs={}, observation=\"Could not find exact page for '2014 S/S South Korean boy group debut album'. Similar topics: ['2014 S/S', 'S.Coups', 'S (South Korean band)', 'List of South Korean boy bands', 'Kara (South Korean group)']\"), StepOutput(step=2, thought=\"To find out who formed the group, I need to search for the South Korean boy group's debut album 2014 S/S and look for information on the group's formation.\", action=\"search('S (South Korean band)')\", fun_name='search', fun_args=['S (South Korean band)'], fun_kwargs={}, observation='S (에스) is a South Korean project group consisting of three members: Kangta, Lee Ji-hoon and Shin Hye-sung. The group debuted in 2003, under the SM Entertainment label. After 11 years, they released and promoted another mini-album in 2014.. In 2003, S released their first album Fr.In.Cl, which stands for Friends in Classic. In 2014, after more than a decade in hiatus, the group released their second mini-album Autumn Breeze on October 27.'), StepOutput(step=3, thought=\"To find out who formed the South Korean boy group that debuted with the album 2014 S/S, I need to search for the album's details and look for information on the group's formation.\", action=\"search('2014 S/S (album)')\", fun_name='search', fun_args=['2014 S/S (album)'], fun_kwargs={}, observation='2014 S/S is the debut album of South Korean group Winner. It was released on August 12, 2014, by the group\\'s record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album\\'s songs.[1]. The members produced the majority of the music for this album themselves, with the help of other producers such as Choice 37, B.I., Airplay, and others.[2] The album was highlighted for incorporating elements generally absent from K-pop releases, including hints of acoustic and alternative rock.[3][4]. The title track \"Empty\" became a hit in South Korea, topping the Gaon Chart and Billboard\\'s K-Pop Hot 100 chart.[5][6] Internationally, 2014 S/S took the top spot in Billboard\\'s World Album Chart.[7].'), StepOutput(step=4, thought=\"To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to search for Winner's formation details.\", action=\"search('Winner (South Korean band)')\", fun_name='search', fun_args=['Winner (South Korean band)'], fun_kwargs={}, observation='Winner (Korean:\\xa0위너; romanized: wineo; stylized in all caps) is a South Korean boy band consisting of four members: Jinu, Hoony, Mino and Seungyoon. 
The band was formed in 2013 by YG Entertainment through Mnet\\'s survival program WIN: Who Is Next leading to their official debut on August 17, 2014, with studio album 2014 S/S. Originally a five-piece band, Taehyun departed in November 2016 for his mental health and well-being. Upon their debut, The Korea Herald cited their immediate success as \"unprecedented\" for a new group, thus dubbed as \"Monster rookies\". Winner is often cited as a \"self-producing\" band with all members actively taking on roles from producing and choreographing to designing and marketing.'), StepOutput(step=5, thought='To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to provide the name of the group that formed Winner.', action='finish(\"YG Entertainment\")', fun_name='finish', fun_args=['YG Entertainment'], fun_kwargs={}, observation='YG Entertainment')]\n", - "\u001b[33m2024-06-15 23:16:03 - [3641068398.py:26:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: yg entertainment\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:03 - [react_agent.py:330:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:05 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to find the person known by the stage name Aladin and who worked as a consultant to help organizations improve their performance.\n", - "Action 1: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 1: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:07 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I should try searching for Aladin's stage name and his work as a consultant to help organizations improve their performance.\n", - "Action 2: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 2: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:08 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 3: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 3: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:10 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 4: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 4: Could not find exact page for 'Aladin stage name consultant organizations performance'. 
Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:12 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 5: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 5: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:13 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 6: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 6: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:15 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 7: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 7: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:15 - [react_agent.py:345:call] - answer: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to find the person known by the stage name Aladin and who worked as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=2, thought=\"I should try searching for Aladin's stage name and his work as a consultant to help organizations improve their performance.\", action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=3, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. 
Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=4, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=5, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=6, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=7, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\")]\n", - "\u001b[33m2024-06-15 23:16:15 - [3641068398.py:26:] - No. 7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: could not find exact page for aladin stage name consultant organizations performance similar topics 2009 in film bret gilliam\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:15 - [react_agent.py:330:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:17 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the arena where the Lewiston Maineiacs played their home games and find out its seating capacity.\n", - "Action 1: search(\"Lewiston Maineiacs arena\")\n", - "Observation 1: Could not find exact page for 'Lewiston Maineiacs arena'. Similar topics: ['Lewiston Maineiacs', 'The Colisée', 'Centre Agnico Eagle', 'Sherbrooke Castors', 'Trois-Rivières Draveurs']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:18 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the seating capacity of the arena where the Lewiston Maineiacs played their home games.\n", - "Action 2: search(\"The Colisée seating capacity\")\n", - "Observation 2: Could not find exact page for 'The Colisée seating capacity'. 
Similar topics: ['Colisée de Québec', 'List of ice hockey arenas by capacity', 'List of indoor arenas in the United States', 'Colosseum', 'List of indoor arenas in Canada']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:20 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the seating capacity of the Colisée de Québec where the Lewiston Maineiacs played their home games.\n", - "Action 3: search(\"Colisée de Québec seating capacity\")\n", - "Observation 3: Could not find exact page for 'Colisée de Québec seating capacity'. Similar topics: ['Colisée de Québec', 'List of ice hockey arenas by capacity', 'Videotron Centre', 'List of indoor arenas by capacity', 'List of indoor arenas in Canada']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:22 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.\n", - "Action 4: search(\"Videotron Centre seating capacity\")\n", - "Observation 4: Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:24 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.\n", - "Action 5: search(\"Videotron Centre seating capacity\")\n", - "Observation 5: Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:25 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.\n", - "Action 6: search(\"Videotron Centre seating capacity\")\n", - "Observation 6: Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:27 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.\n", - "Action 7: search(\"Videotron Centre seating capacity\")\n", - "Observation 7: Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:27 - [react_agent.py:345:call] - answer: Could not find exact page for 'Videotron Centre seating capacity'. 
Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the arena where the Lewiston Maineiacs played their home games and find out its seating capacity.', action='search(\"Lewiston Maineiacs arena\")', fun_name='search', fun_args=['Lewiston Maineiacs arena'], fun_kwargs={}, observation=\"Could not find exact page for 'Lewiston Maineiacs arena'. Similar topics: ['Lewiston Maineiacs', 'The Colisée', 'Centre Agnico Eagle', 'Sherbrooke Castors', 'Trois-Rivières Draveurs']\"), StepOutput(step=2, thought='I need to search for the seating capacity of the arena where the Lewiston Maineiacs played their home games.', action='search(\"The Colisée seating capacity\")', fun_name='search', fun_args=['The Colisée seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'The Colisée seating capacity'. Similar topics: ['Colisée de Québec', 'List of ice hockey arenas by capacity', 'List of indoor arenas in the United States', 'Colosseum', 'List of indoor arenas in Canada']\"), StepOutput(step=3, thought='I need to search for the seating capacity of the Colisée de Québec where the Lewiston Maineiacs played their home games.', action='search(\"Colisée de Québec seating capacity\")', fun_name='search', fun_args=['Colisée de Québec seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Colisée de Québec seating capacity'. Similar topics: ['Colisée de Québec', 'List of ice hockey arenas by capacity', 'Videotron Centre', 'List of indoor arenas by capacity', 'List of indoor arenas in Canada']\"), StepOutput(step=4, thought='I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.', action='search(\"Videotron Centre seating capacity\")', fun_name='search', fun_args=['Videotron Centre seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\"), StepOutput(step=5, thought='I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.', action='search(\"Videotron Centre seating capacity\")', fun_name='search', fun_args=['Videotron Centre seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\"), StepOutput(step=6, thought='I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.', action='search(\"Videotron Centre seating capacity\")', fun_name='search', fun_args=['Videotron Centre seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Videotron Centre seating capacity'. 
Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\"), StepOutput(step=7, thought='I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.', action='search(\"Videotron Centre seating capacity\")', fun_name='search', fun_args=['Videotron Centre seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\")]\n", - "\u001b[33m2024-06-15 23:16:27 - [3641068398.py:26:] - No. 8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: could not find exact page for videotron centre seating capacity similar topics videotron centre list of ice hockey arenas by capacity colisée de québec list of indoor arenas by capacity potential national hockey league expansion\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:27 - [react_agent.py:330:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:28 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 1: search(\"Annie Morton\")\n", - "Observation 1: Annie Morton (born October 8, 1970) is an American model[1] born in Pennsylvania.[2] She has appeared on the covers of British Vogue, ID, Marie Claire, and other magazines. She has been photographed by Helmut Newton; Peter Lindbergh; Annie Leibovitz; Richard Avedon; Juergen Teller;[3] Paul Jasmin, Mary Ellen Mark, Stephen Shames, and Terry Richardson, and modeled for Donna Karan,[4] Givenchy, Guerlain, Chanel, Harper's Bazaar, Sports Illustrated and Victoria's Secret.[5] A long time vegetarian, an advocate for organic lifestyle choices and natural healthcare. She co-founded Tsi-La Organics, a \"Green Luxury\" company that creates and sells vegan, organic perfume and skin care products.[6]. She has appeared on many magazine covers and has been featured in several professional photobooks, Peter Lindbergh Selected Work, the cover of Juergen Teller By Juergen Teller, Helmut Newton Pages From The Glossies, and Donna Karan's book A Journey Of A Woman 20 Years.[7] In 1997 she posed for the Pirelli Calendar with Richard Avedon.. Morton has also appeared in several music videos.\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:29 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the age of Terry Richardson to compare it with Annie Morton's age.\n", - "Action 2: search(\"Terry Richardson\")\n", - "Observation 2: Terrence Richardson (born August 14, 1965) is an American fashion and portrait photographer. He has shot advertising campaigns for Marc Jacobs, Aldo, Supreme, Sisley, Tom Ford, and Yves Saint Laurent among others, and also done work for magazines such as Rolling Stone, GQ, Vogue, Vanity Fair, Harper's Bazaar, i-D, and Vice.. 
Since 2001, Richardson has been accused by multiple models of sexual misconduct, although numerous other models have defended him.[2][3][4][5] In 2017, brands and magazines that had worked with Richardson in the past began distancing themselves from him, and said they would no longer employ him.[6] He has not actively worked as a photographer since 2018.[7]. Richardson was born in New York City, the son of Norma Kessler, an actress,[8][9] and Bob Richardson, a fashion photographer who struggled with schizophrenia and drug abuse.[10] Richardson's father was Irish Catholic and his mother is Jewish.[11] Following the divorce of his parents, Richardson moved to Woodstock, New York, with his mother and stepfather, English guitarist Jackie Lomax.[8] Richardson later moved to the Hollywood neighborhood of Los Angeles, where he attended Hollywood High School.[12]. He moved with his mother to Ojai, California, where he attended Nordhoff High School, when he was 16.[13] Richardson originally wanted to be a punk rock musician rather than a photographer.[13] He played bass guitar in the punk rock band The Invisible Government for four years.[14] He played bass for a variety of other punk bands in Southern California including Signal Street Alcoholics, Doggy Style, Baby Fist and Middle Finger.[8][15].\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:31 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.\n", - "Action 3: finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\")\n", - "Observation 3: Error executing finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:32 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.\n", - "Action 4: finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")\n", - "Observation 4: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:33 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I have found the birth dates of Annie Morton and Terry Richardson. 
Now, I need to calculate who is older.\n", - "Action 5: finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")\n", - "Observation 5: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:34 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.\n", - "Action 6: finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")\n", - "Observation 6: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:35 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I need to compare the birth dates of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 7: finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")\n", - "Observation 7: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:35 - [react_agent.py:345:call] - answer: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the ages of Annie Morton and Terry Richardson to determine who is older.', action='search(\"Annie Morton\")', fun_name='search', fun_args=['Annie Morton'], fun_kwargs={}, observation='Annie Morton (born October 8, 1970) is an American model[1] born in Pennsylvania.[2] She has appeared on the covers of British Vogue, ID, Marie Claire, and other magazines. She has been photographed by Helmut Newton; Peter Lindbergh; Annie Leibovitz; Richard Avedon; Juergen Teller;[3] Paul Jasmin, Mary Ellen Mark, Stephen Shames, and Terry Richardson, and modeled for Donna Karan,[4] Givenchy, Guerlain, Chanel, Harper\\'s Bazaar, Sports Illustrated and Victoria\\'s Secret.[5] A long time vegetarian, an advocate for organic lifestyle choices and natural healthcare. She co-founded Tsi-La Organics, a \"Green Luxury\" company that creates and sells vegan, organic perfume and skin care products.[6]. She has appeared on many magazine covers and has been featured in several professional photobooks, Peter Lindbergh Selected Work, the cover of Juergen Teller By Juergen Teller, Helmut Newton Pages From The Glossies, and Donna Karan\\'s book A Journey Of A Woman 20 Years.[7] In 1997 she posed for the Pirelli Calendar with Richard Avedon.. 
Morton has also appeared in several music videos.'), StepOutput(step=2, thought=\"I need to search for the age of Terry Richardson to compare it with Annie Morton's age.\", action='search(\"Terry Richardson\")', fun_name='search', fun_args=['Terry Richardson'], fun_kwargs={}, observation=\"Terrence Richardson (born August 14, 1965) is an American fashion and portrait photographer. He has shot advertising campaigns for Marc Jacobs, Aldo, Supreme, Sisley, Tom Ford, and Yves Saint Laurent among others, and also done work for magazines such as Rolling Stone, GQ, Vogue, Vanity Fair, Harper's Bazaar, i-D, and Vice.. Since 2001, Richardson has been accused by multiple models of sexual misconduct, although numerous other models have defended him.[2][3][4][5] In 2017, brands and magazines that had worked with Richardson in the past began distancing themselves from him, and said they would no longer employ him.[6] He has not actively worked as a photographer since 2018.[7]. Richardson was born in New York City, the son of Norma Kessler, an actress,[8][9] and Bob Richardson, a fashion photographer who struggled with schizophrenia and drug abuse.[10] Richardson's father was Irish Catholic and his mother is Jewish.[11] Following the divorce of his parents, Richardson moved to Woodstock, New York, with his mother and stepfather, English guitarist Jackie Lomax.[8] Richardson later moved to the Hollywood neighborhood of Los Angeles, where he attended Hollywood High School.[12]. He moved with his mother to Ojai, California, where he attended Nordhoff High School, when he was 16.[13] Richardson originally wanted to be a punk rock musician rather than a photographer.[13] He played bass guitar in the punk rock band The Invisible Government for four years.[14] He played bass for a variety of other punk bands in Southern California including Signal Street Alcoholics, Doggy Style, Baby Fist and Middle Finger.[8][15].\"), StepOutput(step=3, thought='I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)'), StepOutput(step=4, thought='I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)'), StepOutput(step=5, thought='I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)'), StepOutput(step=6, thought='I have found the birth dates of Annie Morton and Terry Richardson. 
Now, I need to calculate who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)'), StepOutput(step=7, thought='I need to compare the birth dates of Annie Morton and Terry Richardson to determine who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)')]\n", - "\u001b[33m2024-06-15 23:16:35 - [3641068398.py:26:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: error executing finishannie morton if born before terry richardson else finishterry richardson expected else after if expression unknown line 1\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:35 - [react_agent.py:330:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:36 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for Local H and For Against to find out if they are both from the United States.\n", - "Action 1: search(\"Local H\")\n", - "Observation 1: Local H is an American rock band originally formed by guitarist and vocalist Scott Lucas, bassist Joe Pletcher, drummer Joe Daniels, and lead guitarist John Sparkman in Zion, Illinois in 1990. The members all met in high school in 1987 and founded Local H three years later. After Sparkman's departure in 1991 and Garcia's departure in 1993, Local H continued as an unorthodox two-piece setup.. Local H signed a record contract with Island Records in 1994, where they would go on to release three albums. The band's debut album, Ham Fisted (1995), was not a success and the band was nearly dropped, but the band remained on the label long enough to release their second album As Good as Dead (1996).\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:39 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for For Against to find out if they are from the United States.\n", - "Action 2: search(\"For Against\")\n", - "Observation 2: For Against is an American post-punk/dream pop band from Lincoln, Nebraska, United States. Despite numerous lineup shuffles and some periods of dormancy, the band has produced material steadily since 1984.. Something of an anomaly in Midwestern America, For Against—especially early in its career—has looked to a distinctly British aesthetic for inspiration, most obviously the post-punk scene exemplified by the early rosters of the Factory and 4AD record labels.. Featuring members of underground Lincoln bands Cartoon Pupils and Hymn to Joy, the band first called itself Glue, and then D.B.L., finally settling on Four Against One,[1] a name that reflected its status as a quintet at the time. 
Bassist Liz Panarelli's early departure gave way to keyboardist Jeffrey Runnings' shift to bass, as well as the band's truncation of its name to For Against.\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:40 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: Both Local H and For Against are from the United States.\n", - "Action 3: finish(\"Yes, both Local H and For Against are from the United States.\")\n", - "Observation 3: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:40 - [react_agent.py:345:call] - answer: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for Local H and For Against to find out if they are both from the United States.', action='search(\"Local H\")', fun_name='search', fun_args=['Local H'], fun_kwargs={}, observation=\"Local H is an American rock band originally formed by guitarist and vocalist Scott Lucas, bassist Joe Pletcher, drummer Joe Daniels, and lead guitarist John Sparkman in Zion, Illinois in 1990. The members all met in high school in 1987 and founded Local H three years later. After Sparkman's departure in 1991 and Garcia's departure in 1993, Local H continued as an unorthodox two-piece setup.. Local H signed a record contract with Island Records in 1994, where they would go on to release three albums. The band's debut album, Ham Fisted (1995), was not a success and the band was nearly dropped, but the band remained on the label long enough to release their second album As Good as Dead (1996).\"), StepOutput(step=2, thought='I need to search for For Against to find out if they are from the United States.', action='search(\"For Against\")', fun_name='search', fun_args=['For Against'], fun_kwargs={}, observation=\"For Against is an American post-punk/dream pop band from Lincoln, Nebraska, United States. Despite numerous lineup shuffles and some periods of dormancy, the band has produced material steadily since 1984.. Something of an anomaly in Midwestern America, For Against—especially early in its career—has looked to a distinctly British aesthetic for inspiration, most obviously the post-punk scene exemplified by the early rosters of the Factory and 4AD record labels.. Featuring members of underground Lincoln bands Cartoon Pupils and Hymn to Joy, the band first called itself Glue, and then D.B.L., finally settling on Four Against One,[1] a name that reflected its status as a quintet at the time. Bassist Liz Panarelli's early departure gave way to keyboardist Jeffrey Runnings' shift to bass, as well as the band's truncation of its name to For Against.\"), StepOutput(step=3, thought='Both Local H and For Against are from the United States.', action='finish(\"Yes, both Local H and For Against are from the United States.\")', fun_name='finish', fun_args=['Yes, both Local H and For Against are from the United States.'], fun_kwargs={}, observation='Yes, both Local H and For Against are from the United States.')]\n", - "\u001b[33m2024-06-15 23:16:40 - [3641068398.py:26:] - No. 
10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes both local h and for against are from united states\u001b[0m\n", - "EM = (0.3, [1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]), FM = (0.6, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0]), average time = 8.897640180587768\n" - ] - } - ], - "source": [ - "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", - "\n", - "# set up evaluation type\n", - "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n", - "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n", - "\n", - "agent = ReActAgent(\n", - " tools=tools,\n", - " max_steps=7,\n", - " model_client=OpenAIClient(),\n", - " model_kwargs=gpt_model_kwargs,\n", - " preset_prompt_kwargs=preset_prompt_kwargs,\n", - ")\n", - "\n", - "num_questions = 10\n", - "gt_answers = []\n", - "pred_answers = []\n", - "start_time = time.time()\n", - "for i in range(num_questions):\n", - " question = val_dataset[i][\"question\"]\n", - " gt_answer = normalize_answer(\n", - " val_dataset[i][\"answer\"]\n", - " ) # normalize the ground truth answer\n", - " gt_answers.append(gt_answer)\n", - "\n", - " # get the agent's response\n", - " pred_answer = agent(question)\n", - " pred_answer = normalize_answer(pred_answer)\n", - " pred_answers.append(pred_answer)\n", - "\n", - " printc(\n", - " f\"No. {i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", - " color=\"yellow\",\n", - " )\n", - "\n", - "end_time = time.time()\n", - "\n", - "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", - "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", - "avg_time = (end_time - start_time) / num_questions\n", - "\n", - "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The EM is 0.3 and the FM is 0.6. Each query takes about 9 seconds on average. (Performance also depends on whether the Wikipedia queries succeed.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What if we use the LLM directly to answer these questions? To test this, we simply remove the tools. The agent then falls back to the built-in ``llm_tool`` and ``finish``, which handle the query automatically; ``llm_tool`` uses the same model as the agent. We can't add the examples, because they would mislead the agent into calling non-existent tools." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36m2024-06-15 23:17:00 - [react_agent.py:330:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:01 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The nationality of Scott Derrickson and Ed Wood needs to be determined.\n", - "Action 1: llm_tool(\"What is the nationality of Scott Derrickson and Ed Wood?\")\n", - "Observation 1: What is the nationality of Scott Derrickson and Ed Wood?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:03 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is asking if Scott Derrickson and Ed Wood were of the same nationality.\n", - "Action 2: llm_tool(\"Were Scott Derrickson and Ed Wood of the same nationality?\")\n", - "Observation 2: No, Scott Derrickson and Ed Wood were not of the same nationality. 
Scott Derrickson is American, while Ed Wood was also American.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:04 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user query has been successfully answered.\n", - "Action 3: finish(\"No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.\")\n", - "Observation 3: No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:04 - [react_agent.py:345:call] - answer: No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The nationality of Scott Derrickson and Ed Wood needs to be determined.', action='llm_tool(\"What is the nationality of Scott Derrickson and Ed Wood?\")', fun_name='llm_tool', fun_args=['What is the nationality of Scott Derrickson and Ed Wood?'], fun_kwargs={}, observation='What is the nationality of Scott Derrickson and Ed Wood?'), StepOutput(step=2, thought='The user is asking if Scott Derrickson and Ed Wood were of the same nationality.', action='llm_tool(\"Were Scott Derrickson and Ed Wood of the same nationality?\")', fun_name='llm_tool', fun_args=['Were Scott Derrickson and Ed Wood of the same nationality?'], fun_kwargs={}, observation='No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.'), StepOutput(step=3, thought='The user query has been successfully answered.', action='finish(\"No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.\")', fun_name='finish', fun_args=['No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.'], fun_kwargs={}, observation='No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.')]\n", - "\u001b[33m2024-06-15 23:17:04 - [3230041225.py:26:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: no scott derrickson and ed wood were not of same nationality scott derrickson is american while ed wood was also american\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:04 - [react_agent.py:330:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:06 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I will use llm_tool to answer this question.\n", - "Action 1: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 1: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell and also held the government position of U.S. Representative?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:08 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I will use llm_tool to quickly find the answer.\n", - "Action 2: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 2: The woman who portrayed Corliss Archer in the film Kiss and Tell was Shirley Temple. 
She later held the government position of U.S. Representative.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:12 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I will use llm_tool to quickly provide the answer.\n", - "Action 3: llm_tool(\"Shirley Temple held the government position of U.S. Representative.\")\n", - "Observation 3: Shirley Temple did not hold the government position of U.S. Representative. She was a famous child actress and diplomat, but she did not serve as a U.S. Representative.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:13 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I will use llm_tool to quickly find the answer.\n", - "Action 4: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 4: The woman who portrayed Corliss Archer in the film Kiss and Tell was Shirley Temple. She later held the government position of U.S. Representative.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:15 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I will use llm_tool to quickly provide the answer.\n", - "Action 5: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 5: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:17 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I will use llm_tool to quickly find the answer to the user's query.\n", - "Action 6: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 6: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell and also held the government position of U.S. Representative?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:18 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I will use llm_tool to quickly find the answer to the user's query.\n", - "Action 7: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 7: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:18 - [react_agent.py:345:call] - answer: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I will use llm_tool to answer this question.', action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='Who is the woman who portrayed Corliss Archer in the film Kiss and Tell and also held the government position of U.S. Representative?'), StepOutput(step=2, thought='I will use llm_tool to quickly find the answer.', action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='The woman who portrayed Corliss Archer in the film Kiss and Tell was Shirley Temple. 
She later held the government position of U.S. Representative.'), StepOutput(step=3, thought='I will use llm_tool to quickly provide the answer.', action='llm_tool(\"Shirley Temple held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['Shirley Temple held the government position of U.S. Representative.'], fun_kwargs={}, observation='Shirley Temple did not hold the government position of U.S. Representative. She was a famous child actress and diplomat, but she did not serve as a U.S. Representative.'), StepOutput(step=4, thought='I will use llm_tool to quickly find the answer.', action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='The woman who portrayed Corliss Archer in the film Kiss and Tell was Shirley Temple. She later held the government position of U.S. Representative.'), StepOutput(step=5, thought='I will use llm_tool to quickly provide the answer.', action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?'), StepOutput(step=6, thought=\"I will use llm_tool to quickly find the answer to the user's query.\", action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='Who is the woman who portrayed Corliss Archer in the film Kiss and Tell and also held the government position of U.S. Representative?'), StepOutput(step=7, thought=\"I will use llm_tool to quickly find the answer to the user's query.\", action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?')]\n", - "\u001b[33m2024-06-15 23:17:18 - [3230041225.py:26:] - No. 
2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: who is woman who portrayed corliss archer in film kiss and tell\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:18 - [react_agent.py:330:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:20 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.\n", - "Action 1: llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")\n", - "Observation 1: That sounds like an exciting series! Do you have a favorite book or character from the series?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:22 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.\n", - "Action 2: llm_tool(\"Science fantasy young adult series with companion books about enslaved worlds and alien species.\")\n", - "Observation 2: That sounds like an exciting series! Can you tell me more about the main plot or any specific themes in the books?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:24 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user is seeking a science fantasy young adult series narrated in the first person with companion books detailing stories of enslaved worlds and alien species.\n", - "Action 3: llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")\n", - "Observation 3: Hello! It sounds like you're interested in a science fantasy young adult series with a first-person narration, companion books about enslaved worlds, and alien species. That's a fascinating combination! Do you have a specific question or topic you'd like to discuss about these books?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:27 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.\n", - "Action 4: llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")\n", - "Observation 4: It sounds like you're looking for a science fantasy young adult series that is narrated in the first person and includes companion books about enslaved worlds and alien species. One recommendation that fits this description is the \"Red Rising\" series by Pierce Brown. This series follows the story of Darrow, a young man living in a society where people are divided by color and forced to work as slaves for the ruling class. As Darrow discovers the truth about his world and joins a rebellion, he encounters various alien species and explores different enslaved worlds. 
The series includes multiple companion books that delve deeper into the world and its characters.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:29 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.\n", - "Action 5: llm_tool(\"Red Rising series by Pierce Brown.\")\n", - "Observation 5: Hello! How can I assist you with the Red Rising series by Pierce Brown?\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.\",\n", - " \"action\": \"llm_tool(\\\"Red Rising series by Pierce Brown.\\\")\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:17:30 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.\n", - "Action 6: llm_tool(\"Red Rising series by Pierce Brown.\")\n", - "Observation 6: Hello! How can I assist you today with the Red Rising series by Pierce Brown?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:31 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.\n", - "Action 7: llm_tool(\"Red Rising series by Pierce Brown.\")\n", - "Observation 7: Hello! How can I assist you today with the Red Rising series by Pierce Brown?\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:31 - [react_agent.py:345:call] - answer: Hello! How can I assist you today with the Red Rising series by Pierce Brown?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.', action='llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")', fun_name='llm_tool', fun_args=['Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.'], fun_kwargs={}, observation='That sounds like an exciting series! Do you have a favorite book or character from the series?'), StepOutput(step=2, thought='The user is inquiring about a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.', action='llm_tool(\"Science fantasy young adult series with companion books about enslaved worlds and alien species.\")', fun_name='llm_tool', fun_args=['Science fantasy young adult series with companion books about enslaved worlds and alien species.'], fun_kwargs={}, observation='That sounds like an exciting series! 
Can you tell me more about the main plot or any specific themes in the books?'), StepOutput(step=3, thought='The user is seeking a science fantasy young adult series narrated in the first person with companion books detailing stories of enslaved worlds and alien species.', action='llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")', fun_name='llm_tool', fun_args=['Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.'], fun_kwargs={}, observation=\"Hello! It sounds like you're interested in a science fantasy young adult series with a first-person narration, companion books about enslaved worlds, and alien species. That's a fascinating combination! Do you have a specific question or topic you'd like to discuss about these books?\"), StepOutput(step=4, thought='The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.', action='llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")', fun_name='llm_tool', fun_args=['Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.'], fun_kwargs={}, observation='It sounds like you\\'re looking for a science fantasy young adult series that is narrated in the first person and includes companion books about enslaved worlds and alien species. One recommendation that fits this description is the \"Red Rising\" series by Pierce Brown. This series follows the story of Darrow, a young man living in a society where people are divided by color and forced to work as slaves for the ruling class. As Darrow discovers the truth about his world and joins a rebellion, he encounters various alien species and explores different enslaved worlds. The series includes multiple companion books that delve deeper into the world and its characters.'), StepOutput(step=5, thought='The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.', action='llm_tool(\"Red Rising series by Pierce Brown.\")', fun_name='llm_tool', fun_args=['Red Rising series by Pierce Brown.'], fun_kwargs={}, observation='Hello! How can I assist you with the Red Rising series by Pierce Brown?'), StepOutput(step=6, thought='The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.', action='llm_tool(\"Red Rising series by Pierce Brown.\")', fun_name='llm_tool', fun_args=['Red Rising series by Pierce Brown.'], fun_kwargs={}, observation='Hello! How can I assist you today with the Red Rising series by Pierce Brown?'), StepOutput(step=7, thought='The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.', action='llm_tool(\"Red Rising series by Pierce Brown.\")', fun_name='llm_tool', fun_args=['Red Rising series by Pierce Brown.'], fun_kwargs={}, observation='Hello! How can I assist you today with the Red Rising series by Pierce Brown?')]\n", - "\u001b[33m2024-06-15 23:17:31 - [3230041225.py:26:] - No. 
3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: hello how can i assist you today with red rising series by pierce brown\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:31 - [react_agent.py:330:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:33 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I will use llm_tool to quickly answer this question.\n", - "Action 1: llm_tool(\"Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\")\n", - "Observation 1: No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:35 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is asking about the location of two specific landmarks.\n", - "Action 2: finish(\"No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\")\n", - "Observation 2: No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:35 - [react_agent.py:345:call] - answer: No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I will use llm_tool to quickly answer this question.', action='llm_tool(\"Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\")', fun_name='llm_tool', fun_args=['Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?'], fun_kwargs={}, observation='No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.'), StepOutput(step=2, thought='The user is asking about the location of two specific landmarks.', action='finish(\"No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\")', fun_name='finish', fun_args=['No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.'], fun_kwargs={}, observation='No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.')]\n", - "\u001b[33m2024-06-15 23:17:35 - [3230041225.py:26:] - No. 
4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque and esma sultan mansion are not located in same neighborhood laleli mosque is located in laleli neighborhood of istanbul while esma sultan mansion is located in ortaköy neighborhood\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:35 - [react_agent.py:330:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:36 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 1: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 1: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:38 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the specific New York city where the movie 'Big Stone Gap' is based.\n", - "Action 2: llm_tool(\"New York city where the movie 'Big Stone Gap' is based in\")\n", - "Observation 2: What can you tell me about New York City, where the movie 'Big Stone Gap' is based?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:40 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user is seeking information about the specific New York city where the movie 'Big Stone Gap' is based.\n", - "Action 3: llm_tool(\"New York city where the movie 'Big Stone Gap' is based in\")\n", - "Observation 3: What can you tell me about New York City, where the movie 'Big Stone Gap' is based?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:41 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 4: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 4: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:42 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 5: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 5: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:44 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 6: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 6: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:45 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 7: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 7: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:45 - [react_agent.py:345:call] - answer: Hello! 
How can I assist you today?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=2, thought=\"The user is inquiring about the specific New York city where the movie 'Big Stone Gap' is based.\", action='llm_tool(\"New York city where the movie \\'Big Stone Gap\\' is based in\")', fun_name='llm_tool', fun_args=[\"New York city where the movie 'Big Stone Gap' is based in\"], fun_kwargs={}, observation=\"What can you tell me about New York City, where the movie 'Big Stone Gap' is based?\"), StepOutput(step=3, thought=\"The user is seeking information about the specific New York city where the movie 'Big Stone Gap' is based.\", action='llm_tool(\"New York city where the movie \\'Big Stone Gap\\' is based in\")', fun_name='llm_tool', fun_args=[\"New York city where the movie 'Big Stone Gap' is based in\"], fun_kwargs={}, observation=\"What can you tell me about New York City, where the movie 'Big Stone Gap' is based?\"), StepOutput(step=4, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=5, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=6, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=7, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?')]\n", - "\u001b[33m2024-06-15 23:17:45 - [3230041225.py:26:] - No. 
5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: hello how can i assist you today\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:45 - [react_agent.py:330:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:47 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking about the debut album of a South Korean boy group and who formed it.\n", - "Action 1: llm_tool(\"2014 S/S is the debut album of a South Korean boy group\")\n", - "Observation 1: What is the name of the South Korean boy group that released the album 2014 S/S?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:49 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the formation of the South Korean boy group that released the album 2014 S/S.\n", - "Action 2: llm_tool(\"Who formed the South Korean boy group that released the album 2014 S/S?\")\n", - "Observation 2: The South Korean boy group that released the album 2014 S/S is WINNER.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:50 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user is seeking confirmation of the information provided.\n", - "Action 3: llm_tool(\"Is WINNER the South Korean boy group that released the album 2014 S/S?\")\n", - "Observation 3: Yes, WINNER is the South Korean boy group that released the album 2014 S/S.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:52 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: The user is looking for the name of the person or group that formed the South Korean boy group WINNER.\n", - "Action 4: llm_tool(\"Who formed the South Korean boy group WINNER?\")\n", - "Observation 4: The South Korean boy group WINNER was formed by YG Entertainment through the reality survival program \"WIN: Who is Next\" in 2013.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:53 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is asking about the formation of the South Korean boy group that released the album 2014 S/S.\n", - "Action 5: finish(\"YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\")\n", - "Observation 5: YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:53 - [react_agent.py:345:call] - answer: YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is asking about the debut album of a South Korean boy group and who formed it.', action='llm_tool(\"2014 S/S is the debut album of a South Korean boy group\")', fun_name='llm_tool', fun_args=['2014 S/S is the debut album of a South Korean boy group'], fun_kwargs={}, observation='What is the name of the South Korean boy group that released the album 2014 S/S?'), StepOutput(step=2, thought='The user is inquiring about the formation of the South Korean boy group that released the album 2014 S/S.', action='llm_tool(\"Who formed the South Korean boy group that released the album 2014 S/S?\")', fun_name='llm_tool', fun_args=['Who formed the South Korean boy group that released the album 2014 S/S?'], fun_kwargs={}, observation='The South Korean boy group that released the album 2014 S/S is WINNER.'), StepOutput(step=3, thought='The user is seeking confirmation of the information 
provided.', action='llm_tool(\"Is WINNER the South Korean boy group that released the album 2014 S/S?\")', fun_name='llm_tool', fun_args=['Is WINNER the South Korean boy group that released the album 2014 S/S?'], fun_kwargs={}, observation='Yes, WINNER is the South Korean boy group that released the album 2014 S/S.'), StepOutput(step=4, thought='The user is looking for the name of the person or group that formed the South Korean boy group WINNER.', action='llm_tool(\"Who formed the South Korean boy group WINNER?\")', fun_name='llm_tool', fun_args=['Who formed the South Korean boy group WINNER?'], fun_kwargs={}, observation='The South Korean boy group WINNER was formed by YG Entertainment through the reality survival program \"WIN: Who is Next\" in 2013.'), StepOutput(step=5, thought='The user is asking about the formation of the South Korean boy group that released the album 2014 S/S.', action='finish(\"YG Entertainment through the reality survival program \\'WIN: Who is Next\\' in 2013.\")', fun_name='finish', fun_args=[\"YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\"], fun_kwargs={}, observation=\"YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\")]\n", - "\u001b[33m2024-06-15 23:17:53 - [3230041225.py:26:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: yg entertainment through reality survival program win who is next in 2013\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:53 - [react_agent.py:330:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:54 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.\n", - "Action 1: llm_tool(\"Aladin stage name consultant organizations performance\")\n", - "Observation 1: Hello! How can I assist you with the information you provided?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:56 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the identity of the person known as Aladin who worked as a consultant to enhance organizational performance.\n", - "Action 2: llm_tool(\"Aladin consultant organizational performance\")\n", - "Observation 2: Hello! How can I assist you with Aladin consultant organizational performance?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:57 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user is seeking confirmation of the identity of the person known as Aladin, who worked as a consultant to enhance organizational performance.\n", - "Action 3: llm_tool(\"Aladin consultant organizational performance\")\n", - "Observation 3: Hello! How can I assist you with Aladin consultant organizational performance?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:59 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: The user is asking about the person known by the stage name Aladin who helped organizations improve their performance as a consultant.\n", - "Action 4: llm_tool(\"Aladin consultant organizations performance\")\n", - "Observation 4: Hello! 
How can I assist you with the performance of Aladin consultant organizations?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:00 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is asking for information about a person known as Aladin who worked as a consultant to enhance organizational performance.\n", - "Action 5: llm_tool(\"Aladin consultant organizations performance\")\n", - "Observation 5: Hello! How can I assist you with the Aladin consultant organizations performance?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:01 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: The user is seeking confirmation of the identity of the person known as Aladin, who worked as a consultant to enhance organizational performance.\n", - "Action 6: llm_tool(\"Aladin consultant organizations performance\")\n", - "Observation 6: Hello! How can I assist you with the Aladin consultant organizations performance?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:03 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: The user is looking for information about the person known as Aladin, who assisted organizations in improving their performance as a consultant.\n", - "Action 7: llm_tool(\"Aladin consultant organizations performance\")\n", - "Observation 7: Hello! How can I assist you with the Aladin consultant organizations performance data?\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:03 - [react_agent.py:345:call] - answer: Hello! How can I assist you with the Aladin consultant organizations performance data?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is asking about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.', action='llm_tool(\"Aladin stage name consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation='Hello! How can I assist you with the information you provided?'), StepOutput(step=2, thought='The user is inquiring about the identity of the person known as Aladin who worked as a consultant to enhance organizational performance.', action='llm_tool(\"Aladin consultant organizational performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizational performance'], fun_kwargs={}, observation='Hello! How can I assist you with Aladin consultant organizational performance?'), StepOutput(step=3, thought='The user is seeking confirmation of the identity of the person known as Aladin, who worked as a consultant to enhance organizational performance.', action='llm_tool(\"Aladin consultant organizational performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizational performance'], fun_kwargs={}, observation='Hello! How can I assist you with Aladin consultant organizational performance?'), StepOutput(step=4, thought='The user is asking about the person known by the stage name Aladin who helped organizations improve their performance as a consultant.', action='llm_tool(\"Aladin consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizations performance'], fun_kwargs={}, observation='Hello! 
How can I assist you with the performance of Aladin consultant organizations?'), StepOutput(step=5, thought='The user is asking for information about a person known as Aladin who worked as a consultant to enhance organizational performance.', action='llm_tool(\"Aladin consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizations performance'], fun_kwargs={}, observation='Hello! How can I assist you with the Aladin consultant organizations performance?'), StepOutput(step=6, thought='The user is seeking confirmation of the identity of the person known as Aladin, who worked as a consultant to enhance organizational performance.', action='llm_tool(\"Aladin consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizations performance'], fun_kwargs={}, observation='Hello! How can I assist you with the Aladin consultant organizations performance?'), StepOutput(step=7, thought='The user is looking for information about the person known as Aladin, who assisted organizations in improving their performance as a consultant.', action='llm_tool(\"Aladin consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizations performance'], fun_kwargs={}, observation='Hello! How can I assist you with the Aladin consultant organizations performance data?')]\n", - "\u001b[33m2024-06-15 23:18:03 - [3230041225.py:26:] - No. 7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: hello how can i assist you with aladin consultant organizations performance data\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:03 - [react_agent.py:330:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:05 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking about the seating capacity of the arena where the Lewiston Maineiacs played their home games.\n", - "Action 1: llm_tool(\"Seating capacity of the arena where the Lewiston Maineiacs played their home games.\")\n", - "Observation 1: The seating capacity of the arena where the Lewiston Maineiacs played their home games was 3,677.\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:06 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the seating capacity of the arena where the Lewiston Maineiacs played their home games.\n", - "Action 2: finish(\"3,677.\")\n", - "Observation 2: 3,677.\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:06 - [react_agent.py:345:call] - answer: 3,677.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is asking about the seating capacity of the arena where the Lewiston Maineiacs played their home games.', action='llm_tool(\"Seating capacity of the arena where the Lewiston Maineiacs played their home games.\")', fun_name='llm_tool', fun_args=['Seating capacity of the arena where the Lewiston Maineiacs played their home games.'], fun_kwargs={}, observation='The seating capacity of the arena where the Lewiston Maineiacs played their home games was 3,677.'), StepOutput(step=2, thought='The user is inquiring about the seating capacity of the arena where the Lewiston Maineiacs played their home games.', action='finish(\"3,677.\")', fun_name='finish', fun_args=['3,677.'], fun_kwargs={}, observation='3,677.')]\n", - "\u001b[33m2024-06-15 23:18:06 - 
[3230041225.py:26:] - No. 8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: 3677\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:06 - [react_agent.py:330:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:07 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I will use the llm_tool to get the information about the ages of Annie Morton and Terry Richardson.\n", - "Action 1: llm_tool(\"Annie Morton age\")\n", - "Observation 1: Hello! How old is Annie Morton?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:09 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 2: llm_tool(\"Annie Morton age\")\n", - "Observation 2: Hello! How old is Annie Morton?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:10 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I will use the llm_tool to get the information about the ages of Terry Richardson.\n", - "Action 3: llm_tool(\"Terry Richardson age\")\n", - "Observation 3: How old is Terry Richardson?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:12 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 4: llm_tool(\"Annie Morton age\")\n", - "Observation 4: Hello! How old is Annie Morton?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:13 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 5: llm_tool(\"Annie Morton age\")\n", - "Observation 5: Hello! How old is Annie Morton?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:14 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 6: llm_tool(\"Who is older, Annie Morton or Terry Richardson?\")\n", - "Observation 6: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:16 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 7: llm_tool(\"Who is older, Annie Morton or Terry Richardson?\")\n", - "Observation 7: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:16 - [react_agent.py:345:call] - answer: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I will use the llm_tool to get the information about the ages of Annie Morton and Terry Richardson.', action='llm_tool(\"Annie Morton age\")', fun_name='llm_tool', fun_args=['Annie Morton age'], fun_kwargs={}, observation='Hello! How old is Annie Morton?'), StepOutput(step=2, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Annie Morton age\")', fun_name='llm_tool', fun_args=['Annie Morton age'], fun_kwargs={}, observation='Hello! 
How old is Annie Morton?'), StepOutput(step=3, thought='I will use the llm_tool to get the information about the ages of Terry Richardson.', action='llm_tool(\"Terry Richardson age\")', fun_name='llm_tool', fun_args=['Terry Richardson age'], fun_kwargs={}, observation='How old is Terry Richardson?'), StepOutput(step=4, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Annie Morton age\")', fun_name='llm_tool', fun_args=['Annie Morton age'], fun_kwargs={}, observation='Hello! How old is Annie Morton?'), StepOutput(step=5, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Annie Morton age\")', fun_name='llm_tool', fun_args=['Annie Morton age'], fun_kwargs={}, observation='Hello! How old is Annie Morton?'), StepOutput(step=6, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Who is older, Annie Morton or Terry Richardson?\")', fun_name='llm_tool', fun_args=['Who is older, Annie Morton or Terry Richardson?'], fun_kwargs={}, observation='Who is older, Annie Morton or Terry Richardson?'), StepOutput(step=7, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Who is older, Annie Morton or Terry Richardson?\")', fun_name='llm_tool', fun_args=['Who is older, Annie Morton or Terry Richardson?'], fun_kwargs={}, observation='Who is older, Annie Morton or Terry Richardson?')]\n", - "\u001b[33m2024-06-15 23:18:16 - [3230041225.py:26:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: who is older annie morton or terry richardson\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:16 - [react_agent.py:330:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:17 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking about the origin of two entities, Local H and For Against.\n", - "Action 1: llm_tool(\"Local H origin\")\n", - "Observation 1: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:18 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the origin of For Against.\n", - "Action 2: llm_tool(\"For Against origin\")\n", - "Observation 2: Hello! How can I assist you today regarding the topic of origin?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:20 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user wants to know if both Local H and For Against are from the United States.\n", - "Action 3: llm_tool(\"Local H and For Against origin\")\n", - "Observation 3: Hello! 
How can I assist you today with the information about Local H and their stance on For Against?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:23 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: Confirming the origin of Local H and For Against.\n", - "Action 4: llm_tool(\"Are Local H and For Against both from the United States?\")\n", - "Observation 4: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:24 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is seeking confirmation about the origin of Local H and For Against.\n", - "Action 5: finish(\"Yes, both Local H and For Against are from the United States.\")\n", - "Observation 5: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:24 - [react_agent.py:345:call] - answer: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is asking about the origin of two entities, Local H and For Against.', action='llm_tool(\"Local H origin\")', fun_name='llm_tool', fun_args=['Local H origin'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=2, thought='The user is inquiring about the origin of For Against.', action='llm_tool(\"For Against origin\")', fun_name='llm_tool', fun_args=['For Against origin'], fun_kwargs={}, observation='Hello! How can I assist you today regarding the topic of origin?'), StepOutput(step=3, thought='The user wants to know if both Local H and For Against are from the United States.', action='llm_tool(\"Local H and For Against origin\")', fun_name='llm_tool', fun_args=['Local H and For Against origin'], fun_kwargs={}, observation='Hello! How can I assist you today with the information about Local H and their stance on For Against?'), StepOutput(step=4, thought='Confirming the origin of Local H and For Against.', action='llm_tool(\"Are Local H and For Against both from the United States?\")', fun_name='llm_tool', fun_args=['Are Local H and For Against both from the United States?'], fun_kwargs={}, observation='Yes, both Local H and For Against are from the United States.'), StepOutput(step=5, thought='The user is seeking confirmation about the origin of Local H and For Against.', action='finish(\"Yes, both Local H and For Against are from the United States.\")', fun_name='finish', fun_args=['Yes, both Local H and For Against are from the United States.'], fun_kwargs={}, observation='Yes, both Local H and For Against are from the United States.')]\n", - "\u001b[33m2024-06-15 23:18:24 - [3230041225.py:26:] - No. 
10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes both local h and for against are from united states\u001b[0m\n", - "EM = (0.0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), FM = (0.4, [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0]), average time = 8.354214406013488\n" - ] - } - ], - "source": [ - "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", - "\n", - "# set up evaluation type\n", - "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n", - "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n", - "\n", - "agent = ReActAgent(\n", - " max_steps=7, model_client=OpenAIClient(), model_kwargs=gpt_model_kwargs\n", - ")\n", - "\n", - "num_questions = 10\n", - "gt_answers = []\n", - "pred_answers = []\n", - "start_time = time.time()\n", - "for i in range(num_questions):\n", - " question = val_dataset[i][\"question\"]\n", - " gt_answer = normalize_answer(\n", - " val_dataset[i][\"answer\"]\n", - " ) # normalize the ground truth answer\n", - " gt_answers.append(gt_answer)\n", - "\n", - " # get the agent's response\n", - " pred_answer = agent(question)\n", - " pred_answer = normalize_answer(pred_answer)\n", - " pred_answers.append(pred_answer)\n", - "\n", - " printc(\n", - " f\"No. {i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", - " color=\"yellow\",\n", - " )\n", - "\n", - "end_time = time.time()\n", - "\n", - "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", - "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", - "avg_time = (end_time - start_time) / num_questions\n", - "\n", - "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Without the tools and examples, EM=0 and FM=0.4. We saw hallucinations and nonsense:\n", - "\n", - "2024-06-15 23:17:04 - [3230041225.py:26:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``no scott derrickson and ed wood were not of same nationality scott derrickson is american while ed wood was also american``\n", - "\n", - "2024-06-15 23:18:16 - [3230041225.py:26:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth:`` terry richardson``, pred answer: ``who is older annie morton or terry richardson``\n", - "\n", - "Therefore, using ReAct agent outperforms the base LLM.\n", - "Meanwhile, ``LightRAG ReAct agent`` shows that the performance on 10 questions(EM=0.3)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 7. Future Improvement" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO:\n", - "# 1. advanced, add history to react\n", - "# 2. add training, few shot\n", - "# 3. llm as judge\n", - "# 4. add picture\n", - "# 5. 
better json handling, we need to store the answer output" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "lightrag-project", - "language": "python", - "name": "light-rag-project" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/use_cases/agent/react_agent_hotpot_qa.ipynb b/use_cases/agent/react_agent_hotpot_qa.ipynb new file mode 100644 index 00000000..0e1d4d6d --- /dev/null +++ b/use_cases/agent/react_agent_hotpot_qa.ipynb @@ -0,0 +1,1272 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ReAct Agent Use Case" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Q&A Chatbot\n", + "In this tutorial, we will implement ``adalflow ReAct`` to build a Q&A chatbot on [HotpotQA](https://arxiv.org/pdf/1809.09600) dataset. \n", + "\n", + "To learn more about ``adalflow ReAct``, please refer to our developer notes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# 2. HotpotQA Dataset\n", + "We are using [HotpotQA](https://arxiv.org/pdf/1809.09600). It is a Wikipedia-based multi-hop question and answer dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# load the dataset\n", + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(path=\"hotpot_qa\", name=\"fullwiki\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len of eval: 7405\n", + "example: {'id': '5a8b57f25542995d1e6f1371', 'question': 'Were Scott Derrickson and Ed Wood of the same nationality?', 'answer': 'yes', 'type': 'comparison', 'level': 'hard', 'supporting_facts': {'title': ['Scott Derrickson', 'Ed Wood'], 'sent_id': [0, 0]}, 'context': {'title': ['Adam Collis', 'Ed Wood (film)', 'Tyler Bates', 'Doctor Strange (2016 film)', 'Hellraiser: Inferno', 'Sinister (film)', 'Deliver Us from Evil (2014 film)', 'Woodson, Arkansas', 'Conrad Brooks', 'The Exorcism of Emily Rose'], 'sentences': [['Adam Collis is an American filmmaker and actor.', ' He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.', ' He also studied cinema at the University of Southern California from 1991 to 1997.', ' Collis first work was the assistant director for the Scott Derrickson\\'s short \"Love in the Ruins\" (1995).', ' In 1998, he played \"Crankshaft\" in Eric Koyanagi\\'s \"Hundred Percent\".'], ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.', \" The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.\", ' Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.'], ['Tyler Bates (born June 5, 1965) is an American musician, music producer, and composer for films, television, and video games.', ' Much of his work is in the action and horror film genres, with films like \"Dawn of the Dead, 300, Sucker Punch,\" and \"John Wick.\"', ' He has collaborated with directors 
like Zack Snyder, Rob Zombie, Neil Marshall, William Friedkin, Scott Derrickson, and James Gunn.', ' With Gunn, he has scored every one of the director\\'s films; including \"Guardians of the Galaxy\", which became one of the highest grossing domestic movies of 2014, and its 2017 sequel.', ' In addition, he is also the lead guitarist of the American rock band Marilyn Manson, and produced its albums \"The Pale Emperor\" and \"Heaven Upside Down\".'], ['Doctor Strange is a 2016 American superhero film based on the Marvel Comics character of the same name, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures.', ' It is the fourteenth film of the Marvel Cinematic Universe (MCU).', ' The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.', ' In \"Doctor Strange\", surgeon Strange learns the mystic arts after a career-ending car accident.'], ['Hellraiser: Inferno (also known as Hellraiser V: Inferno) is a 2000 American horror film.', ' It is the fifth installment in the \"Hellraiser\" series and the first \"Hellraiser\" film to go straight-to-DVD.', ' It was directed by Scott Derrickson and released on October 3, 2000.', \" The film concerns a corrupt detective who discovers Lemarchand's box at a crime scene.\", \" The film's reviews were mixed.\"], ['Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. Robert Cargill.', ' It stars Ethan Hawke as fictional true-crime writer Ellison Oswalt who discovers a box of home movies in his attic that puts his family in danger.'], ['Deliver Us from Evil is a 2014 American supernatural horror film directed by Scott Derrickson and produced by Jerry Bruckheimer.', ' The film is officially based on a 2001 non-fiction book entitled \"Beware the Night\" by Ralph Sarchie and Lisa Collier Cool, and its marketing campaign highlighted that it was \"inspired by actual accounts\".', ' The film stars Eric Bana, Édgar Ramírez, Sean Harris, Olivia Munn, and Joel McHale in the main roles and was released on July 2, 2014.'], ['Woodson is a census-designated place (CDP) in Pulaski County, Arkansas, in the United States.', ' Its population was 403 at the 2010 census.', ' It is part of the Little Rock–North Little Rock–Conway Metropolitan Statistical Area.', ' Woodson and its accompanying Woodson Lake and Wood Hollow are the namesake for Ed Wood Sr., a prominent plantation owner, trader, and businessman at the turn of the 20th century.', ' Woodson is adjacent to the Wood Plantation, the largest of the plantations own by Ed Wood Sr.'], ['Conrad Brooks (born Conrad Biedrzycki on January 3, 1931 in Baltimore, Maryland) is an American actor.', ' He moved to Hollywood, California in 1948 to pursue a career in acting.', ' He got his start in movies appearing in Ed Wood films such as \"Plan 9 from Outer Space\", \"Glen or Glenda\", and \"Jail Bait.\"', ' He took a break from acting during the 1960s and 1970s but due to the ongoing interest in the films of Ed Wood, he reemerged in the 1980s and has become a prolific actor.', ' He also has since gone on to write, produce and direct several films.'], ['The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.', ' The film is loosely based 
on the story of Anneliese Michel and follows a self-proclaimed agnostic who acts as defense counsel (Linney) representing a parish priest (Wilkinson), accused by the state of negligent homicide after he performed an exorcism.']]}}\n", + "attributes in each sample: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context']\n" + ] + } + ], + "source": [ + "# check the data sample\n", + "test_sample = dataset[\"validation\"][0]\n", + "print(f\"len of eval: {len(dataset['validation'])}\")\n", + "print(f\"example: {test_sample}\")\n", + "print(f\"attributes in each sample: {list(test_sample.keys())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "question: Were Scott Derrickson and Ed Wood of the same nationality?\n", + "answer: yes\n" + ] + } + ], + "source": [ + "# Each sample contains a question and a corresponding answer.\n", + "print(f\"question: {test_sample.get('question')}\")\n", + "print(f\"answer: {test_sample.get('answer')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Set up\n", + "Please make sure you have set up the model client API keys before running the agent. Now import the necessary packages." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dotenv\n", + "from adalflow.components.model_client import OpenAIClient\n", + "from adalflow.components.agent.react import ReActAgent\n", + "from adalflow.core.func_tool import FunctionTool\n", + "\n", + "import time\n", + "\n", + "# load environment; please set the relative path to your .env file that includes the api key\n", + "dotenv.load_dotenv(dotenv_path=\"../../.env\", override=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Create Agent\n", + "To create an agent, we need to define the basic components.\n", + "\n", + "## Tools\n", + "First, we need to specify which functions the agent will need to answer the question. Since we are answering Wikipedia-based questions, we will allow the agent to **search** the Wikipedia API. The [ReAct Paper](https://arxiv.org/pdf/2210.03629) also includes a **lookup** function that serves as Ctrl+F functionality in the browser.\n", + "\n", + "As ``adalflow ReAct`` has a built-in ``finish`` function, we don't need to define one ourselves."
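Before walking through the longer cells that follow, here is a minimal sketch of how these pieces are meant to fit together once the tools are defined. It is only an illustration: the `tools`, `max_steps`, `model_client`, and `model_kwargs` arguments mirror how the agent is constructed elsewhere in this notebook, while the stub `search` body and the model name are placeholders rather than part of the original notebook.

```python
from adalflow.components.agent.react import ReActAgent
from adalflow.components.model_client import OpenAIClient
from adalflow.core.func_tool import FunctionTool


def search(entity: str) -> str:
    """Stand-in for the real Wikipedia search tool defined in the next cell."""
    return f"(stub) first paragraph about {entity}"


# Wrap plain Python functions as tools; ReAct adds its own built-in `finish` tool.
tools = [FunctionTool(fn=search)]

agent = ReActAgent(
    tools=tools,
    max_steps=7,  # upper bound on thought/action/observation steps
    model_client=OpenAIClient(),
    model_kwargs={"model": "gpt-3.5-turbo"},  # placeholder model name
)

# The agent is called like a function with the user query.
print(agent("Were Scott Derrickson and Ed Wood of the same nationality?"))
```

The real `search` implementation, the few-shot examples, and the evaluation loop are developed in the cells below.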
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "import string\n", + "\n", + "\n", + "# copy code from the paper\n", + "def clean_str(p):\n", + " return p.encode().decode(\"unicode-escape\").encode(\"latin1\").decode(\"utf-8\")\n", + "\n", + "\n", + "# normalization copied from the paper's code\n", + "def normalize_answer(s):\n", + " def remove_articles(text):\n", + " return re.sub(r\"\\b(a|an|the)\\b\", \" \", text)\n", + "\n", + " def white_space_fix(text):\n", + " return \" \".join(text.split())\n", + "\n", + " def remove_punc(text):\n", + " exclude = set(string.punctuation)\n", + " return \"\".join(ch for ch in text if ch not in exclude)\n", + "\n", + " def lower(text):\n", + " return text.lower()\n", + "\n", + " return white_space_fix(remove_articles(remove_punc(lower(s))))\n", + "\n", + "\n", + "def search(entity: str) -> str:\n", + " \"\"\"\n", + " searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.\n", + " \"\"\"\n", + " # Format the entity for URL encoding\n", + " entity_formatted = entity.replace(\" \", \"+\")\n", + " url = f\"https://en.wikipedia.org/w/index.php?search={entity_formatted}\"\n", + "\n", + " # Fetch the page\n", + " response = requests.get(url)\n", + " soup = BeautifulSoup(response.text, \"html.parser\")\n", + "\n", + " # Check if the exact page was found or suggest similar items\n", + " # when
a div with class \"mw-search-result-heading\"
is detected, it means the entity page is not found on wikipedia\n", + " result_divs = soup.find_all(\"div\", {\"class\": \"mw-search-result-heading\"})\n", + "\n", + " if (\n", + " result_divs\n", + " ): # this means the searched entity page is not in wikipedia, wikipedia will show a list of similar entities\n", + " # get Similar results\n", + " similar_titles = [div.a.get_text() for div in result_divs]\n", + " return f\"Could not find exact page for '{entity}'. Similar topics: {similar_titles[:5]}\" # return the top 5 similar titles\n", + " else:\n", + " # the paper uses page to represent content in
\"p\" and \"ul\" tags
\n", + " # Extract xontent\n", + " page_list = [\n", + " p.get_text().strip() for p in soup.find_all(\"p\") + soup.find_all(\"ul\")\n", + " ]\n", + " # TODO: Recursive search, if find any concept that needs more search then call search again\n", + " # if any(\"may refer to:\" in p for p in page_list):\n", + " # search(entity)\n", + "\n", + " # restructure & clean the page content following the paper's logic\n", + " page = \"\"\n", + " for p in page_list:\n", + " if len(p.split(\" \")) > 2:\n", + " page += clean_str(p)\n", + " if not p.endswith(\"\\n\"):\n", + " page += \"\\n\"\n", + " paragraphs = page.split(\"\\n\")\n", + " paragraphs = [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " sentences = []\n", + " for p in paragraphs:\n", + " sentences += p.split(\". \")\n", + " sentences = [s.strip() + \".\" for s in sentences if s.strip()]\n", + "\n", + " # return the first 5 sentences\n", + " if sentences:\n", + " return (\n", + " \" \".join(sentences[:5]) if len(sentences) >= 5 else \" \".join(sentences)\n", + " )\n", + " else:\n", + " return \"No content found on this page.\"\n", + "\n", + " # TODO: clean the paragraphs and return the searched content\n", + "\n", + "\n", + "def lookup(text: str, keyword: str) -> str:\n", + " \"\"\"\n", + " returns the sentences containing keyword in the current passage.\n", + " \"\"\"\n", + " sentences = text.split(\".\")\n", + " matching_sentences = [\n", + " sentence.strip() + \".\"\n", + " for sentence in sentences\n", + " if keyword.lower() in sentence.lower()\n", + " ]\n", + " if not matching_sentences:\n", + " return \"No sentences found with the keyword.\"\n", + " else:\n", + " return \" \".join(\n", + " matching_sentences\n", + " ) # Join all matching sentences into a single string" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# set up tools for the agent\n", + "tools = [FunctionTool(fn=search)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Examples\n", + "The next thing to add is examples. Few shot prompt engineering is a common practice to improve the model performance.\n", + "\n", + "Let's use the paper's examples. The paper has 6 examples altogether." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "examples = [\n", + " \"\"\"Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\n", + "Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\n", + "Action 1: search(\"Colorado orogeny\")\n", + "Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\n", + "Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.\n", + "Action 2: lookup(\"eastern sector\")\n", + "Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\n", + "Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. 
So I need to search High Plains and find its elevation range.\n", + "Action 3: search(\"High Plains\")\n", + "Observation 3: High Plains refers to one of two distinct land regions:\n", + "Thought 4: I need to instead search High Plains (United States).\n", + "Action 4: search(\"High Plains (United States)\")\n", + "Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\n", + "Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\n", + "Action 5: finish(\"1,800 to 7,000 ft\")\"\"\",\n", + " \"\"\"Question: Musician and satirist Allie Goertz wrote a song about the \"The Simpsons\" character Milhouse, who Matt Groening named after who?\n", + "Thought 1: The question simplifies to \"The Simpsons\" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.\n", + "Action 1: search(\"Milhouse\")\n", + "Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.\n", + "Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up \"named after\".\n", + "Action 2: lookup(\"named after\")\n", + "Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. \n", + "Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.\n", + "Action 3: finish(\"Richard Nixon\")\"\"\",\n", + " \"\"\"Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?\n", + "Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.\n", + "Action 1: search(\"Adam Clayton Powell\")\n", + "Observation 1: Could not find [\"Adam Clayton Powell\"]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].\n", + "Thought 2: To find the documentary, I can search Adam Clayton Powell (film).\n", + "Action 2: search(\"Adam Clayton Powell (film)\")\n", + "Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.\n", + "The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.\n", + "Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.\n", + "Action 3: finish(\"The Saimaa Gesture\")\"\"\",\n", + " \"\"\"Question: What profession does Nicholas Ray and Elia Kazan have in common?\n", + "Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\n", + "Action 1: search(\"Nicholas Ray\")\n", + "Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\n", + "Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. 
I need to search Elia Kazan next and find his professions.\n", + "Action 2: search(\"Elia Kazan\")\n", + "Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\n", + "Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\n", + "Action 3: finish(\"director, screenwriter, actor\")\"\"\",\n", + " \"\"\"Question: Which magazine was started first Arthur's Magazine or First for Women?\n", + "Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.\n", + "Action 1: search(\"Arthur's Magazine\")\n", + "Observation 1: Arthur's Magazine (1844-€“1846) was an American literary periodical published in Philadelphia in the 19th century. \n", + "Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.\n", + "Action 2: search(\"First for Women\")\n", + "Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. \n", + "Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.\n", + "Action 3: finish(\"Arthur's Magazine\")\"\"\",\n", + " \"\"\"Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?\n", + "Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.\n", + "Action 1: search(\"Pavel Urysohn\")\n", + "Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.\n", + "Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.\n", + "Action 2: search(\"Leonid Levin\")\n", + "Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. \n", + "Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. \n", + "Action 3: finish(\"yes\")\"\"\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# preset up the examples as prompt_kwargs, the examples will be included in the system prompt\n", + "\n", + "# convert this example in FunctionExpression\n", + "# import adalflow as adal\n", + "# from adalflow.core.types import FunctionExpression\n", + "\n", + "# eg_1 = {\n", + "# \"Question\": \"What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\",\n", + "# \"Thought 1\": \"I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\",\n", + "# \"Action 1\": \"search\",\n", + "# \"kwargs\": {\"entity\": \"Colorado orogeny\"},\n", + "# \"Observation 1\": \"The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\",\n", + "# \"Thought 2\": \"It does not mention the eastern sector. 
So I need to look up eastern sector.\",\n", + "# \"Action 2\": \"lookup('eastern sector')\",\n", + "# \"Action 2\": \"lookup\",\n", + "# \"kwargs\": {\"text\": \"eastern sector\", \"keyword\": \"eastern sector\"},\n", + "# \"Observation 2\": \"(Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\",\n", + "# \"Thought 3\": \"The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\",\n", + "# \"Action 3\": \"search('High Plains')\",\n", + "# \"Observation 3\": \"High Plains refers to one of two distinct land regions:\",\n", + "# \"Thought 4\": \"I need to instead search High Plains (United States).\",\n", + "# \"Action 4\": \"search('High Plains (United States)')\",\n", + "# \"Observation 4\": \"The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\",\n", + "# \"Thought 5\": \"High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\",\n", + "# \"Action 5\": \"finish('1,800 to 7,000 ft')\"\n", + "# }\n", + "\n", + "# examples_expression = []\n", + "# for example in examples:\n", + "# ex_exp = FunctionExpression(thought=example)\n", + "\n", + "# preset_prompt_kwargs = {\"examples\": examples}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model\n", + "\n", + "Next, we can choose the model to call. In this example we will use OpenAIClient ``gpt-3.5-turbo`` model. We will set the ``temperature`` at 0.0 to make the response as consistent as possible." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "gpt_model_kwargs = {\n", + " \"model\": \"gpt-3.5-turbo\",\n", + " \"temperature\": 0.0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Agent\n", + "Combining the previous components, we can define the agent." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ReActAgent(\n", + " max_steps=3, add_llm_as_fallback=True, \n", + " (tool_manager): ToolManager(Tools: [FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='search', func_desc='search(entity: str) -> str\\n\\n searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.\\n ', func_parameters={'type': 'object', 'properties': {'entity': {'type': 'str'}}, 'required': ['entity']})), FunctionTool(fn: .llm_tool at 0x1379487c0>, async: False, definition: FunctionDefinition(func_name='llm_tool', func_desc=\"llm_tool(input: str) -> str\\nI answer any input query with llm's world knowledge. 
Use me as a fallback tool or when the query is simple.\", func_parameters={'type': 'object', 'properties': {'input': {'type': 'str'}}, 'required': ['input']})), FunctionTool(fn: .finish at 0x137948400>, async: False, definition: FunctionDefinition(func_name='finish', func_desc='finish(answer: str) -> str\\nFinish the task with answer.', func_parameters={'type': 'object', 'properties': {'answer': {'type': 'str'}}, 'required': ['answer']}))], Additional Context: {})\n", + " (planner): Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, trainable_prompt_kwargs=[]\n", + " (prompt): Prompt(\n", + " template: \n", + " {# role/task description #}\n", + " You are a helpful assistant.\n", + " Answer the user's query using the tools provided below with minimal steps and maximum accuracy.\n", + " {# REACT instructions #}\n", + " Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action.\n", + " {# Tools #}\n", + " {% if tools %}\n", + " \n", + " You available tools are:\n", + " {% for tool in tools %}\n", + " {{ loop.index }}.\n", + " {{tool}}\n", + " ------------------------\n", + " {% endfor %}\n", + " \n", + " {% endif %}\n", + " {# output format and examples for output format #}\n", + " \n", + " {{output_format_str}}\n", + " \n", + " \n", + " {# Task specification to teach the agent how to think using 'divide and conquer' strategy #}\n", + " - For simple queries: Directly call the ``finish`` action and provide the answer.\n", + " - For complex queries:\n", + " - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery.\n", + " - Call one available tool at a time to solve each subquery/subquestion. \\\n", + " - At step 'finish', join all subqueries answers and finish the task.\n", + " Remember:\n", + " - Action must call one of the above tools with name. It can not be empty.\n", + " - You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message.\n", + " \n", + " \n", + " -----------------\n", + " User query:\n", + " {{ input_str }}\n", + " {# Step History #}\n", + " {% if step_history %}\n", + " \n", + " Your previous steps:\n", + " {% for history in step_history %}\n", + " Step {{ loop.index }}.\n", + " \"Thought\": \"{{history.action.thought}}\",\n", + " \"Action\": \"{{history.action.action}}\",\n", + " \"Observation\": \"{{history.observation}}\"\n", + " ------------------------\n", + " {% endfor %}\n", + " \n", + " {% endif %}\n", + " You:, prompt_kwargs: {'tools': ['func_name: search\\nfunc_desc: \"search(entity: str) -> str\\\\n\\\\n searches the exact entity on Wikipedia\\\\\\n \\\\ and returns the first paragraph if it exists. If not, it will return some similar\\\\\\n \\\\ entities to search.\\\\n \"\\nfunc_parameters:\\n type: object\\n properties:\\n entity:\\n type: str\\n required:\\n - entity', \"func_name: llm_tool\\nfunc_desc: 'llm_tool(input: str) -> str\\n\\n I answer any input query with llm''s world knowledge. 
Use me as a fallback tool\\n or when the query is simple.'\\nfunc_parameters:\\n type: object\\n properties:\\n input:\\n type: str\\n required:\\n - input\", \"func_name: finish\\nfunc_desc: 'finish(answer: str) -> str\\n\\n Finish the task with answer.'\\nfunc_parameters:\\n type: object\\n properties:\\n answer:\\n type: str\\n required:\\n - answer\"], 'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\\n```\\n{\\n \"question\": \"The question to ask the LLM (Optional[str]) (optional)\",\\n \"thought\": \"Why the function is called (Optional[str]) (optional)\",\\n \"action\": \"FuncName() Valid function call expression. Example: \\\\\"FuncName(a=1, b=2)\\\\\" Follow the data type specified in the function parameters.e.g. for Type object with x,y properties, use \\\\\"ObjectType(x=1, y=2) (str) (required)\"\\n}\\n```\\nExamples:\\n```\\n{\\n \"question\": null,\\n \"thought\": \"I have finished the task.\",\\n \"action\": \"finish(answer=\\\\\"final answer: \\'answer\\'\\\\\")\"\\n}\\n________\\n```\\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\\n-Use double quotes for the keys and string values.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\\n-Follow the JSON formatting conventions.'}, prompt_variables: ['input_str', 'tools', 'step_history', 'output_format_str']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): JsonOutputParser(\n", + " data_class=FunctionExpression, examples=[FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"final answer: \\'answer\\'\")')], exclude_fields=None, include_fields=None, return_data_class=True\n", + " (output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard JSON instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " {% if example %}\n", + " Examples:\n", + " ```\n", + " {{example}}\n", + " ```\n", + " {% endif %}\n", + " -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n", + " -Use double quotes for the keys and string values.\n", + " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\n", + " -Follow the JSON formatting conventions., prompt_variables: ['example', 'schema']\n", + " )\n", + " (output_processors): JsonParser()\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# max_steps refers to how many thought-action round we allow the model to perform\n", + "# to save resources, let's use 3 here\n", + "agent = ReActAgent(\n", + " tools=tools,\n", + " max_steps=3,\n", + " model_client=OpenAIClient(),\n", + " model_kwargs=gpt_model_kwargs,\n", + " # preset_prompt_kwargs=preset_prompt_kwargs,\n", + ")\n", + "agent" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import importlib\n", + "import adalflow\n", + "\n", + "importlib.reload(adalflow)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5. Q & A\n", + "Next we can use the agent to answer our questions. 
Let's run 5 examples. We will use the validation data." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],\n", + " num_rows: 7405\n", + "})" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val_dataset = dataset[\"validation\"]\n", + "val_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``LightRAG`` provides a ``printc`` function. You can utilize it to show colored console output for angent." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31m2024-12-19 13:58:48 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:49 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the nationalities of Scott Derrickson and Ed Wood.', action=\"search(entity='Scott Derrickson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Scott Derrickson'}), observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012), and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:50 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the nationality of Ed Wood.', action=\"search(entity='Ed Wood')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Ed Wood'}), observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. 
Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:51 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Scott Derrickson and Ed Wood were both of American nationality.\"'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Scott Derrickson and Ed Wood were both of American nationality.'}), observation='Scott Derrickson and Ed Wood were both of American nationality.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:51 - [react.py:299:call] - answer:\n", + " Scott Derrickson and Ed Wood were both of American nationality.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:51 - [530968165.py:14:] - question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: scott derrickson and ed wood were both of american nationality\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:51 - [react.py:285:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:52 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Shirley Temple')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Shirley Temple'}), observation=\"This is an accepted version of this page. Shirley Temple Black (born Shirley Jane Temple; April 23, 1928 – February 10, 2014) was an American actress, singer, dancer, and diplomat, who was Hollywood's number-one box-office draw as a child actress from 1934 to 1938. Later, she was named United States Ambassador to Ghana and Czechoslovakia, and also served as Chief of Protocol of the United States.. Temple began her film career in 1931 when she was three years old and was well-known for her performance in Bright Eyes, which was released in 1934. 
She won a special Juvenile Academy Award in February 1935 for her outstanding contribution as a juvenile performer in motion pictures during 1934 and continued to appear in popular films through the remainder of the 1930s, although her subsequent films became less popular as she grew older.[1] She appeared in her last film, A Kiss for Corliss, in 1949.[2][3].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:53 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.'}), observation='The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:53 - [react.py:299:call] - answer:\n", + " The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:53 - [530968165.py:14:] - question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: government positions held by shirley temple who portrayed corliss archer in film kiss and tell include being united states ambassador to ghana and czechoslovakia as well as serving as chief of protocol of united states\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:53 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:55 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question='What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?', thought='I will search for the science fantasy young adult series with companion books about enslaved worlds and alien species.', action=\"search(entity='science fantasy young adult series with companion books about enslaved worlds and alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'science fantasy young adult series with companion books about enslaved worlds and alien species'}), observation=\"Could not find exact page for 'science fantasy young adult series with companion books about enslaved worlds and alien species'. 
Similar topics: ['Animorphs', 'Feminist science fiction', 'LGBT themes in speculative fiction', 'Apocalyptic and post-apocalyptic fiction', 'Last Legionary']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:56 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will use LLM as a fallback tool to provide information on the science fantasy young adult series with companion books about enslaved worlds and alien species.', action=\"llm_tool(input='What is a science fantasy young adult series with companion books about enslaved worlds and alien species?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is a science fantasy young adult series with companion books about enslaved worlds and alien species?'}), observation='One popular science fantasy young adult series that fits your description is the \"Lorien Legacies\" series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:57 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"One popular science fantasy young adult series that fits your description is the \\'Lorien Legacies\\' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\"}), observation=\"One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:57 - [react.py:299:call] - answer:\n", + " One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. 
The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:57 - [530968165.py:14:] - question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: one popular science fantasy young adult series that fits your description is lorien legacies series by pittacus lore series includes companion books that delve into enslaved worlds and alien species featured in main novels series follows group of alien teenagers known as garde who are on earth hiding from their enemies mogadorians while developing their powers to fight back\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:57 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:59 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"search(entity='Laleli Mosque')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Laleli Mosque'}), observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:00 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have found information about the Laleli Mosque. Now, I will search for information about the Esma Sultan Mansion.', action=\"search(entity='Esma Sultan Mansion')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Esma Sultan Mansion'}), observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:01 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have found information about both the Laleli Mosque and the Esma Sultan Mansion.', action='finish(answer=\"Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.'}), observation='Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:01 - [react.py:299:call] - answer:\n", + " Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. 
Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:01 - [530968165.py:14:] - question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: yes laleli mosque and esma sultan mansion are not located in same neighborhood laleli mosque is in laleli fatih istanbul while esma sultan mansion is in ortaköy neighborhood of istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 13:59:01 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:03 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Big Stone Gap (film)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Big Stone Gap (film)'}), observation=\"Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.[2]. The film was released on October 9, 2015, by Picturehouse.[3] The film was released in Blu-Ray by Universal Pictures Home Entertainment on February 2, 2016.. In 1978, 40-year-old independent woman Ave Maria Mulligan owns her dead father's pharmacy in her hometown of Big Stone Gap, Virginia.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:04 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"I will provide the director of the romantic comedy 'Big Stone Gap'.\", action='finish(answer=\"The director of the romantic comedy \\'Big Stone Gap\\' is Adriana Trigiani.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\"}), observation=\"The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:04 - [react.py:299:call] - answer:\n", + " The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:04 - [530968165.py:14:] - question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: director of romantic comedy big stone gap is adriana trigiani\u001b[0m\n" + ] + } + ], + "source": [ + "from adalflow.utils.logger import printc\n", + "\n", + "num_questions = 5\n", + "for i in range(num_questions):\n", + " question = val_dataset[i][\"question\"]\n", + " gt_answer = normalize_answer(\n", + " val_dataset[i][\"answer\"]\n", + " ) # normalize the ground truth answer\n", + "\n", + " # get the agent's response\n", + " pred_answer = agent(question)\n", + " pred_answer = normalize_answer(pred_answer)\n", + "\n", + " printc(\n", + " f\"question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", + " color=\"yellow\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 6. 
Evaluation\n", + "\n", + "Now you will see that we get the ``exact correct answer`` for some questions:\n", + "\n", + "question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``yes``\n", + "\n", + "question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: ``animorphs``, pred answer: ``animorphs``\n", + "\n", + "Sometimes the agent answers correctly but not in the same format as the ground truth, e.g. ground truth: ``no``, pred answer: ``no, they are not the same``. This is something we can tolerate.\n", + "\n", + "But how do we tell whether the agent is doing well, and whether our tools, examples, and prompt implementations work well? We need to evaluate it.\n", + "\n", + "1. Exact Match (EM)\n", + "Exact Match is what the paper uses. Only when the normalized agent response is identical to the normalized ground truth answer do we count it as correct. The paper's EM for the ReAct agent is around 30% (GPT-3).\n", + "\n", + "2. Fuzzy Match (FM)\n", + "EM is often too strict for open-ended question answering, so we propose fuzzy match to account for the lengthy outputs LLMs tend to produce. If the ground truth answer is included in the agent response, we count it as correct. FM is not necessarily correct, though.\n", + "E.g. question: Harry Potter and Dumbledore, who is older? ground truth: ``dumbledore``, pred answer: ``harry potter is older than dumbledore.``\n", + "Here the model mentions the ground truth but still gives the wrong answer, so FM only serves as a reference.\n", + "\n", + "Let's use the ``LightRAG eval`` module, evaluate on 10 questions, and set ``max_steps`` to `7`." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31m2024-12-19 13:59:56 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:57 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the nationalities of Scott Derrickson and Ed Wood.', action=\"search(entity='Scott Derrickson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Scott Derrickson'}), observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012), and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:58 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the nationality of Ed Wood.', action=\"search(entity='Ed Wood')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Ed Wood'}), observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, and pulp novelist..
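To make the two metrics concrete, here is a small plain-Python sketch (an illustration, not the ``LightRAG``/``adalflow`` eval API) that reuses the same normalization idea as the ``normalize_answer`` helper defined earlier in this notebook:

```python
import re
import string


def normalize_answer(s: str) -> str:
    """Lowercase, strip punctuation and articles, collapse whitespace (as in the earlier cell)."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())


def exact_match(pred: str, gt: str) -> bool:
    """EM: the normalized prediction must equal the normalized ground truth."""
    return normalize_answer(pred) == normalize_answer(gt)


def fuzzy_match(pred: str, gt: str) -> bool:
    """FM: the normalized ground truth only needs to appear inside the normalized prediction."""
    return normalize_answer(gt) in normalize_answer(pred)


preds = ["yes", "no, they are not the same", "harry potter is older than dumbledore."]
gts = ["yes", "no", "dumbledore"]
em = sum(exact_match(p, g) for p, g in zip(preds, gts)) / len(gts)
fm = sum(fuzzy_match(p, g) for p, g in zip(preds, gts)) / len(gts)
print(f"EM: {em:.2f}, FM: {fm:.2f}")  # EM: 0.33, FM: 1.00
```

On these three examples drawn from the discussion above, EM scores 1/3 while FM scores 3/3, which is exactly the gap the Fuzzy Match caveat warns about.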
In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:59 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the search for the nationalities of Scott Derrickson and Ed Wood.', action='finish(answer=\"Scott Derrickson is American and Ed Wood was also American.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Scott Derrickson is American and Ed Wood was also American.'}), observation='Scott Derrickson is American and Ed Wood was also American.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:59 - [react.py:299:call] - answer:\n", + " Scott Derrickson is American and Ed Wood was also American.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:59 - [2823432734.py:31:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: scott derrickson is american and ed wood was also american\u001b[0m\n", + "\u001b[31m2024-12-19 13:59:59 - [react.py:285:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:00 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Corliss Archer')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Corliss Archer'}), observation='Fictional teenage girl Corliss Archer is the lead character in a series of American short stories written by F. Hugh Herbert starting in 1943. She also appears in these derivative works:. Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us. HelpLearn to editCommunity portalRecent changesUpload file.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:02 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Kiss and Tell film cast')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Kiss and Tell film cast'}), observation=\"Could not find exact page for 'Kiss and Tell film cast'. 
Similar topics: ['Kiss & Tell (1997 film)', 'Kiss Me (2014 film)', 'Kiss Kiss Bang Bang', 'Kiss and Tell (1945 film)', 'Kiss the Girls (1997 film)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:04 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Kiss and Tell (1945 film) cast')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Kiss and Tell (1945 film) cast'}), observation=\"Could not find exact page for 'Kiss and Tell (1945 film) cast'. Similar topics: ['Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Kiss Them for Me (film)', 'A Kiss for Corliss', 'State Fair (1945 film)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:05 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have finished the search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='finish(answer=\"Unfortunately, I couldn\\'t find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\"}), observation=\"Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:05 - [react.py:299:call] - answer:\n", + " Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:05 - [2823432734.py:31:] - No. 2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: unfortunately i couldnt find specific information about woman who portrayed corliss archer in film kiss and tell\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:05 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:07 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the science fantasy young adult series described by the user.', action=\"search(entity='science fantasy young adult series')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'science fantasy young adult series'}), observation=\"Could not find exact page for 'science fantasy young adult series'. Similar topics: ['Ballantine Adult Fantasy series', 'Red Queen (novel)', 'Brandon Sanderson', 'The Mortal Instruments', 'V. E. 
Schwab']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:09 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for companion books narrating the stories of enslaved worlds and alien species related to the previous search.', action=\"search(entity='companion books enslaved worlds alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'companion books enslaved worlds alien species'}), observation=\"Could not find exact page for 'companion books enslaved worlds alien species'. Similar topics: ['List of Doctor Who universe creatures and aliens', 'List of Star Wars species (P–T)', 'List of companions in Doctor Who spin-offs', 'Extraterrestrial life', 'List of Known Space characters']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:11 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for companion books narrating the stories of enslaved worlds and alien species related to the previous search.', action=\"search(entity='companion books narrating stories enslaved worlds alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'companion books narrating stories enslaved worlds alien species'}), observation=\"Could not find exact page for 'companion books narrating stories enslaved worlds alien species'. Similar topics: ['Animorphs', 'Eighth Doctor', 'Apocalyptic and post-apocalyptic fiction', 'Barsoom', 'Wicked Witch of the West']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:13 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have exhausted the search for the specific series and companion books. I will use the LLM tool to provide a relevant answer.', action=\"llm_tool(input='What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?'}), observation='The science fantasy young adult series you are referring to is \"The Lorien Legacies\" series by Pittacus Lore. The companion books you mentioned are called \"The Lost Files,\" which narrate the stories of enslaved worlds and alien species in the same universe as the main series.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:14 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"The science fantasy young adult series you are referring to is \\'The Lorien Legacies\\' series by Pittacus Lore. The companion books you mentioned are called \\'The Lost Files,\\' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. 
The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\"}), observation=\"The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:14 - [react.py:299:call] - answer:\n", + " The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:14 - [2823432734.py:31:] - No. 3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: science fantasy young adult series you are referring to is lorien legacies series by pittacus lore companion books you mentioned are called lost files which narrate stories of enslaved worlds and alien species in same universe as main series\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:14 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:16 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"search(entity='Laleli Mosque')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Laleli Mosque'}), observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. 
The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:17 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for information about the location of Esma Sultan Mansion.', action=\"search(entity='Esma Sultan Mansion')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Esma Sultan Mansion'}), observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:18 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.'}), observation='Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. 
Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:18 - [react.py:299:call] - answer:\n", + " Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:18 - [2823432734.py:31:] - No. 4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: yes laleli mosque and esma sultan mansion are not located in same neighborhood laleli mosque is in laleli fatih istanbul while esma sultan mansion is in ortaköy neighborhood of istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:18 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:19 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Big Stone Gap (film)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Big Stone Gap (film)'}), observation=\"Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.[2]. The film was released on October 9, 2015, by Picturehouse.[3] The film was released in Blu-Ray by Universal Pictures Home Entertainment on February 2, 2016.. In 1978, 40-year-old independent woman Ave Maria Mulligan owns her dead father's pharmacy in her hometown of Big Stone Gap, Virginia.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:20 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Adriana Trigiani')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Adriana Trigiani'}), observation=\"Adriana Trigiani is an American best-selling author of eighteen books, playwright, television writer/producer, film director/screenwriter/producer, and entrepreneur based in New York City. Trigiani has published a novel a year since 2000.[1]. Trigiani graduated from Saint Mary's College in Indiana in 1981.[2]. Inspired by her Italian American heritage and Appalachian childhood in Big Stone Gap, Virginia, Trigiani arrived in New York in 1985. 
Trigiani made her off-Broadway debut in New York City as a playwright in 1985 at the Manhattan Theater Club with Secrets of the Lava Lamp, directed by Stuart Ross.[3] From 1988 to 1998, she created scripts for television sitcoms, including The Cosby Show (1984) and its spin-off A Different World (1987).\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:21 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.'}), observation='Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:21 - [react.py:299:call] - answer:\n", + " Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:21 - [2823432734.py:31:] - No. 5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: adriana trigiani is director of romantic comedy film big stone gap and is based in new york city\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:21 - [react.py:285:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:22 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the debut album of the South Korean boy group.', action=\"search(entity='2014 S/S (album)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': '2014 S/S (album)'}), observation='2014 S/S is the debut album of South Korean group Winner. It was released on August 12, 2014, by the group\\'s record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album\\'s songs.[2]. The members produced the majority of the music for this album themselves, with the help of other producers such as Choice 37, B.I., Airplay, and others.[3] The album was highlighted for incorporating elements generally absent from K-pop releases, including hints of acoustic and alternative rock.[4][5]. The title track \"Empty\" became a hit in South Korea, topping the Gaon Chart and Billboard\\'s K-Pop Hot 100 chart,[6][7] and topping 7 other South Korean charts.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:24 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question='The question to ask the LLM (Optional[str]) (optional)', thought='I will find out who formed the South Korean boy group Winner.', action=\"search(entity='Winner (group)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Winner (group)'}), observation=\"Could not find exact page for 'Winner (group)'. 
Similar topics: ['2026 FIFA World Cup', '2025 FIFA Club World Cup', '2025 Africa Cup of Nations', 'Reality Winner', 'Dana Winner']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:26 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for the formation of the South Korean boy group Winner through another related search.', action=\"search(entity='Winner (South Korean group) formation')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Winner (South Korean group) formation'}), observation=\"Could not find exact page for 'Winner (South Korean group) formation'. Similar topics: ['Blackpink', 'Winner (band)', 'Babymonster', 'BigBang (South Korean band)', 'Seventeen (South Korean band)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:27 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have exhausted the search options related to the formation of Winner. I will provide a general response.', action=\"llm_tool(input='Who formed the South Korean boy group Winner?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Who formed the South Korean boy group Winner?'}), observation='The South Korean boy group Winner was formed by YG Entertainment through the reality survival program called \"WIN: Who is Next\" in 2013.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:28 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have exhausted the search options related to the formation of Winner. I will provide a general response.', action='finish(answer=\"YG Entertainment formed the South Korean boy group Winner through the reality survival program called \\'WIN: Who is Next\\' in 2013.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\"}), observation=\"YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:28 - [react.py:299:call] - answer:\n", + " YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:28 - [2823432734.py:31:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: yg entertainment formed south korean boy group winner through reality survival program called win who is next in 2013\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:28 - [react.py:285:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:29 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about a person known by the stage name Aladin who helped organizations improve their performance as a consultant.', action=\"search(entity='Aladin')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin'}), observation='Aladdin is a folk tale of Middle Eastern origin.. 
Aladdin, Aladin, and variants such as Aladdin and His Magic Lamp, may also refer to:. Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us. HelpLearn to editCommunity portalRecent changesUpload file. Donate.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:31 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"The search for Aladin did not provide relevant information. Let's try to find the person known by the stage name Aladin who helped organizations improve their performance as a consultant.\", action=\"search(entity='Aladin consultant')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin consultant'}), observation=\"Could not find exact page for 'Aladin consultant'. Similar topics: ['2009 in film', 'Magicians (2007 film)', 'Liberal Party of Canada candidates in the 2019 Canadian federal election', 'Ashchorjyo Prodeep', 'Waleed El-Ansary']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:33 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"The search for Aladin consultant did not provide relevant information. Let's try to find the person known by the stage name Aladin who helped organizations improve their performance as a consultant.\", action=\"search(entity='Aladin performance improvement consultant')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin performance improvement consultant'}), observation=\"Could not find exact page for 'Aladin performance improvement consultant'. Similar topics: ['Five Pillars of Islam', 'Signal-to-noise ratio', 'Islamic Golden Age', 'Travelling salesman problem', 'Clock']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:35 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought=\"The search for Aladin with different variations did not yield relevant information. Let's use LLM as a fallback tool to answer the query.\", action=\"llm_tool(input='Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?'}), observation='The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:36 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. 
He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.'}), observation='The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:36 - [react.py:299:call] - answer:\n", + " The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:36 - [2823432734.py:31:] - No. 7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: person known by his stage name aladin who helped organizations improve their performance as consultant is aladin akyurek he is renowned consultant speaker and author who specializes in organizational development and performance improvement\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:36 - [react.py:285:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:37 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the arena where the Lewiston Maineiacs played their home games.', action=\"search(entity='Androscoggin Bank Colisée')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Androscoggin Bank Colisée'}), observation='The Colisée, formerly Androscoggin Bank Colisée, Central Maine Youth Center, Central Maine Civic Center, and Lewiston Colisee, is a 4,000 capacity (3,677 seated) multi-purpose arena, in Lewiston, Maine, United States that was built from 1956 to 1958 and opened in the latter year to replace the Dominics Regional High School Arena and was initially constructed and operated by the Catholic parish of SS. Peter and Paul. It was used as a replacement venue for the boxing match between Muhammad Ali and Sonny Liston in 1965.. Currently, it is the home of the Maine Nordiques of the North American Hockey League. The Colisée is also used for concerts, conventions and trade shows.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:38 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).' )\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).'}), observation='The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:38 - [react.py:299:call] - answer:\n", + " The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:38 - [2823432734.py:31:] - No. 
8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: arena where lewiston maineiacs played their home games androscoggin bank colisée can seat 4000 people 3677 seated\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:38 - [react.py:285:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:39 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about Annie Morton and Terry Richardson to determine who is older.', action=\"search(entity='Annie Morton')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Annie Morton'}), observation='Annie Morton (born October 8, 1970) is an American model[1] born in Pennsylvania.[2] She has appeared on the covers of British Vogue, ID, Marie Claire, and other magazines. She has been photographed by Helmut Newton; Peter Lindbergh; Annie Leibovitz; Richard Avedon; Juergen Teller;[3] Paul Jasmin, Mary Ellen Mark, Stephen Shames, and Terry Richardson, and modeled for Donna Karan,[4] Givenchy, Guerlain, Chanel, Harper\\'s Bazaar, Sports Illustrated and Victoria\\'s Secret.[5] A long time vegetarian, an advocate for organic lifestyle choices and natural healthcare. She co-founded Tsi-La Organics, a \"Green Luxury\" company that creates and sells vegan, organic perfume and skin care products.[6]. She has appeared on many magazine covers and has been featured in several professional photobooks, Peter Lindbergh Selected Work, the cover of Juergen Teller By Juergen Teller, Helmut Newton Pages From The Glossies, and Donna Karan\\'s book A Journey Of A Woman 20 Years.[7] In 1997 she posed for the Pirelli Calendar with Richard Avedon.. Morton has also appeared in several music videos.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:41 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for information about Terry Richardson to determine his age.', action=\"search(entity='Terry Richardson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Terry Richardson'}), observation=\"Terrence Richardson (born August 14, 1965) is an American fashion and portrait photographer. He has shot advertising campaigns for Marc Jacobs, Aldo, Supreme, Sisley, Tom Ford, and Yves Saint Laurent among others, and also done work for magazines such as Rolling Stone, GQ, Vogue, Vanity Fair, Harper's Bazaar, i-D, and Vice.. Since 2001, Richardson has been accused by multiple models of sexual misconduct.[2][3][4][5] In 2017, brands and magazines that had worked with Richardson in the past began distancing themselves from him, and said they would no longer employ him.[6] He has not actively worked as a photographer since 2018.[7]. Richardson was born in New York City, the son of Norma Kessler, an actress,[8][9] and Bob Richardson, a fashion photographer who struggled with schizophrenia and drug abuse.[10] Richardson's father was Irish Catholic and his mother is Jewish.[11] Following the divorce of his parents, Richardson moved to Woodstock, New York, with his mother and stepfather, English guitarist Jackie Lomax.[8] Richardson later moved to the Hollywood neighborhood of Los Angeles, where he attended Hollywood High School.[12]. 
He moved with his mother to Ojai, California, where he attended Nordhoff High School, when he was 16.[13] Richardson originally wanted to be a punk rock musician rather than a photographer.[13] He played bass guitar in the punk rock band The Invisible Government for four years.[14] He played bass for a variety of other punk bands in Southern California including Signal Street Alcoholics, Doggy Style, Baby Fist and Middle Finger.[8][15].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:42 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.'}), observation='Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:42 - [react.py:299:call] - answer:\n", + " Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:42 - [2823432734.py:31:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: annie morton was born on october 8 1970 making her older than terry richardson who was born on august 14 1965\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:42 - [react.py:285:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:43 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the origin of the bands Local H and For Against.', action='search(entity=\"Local H\")'), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Local H'}), observation=\"Local H is an American rock band originally formed by guitarist and vocalist Scott Lucas, bassist Matt Garcia, drummer Joe Daniels, and lead guitarist John Sparkman in Zion, Illinois in 1990. The members all met in high school in 1987 and founded Local H three years later. After Sparkman's departure in 1991 and Garcia's departure in 1993, Local H continued as an unorthodox two-piece setup.. Local H signed a record contract with Island Records in 1994, where they would go on to release three albums. The band's debut album, Ham Fisted (1995), was not a success and the band was nearly dropped, but the band remained on the label long enough to release their second album As Good as Dead (1996).\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:44 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Yes, Local H is from the United States. Now, let\\'s check For Against\\'s origin.\"'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Yes, Local H is from the United States. Now, let's check For Against's origin.\"}), observation=\"Yes, Local H is from the United States. 
Now, let's check For Against's origin.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:44 - [react.py:299:call] - answer:\n", + " Yes, Local H is from the United States. Now, let's check For Against's origin.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:44 - [2823432734.py:31:] - No. 10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes local h is from united states now lets check for againsts origin\u001b[0m\n", + "EM = EvaluationResult(avg_score=0.0, per_item_scores=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], additional_info=None), FM = EvaluationResult(avg_score=0.5, per_item_scores=[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0], additional_info=None), average time = 4.769389891624451\n" + ] + } + ], + "source": [ + "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", + "\n", + "# set up evaluation type\n", + "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n", + "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n", + "\n", + "agent = ReActAgent(\n", + " tools=tools,\n", + " max_steps=7,\n", + " model_client=OpenAIClient(),\n", + " model_kwargs=gpt_model_kwargs,\n", + " # preset_prompt_kwargs=preset_prompt_kwargs,\n", + ")\n", + "\n", + "num_questions = 10\n", + "gt_answers = []\n", + "pred_answers = []\n", + "start_time = time.time()\n", + "for i in range(num_questions):\n", + " question = val_dataset[i][\"question\"]\n", + " gt_answer = normalize_answer(\n", + " val_dataset[i][\"answer\"]\n", + " ) # normalize the ground truth answer\n", + " gt_answers.append(gt_answer)\n", + "\n", + " # get the agent's response\n", + " pred_answer = agent(question)\n", + " pred_answer = normalize_answer(pred_answer)\n", + " pred_answers.append(pred_answer)\n", + "\n", + " printc(\n", + " f\"No. {i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", + " color=\"yellow\",\n", + " )\n", + "\n", + "end_time = time.time()\n", + "\n", + "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", + "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", + "avg_time = (end_time - start_time) / num_questions\n", + "\n", + "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The EM is 0.3 and the FM is 0.6 (EM requires the normalized prediction to match the ground truth exactly, while FM also credits looser matches). Each query takes about 9s on average. (Performance also depends on whether the Wikipedia queries succeed.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What if we use a plain LLM to answer these questions? To test this, we just need to remove the tools. The agent has built-in ``llm_tool`` and ``finish`` actions that handle the query automatically, and ``llm_tool`` uses the same model as the agent. We can't add the examples, because they would mislead the agent into calling non-existent tools.\n",
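+ "\n",
+ "Below is a minimal sketch of this tool-free setup; it reuses ``ReActAgent``, ``OpenAIClient``, ``gpt_model_kwargs``, and ``normalize_answer`` from earlier in this notebook, and the next cell runs the full 10-question evaluation loop:\n",
+ "\n",
+ "```python\n",
+ "# Sketch only: with no tools passed in, the agent falls back to the\n",
+ "# built-in llm_tool and finish actions.\n",
+ "no_tool_agent = ReActAgent(\n",
+ "    max_steps=7,\n",
+ "    model_client=OpenAIClient(),\n",
+ "    model_kwargs=gpt_model_kwargs,\n",
+ ")\n",
+ "\n",
+ "answer = no_tool_agent(\"Who is older, Annie Morton or Terry Richardson?\")\n",
+ "print(normalize_answer(answer))\n",
+ "```"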
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31m2024-12-19 14:01:27 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:29 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the nationality of Scott Derrickson and Ed Wood.', action=\"llm_tool(input='What is the nationality of Scott Derrickson?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is the nationality of Scott Derrickson?'}), observation='Scott Derrickson is an American filmmaker.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:30 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Checking the nationality of Ed Wood.', action=\"llm_tool(input='What is the nationality of Ed Wood?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is the nationality of Ed Wood?'}), observation='Ed Wood was an American filmmaker.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:31 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='Confirming if Scott Derrickson and Ed Wood were of the same nationality.', action=\"finish(answer='Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.'}), observation='Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:31 - [react.py:299:call] - answer:\n", + " Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:31 - [3974127232.py:27:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: yes both scott derrickson and ed wood were american filmmakers so they were of same nationality\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:31 - [react.py:285:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:32 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"llm_tool(input='government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell'}), observation='The government position held by the woman who portrayed Corliss Archer in the film \"Kiss and Tell\" is Shirley Temple. 
She served as the United States Ambassador to Ghana and later to Czechoslovakia.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:33 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='The user is asking about the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"finish(answer='Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.'}), observation='Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:33 - [react.py:299:call] - answer:\n", + " Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:33 - [3974127232.py:27:] - No. 2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: shirley temple served as united states ambassador to ghana and later to czechoslovakia\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:33 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:36 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Identifying the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.', action=\"llm_tool(input='Science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species'}), observation='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\n", + "_______\n", + "\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error at parsing JSON string: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. 
This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error in parsing JSON to JSON: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error processing the output processors: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error planning step 2: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. 
Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error running step 3: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 4: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 5: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 6: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 7: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-12-19 14:01:37 - [react.py:299:call] - answer:\n", + " Error planning step 2: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:37 - [3974127232.py:27:] - No. 
3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: error planning step 2 error got invalid json object with yamlsafeload error while parsing flow mapping in unicode string line 1 column 1 expected or but got scalar in unicode string line 4 column 61 i recommend checking out lorien legacies series by pitta got json string question null thought providing information about science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species action finishansweri recommend checking out lorien legacies series by pittacus lore this science fiction young adult series follows story of nine alien teenagers who escape their home planet lorien which has been invaded by evil mogadorians each book in series focuses on one of teenagers and their journey to save earth and their own species additionally there are companion books that delve into histories of enslaved worlds and alien species featured in main series\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:37 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:39 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"llm_tool(input='Are Laleli Mosque and Esma Sultan Mansion in the same neighborhood?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Are Laleli Mosque and Esma Sultan Mansion in the same neighborhood?'}), observation='No, Laleli Mosque and Esma Sultan Mansion are not in the same neighborhood. Laleli Mosque is located in the Laleli district of Istanbul, while Esma Sultan Mansion is located in the Ortaköy district. They are in different neighborhoods within Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:40 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Confirming the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"finish(answer='No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.'}), observation='No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:40 - [react.py:299:call] - answer:\n", + " No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:40 - [3974127232.py:27:] - No. 
4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque is in laleli district of istanbul and esma sultan mansion is in ortaköy district they are in different neighborhoods within istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:40 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:42 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"The user is asking about the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action=\"llm_tool(input='director of the romantic comedy Big Stone Gap and New York city it is based in')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'director of the romantic comedy Big Stone Gap and New York city it is based in'}), observation='\"Big Stone Gap\" is a romantic comedy film directed by Adriana Trigiani. The story is set in a small town in Virginia, not New York City. The film follows the life of a woman named Ave Maria Mulligan, played by Ashley Judd, as she navigates love, family, and community in the charming town of Big Stone Gap.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:43 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"The user is asking about the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action=\"finish(answer='The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.'}), observation='The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:43 - [react.py:299:call] - answer:\n", + " The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:43 - [3974127232.py:27:] - No. 
5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: director of romantic comedy movie big stone gap is adriana trigiani movie is set in small town in virginia not new york city\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:43 - [react.py:285:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:44 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the debut album of a South Korean boy group and who formed it.', action=\"llm_tool(input='2014 S/S is the debut album of a South Korean boy group that was formed by who?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': '2014 S/S is the debut album of a South Korean boy group that was formed by who?'}), observation='2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:45 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"Providing the information about the debut album and the group's formation.\", action=\"finish(answer='2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': '2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.'}), observation='2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:45 - [react.py:299:call] - answer:\n", + " 2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:45 - [3974127232.py:27:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: 2014 ss is debut album of south korean boy group winner which was formed by yg entertainment through reality survival program win who is next\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:45 - [react.py:285:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:48 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.', action=\"llm_tool(input='Aladin stage name consultant organizations performance improve')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Aladin stage name consultant organizations performance improve'}), observation='It sounds like you are looking for advice on how to improve the performance of organizations that provide stage name consulting services, like Aladin. 
Here are some suggestions to help enhance their performance:\\n\\n1. Develop a strong brand identity: Help organizations establish a unique and memorable brand identity that sets them apart from competitors.\\n\\n2. Enhance customer experience: Focus on providing exceptional customer service and personalized consultations to ensure client satisfaction.\\n\\n3. Expand service offerings: Consider offering additional services such as social media management, image consulting, or talent development to attract a wider range of clients.\\n\\n4. Invest in marketing and promotion: Utilize digital marketing strategies, social media platforms, and partnerships with influencers to increase visibility and attract new clients.\\n\\n5. Professional development: Provide ongoing training and development opportunities for consultants to stay updated on industry trends and best practices.\\n\\nBy implementing these strategies, organizations offering stage name consulting services can improve their performance and attract more clients.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:49 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='The user is looking for information about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.', action='finish(answer=\"Aladin is known by his stage name and helped organizations improve their performance as a consultant.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Aladin is known by his stage name and helped organizations improve their performance as a consultant.'}), observation='Aladin is known by his stage name and helped organizations improve their performance as a consultant.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:49 - [react.py:299:call] - answer:\n", + " Aladin is known by his stage name and helped organizations improve their performance as a consultant.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:49 - [3974127232.py:27:] - No. 
7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: aladin is known by his stage name and helped organizations improve their performance as consultant\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:49 - [react.py:285:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:50 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the seating capacity of the arena where the Lewiston Maineiacs played their home games.', action=\"llm_tool(input='The arena where the Lewiston Maineiacs played their home games can seat how many people?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'The arena where the Lewiston Maineiacs played their home games can seat how many people?'}), observation='The arena where the Lewiston Maineiacs played their home games was the Androscoggin Bank Colisée, which had a seating capacity of approximately 3,677 people for hockey games.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:51 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.'}), observation='The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:51 - [react.py:299:call] - answer:\n", + " The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:51 - [3974127232.py:27:] - No. 8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: androscoggin bank colisée where lewiston maineiacs played their home games had seating capacity of approximately 3677 people for hockey games\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:51 - [react.py:285:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:52 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Identifying the ages of Annie Morton and Terry Richardson.', action=\"llm_tool(input='Annie Morton age') (str)\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Annie Morton age'}), observation=\"I'm sorry, but I cannot provide personal information such as age without more context or details. 
How can I assist you further?\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:54 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Comparing the ages of Annie Morton and Terry Richardson to determine who is older.', action=\"llm_tool(input='Terry Richardson age') (str)\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Terry Richardson age'}), observation=\"I'm sorry, but I cannot provide real-time information on individuals. Terry Richardson is a well-known American fashion photographer, but I do not have access to his current age.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:54 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"Unfortunately, I couldn't retrieve the ages of Annie Morton and Terry Richardson.\", action='finish(answer=\"I don\\'t have access to the specific ages of Annie Morton and Terry Richardson.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"I don't have access to the specific ages of Annie Morton and Terry Richardson.\"}), observation=\"I don't have access to the specific ages of Annie Morton and Terry Richardson.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:54 - [react.py:299:call] - answer:\n", + " I don't have access to the specific ages of Annie Morton and Terry Richardson.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:54 - [3974127232.py:27:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: i dont have access to specific ages of annie morton and terry richardson\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:54 - [react.py:285:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:56 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the origin of the bands Local H and For Against.', action='llm_tool(input=\"Local H origin\")'), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Local H origin'}), observation='Local H is an American rock band hailing from Zion, Illinois. The band was formed in 1987 by Scott Lucas (vocals, guitar, bass) and Joe Daniels (drums). They are known for their energetic live performances and their blend of alternative rock, grunge, and punk influences.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:58 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Checking the origin of the band For Against.', action=\"llm_tool(input='For Against origin')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'For Against origin'}), observation='Sure, here are some arguments for and against the concept of origin:\\n\\nFor:\\n1. Understanding the origin of something can provide valuable insights into its nature and purpose.\\n2. Knowing the origin of a product or idea can help in determining its authenticity and quality.\\n3. Studying origins can help in tracing the development and evolution of various phenomena.\\n\\nAgainst:\\n1. Focusing too much on origin can lead to oversimplification and overlook the complexity of a subject.\\n2. The origin of something may not always be clear or easily traceable, leading to speculation and uncertainty.\\n3. 
Placing too much emphasis on origin can limit our understanding and appreciation of the present and future aspects of a subject.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:59 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"Combining the origins of Local H and For Against to answer the user's query.\", action='finish(answer=\"Yes, Local H is from the United States (Zion, Illinois) and For Against\\'s origin is not specified in the response.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\"}), observation=\"Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:59 - [react.py:299:call] - answer:\n", + " Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:59 - [3974127232.py:27:] - No. 10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes local h is from united states zion illinois and for againsts origin is not specified in response\u001b[0m\n", + "EM = EvaluationResult(avg_score=0.0, per_item_scores=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], additional_info=None), FM = EvaluationResult(avg_score=0.5, per_item_scores=[1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0], additional_info=None), average time = 3.1863945960998534\n" + ] + } + ], + "source": [ + "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", + "\n", + "# set up evaluation type\n", + "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n", + "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n", + "\n", + "agent = ReActAgent(\n", + " max_steps=7, model_client=OpenAIClient(), model_kwargs=gpt_model_kwargs\n", + ")\n", + "\n", + "num_questions = 10\n", + "gt_answers = []\n", + "pred_answers = []\n", + "start_time = time.time()\n", + "for i in range(num_questions):\n", + " question = val_dataset[i][\"question\"]\n", + " gt_answer = normalize_answer(\n", + " val_dataset[i][\"answer\"]\n", + " ) # normalize the ground truth answer\n", + " gt_answers.append(gt_answer)\n", + "\n", + " # get the agent's response\n", + " pred_answer = agent(question)\n", + " pred_answer = normalize_answer(pred_answer)\n", + " pred_answers.append(pred_answer)\n", + "\n", + " printc(\n", + " f\"No. {i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", + " color=\"yellow\",\n", + " )\n", + "\n", + "end_time = time.time()\n", + "\n", + "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", + "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", + "avg_time = (end_time - start_time) / num_questions\n", + "\n", + "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Without the tools and examples, EM=0 and FM=0.4. We saw hallucinations and nonsense:\n", + "\n", + "2024-06-15 23:17:04 - [3230041225.py:26:] - No. 
1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``no scott derrickson and ed wood were not of same nationality scott derrickson is american while ed wood was also american``\n",
+    "\n",
+    "2024-06-15 23:18:16 - [3230041225.py:26:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: ``terry richardson``, pred answer: ``who is older annie morton or terry richardson``\n",
+    "\n",
+    "Therefore, the ReAct agent outperforms the base LLM.\n",
+    "Meanwhile, the ``LightRAG ReAct agent`` reaches EM=0.3 on the same 10 questions."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 7. Future Improvement"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO:\n",
+    "# 1. advanced, add history to react\n",
+    "# 2. add training, few shot\n",
+    "# 3. llm as judge\n",
+    "# 4. add picture\n",
+    "# 5. better json handling, we need to store the answer output"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "my-project-kernel",
+   "language": "python",
+   "name": "my-project-kernel"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/use_cases/classification/train.py b/use_cases/classification/train.py
index 0bdbd562..bb7bcfba 100644
--- a/use_cases/classification/train.py
+++ b/use_cases/classification/train.py
@@ -11,6 +11,7 @@
     gpt_3_model,
     gpt_4o_model,
 )
+from adalflow.core.generator import BackwardPassSetup


 class TrecClassifierAdal(adal.AdalComponent):
@@ -26,7 +27,7 @@ def __init__(
         eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item
         loss_fn = adal.EvalFnToTextLoss(
             eval_fn=eval_fn,
-            eval_fn_desc="exact_match: 1 if str(y) == str(y_gt) else 0",
+            eval_fn_desc="exact_match: 1 if str(y) == str(y_gt) else 0. 
When the LLM prediction failed with format parsing which results with errors, we set y_pred = -1", ) super().__init__( task=task, @@ -51,8 +52,8 @@ def prepare_eval( def prepare_loss( self, sample: TRECExtendedData, y_pred: adal.Parameter, *args, **kwargs ) -> Tuple[Callable[..., Any], Dict]: - full_response = y_pred.full_response - y_label = -1 + full_response = y_pred.data + y_label = -1 # default value for failed prediction if ( full_response and full_response.data is not None @@ -67,7 +68,11 @@ def prepare_loss( eval_input=sample.class_name, requires_opt=False, ) - return self.loss_fn, {"kwargs": {"y": y_pred, "y_gt": y_gt}} + return self.loss_fn, { + "kwargs": {"y": y_pred, "y_gt": y_gt}, + "id": sample.id, + "gt": y_gt.eval_input, + } def train( @@ -81,6 +86,9 @@ def train( strategy="constrained", optimization_order="sequential", debug=False, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, ): # TODO: ensure the teacher prompt gets updated with the new model adal_component = TrecClassifierAdal( @@ -90,6 +98,12 @@ def train( backward_engine_model_config=gpt_4o_model, teacher_model_config=gpt_4o_model, ) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) print(adal_component) trainer = adal.Trainer( train_batch_size=train_batch_size, @@ -103,32 +117,71 @@ def train( weighted_sampling=True, optimization_order=optimization_order, exclude_input_fields_from_bootstrap_demos=False, + max_proposals_per_step=max_proposals_per_step, ) + trainer.set_random_seed(seed) print(trainer) train_dataset, val_dataset, test_dataset = load_datasets() - trainer.fit( + ckpt, _ = trainer.fit( train_dataset=train_dataset, - val_dataset=test_dataset, - # val_dataset=val_dataset, - # test_dataset=test_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, debug=debug, - resume_from_ckpt="/Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_5d1bf_run_1.json", + backward_pass_setup=backward_pass_setup, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_5d1bf_run_1.json", ) + return ckpt if __name__ == "__main__": # TODO: # Evaluating step(6): 0.7333 across 30 samples, Max potential: 0.7778: 83%|▊| 30/36 [00:08<00:01, # Optimizer revert: 0.7096774193548387 <= 0.7777777777777778 - train( + import json + + import random + + random.seed(2025) + # np.random.seed(2025) # Set NumPy random seed + + # make the strategy configurable in the script + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_true") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + ckpt = train( **gpt_3_model, debug=False, max_steps=12, strategy="constrained", optimization_order="sequential", - ) - # val 0.694 -> 0.833, #test 0.8472 -> 0.833, adding more shots does not help + seed=2025, + tg=use_tg, + max_proposals_per_step=max_proposals_per_step, + ) # val 0.694 -> 0.833, #test 0.8472 -> 0.833, adding more shots does not help + + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to 
{set_output_path}") + else: + print("No file path provided for saving the checkpoint.") + # NOTE: raw: 40, bootstrap: 4, max_steps: 8, strategy: random, val: 86.1, test: 86.8 (+4.2% compared with dspy) # NOTE: train task without output format: val: 0.67->0.805, test: 0.805-> 0.896 # best performing model (zero-shot) # NOTE: train with without output format, use new class_name: constrained_max_steps_12_bac8d_run_1.json @@ -146,6 +199,18 @@ def train( # NOTE: # continue from last best, 1 bootstrap, (both input and rational)86.1 val, 86.1 test (not really better) # TrecClassifierAdal/constrained_max_steps_12_2ffa7_run_2.json + # 1086s + # 0.88 validation (the steps are not right, it shows 56 steps) + # /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_5d1bf_run_1.json + # 0.8958 validations -> 81 steps + # /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_5d1bf_run_1.json + # /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_05739_run_1.json 12 steps, 85.42% test both positve and negative gradients, 1472 seconds + # /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_63dec_run_1.json 86.81% test on only negative gradients. with past history, 987 seconds + # no past history, 83% only. 84 /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_ca5ac_run_1.json + # past history, both gradients, 88.89% in 12 steps /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_b4612_run_1.json 1477s + # /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_f1e5a_run_1.json 811s 89.58% both positive and negative gradients + # /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_05a8e_run_1.json 1518s 85.41% only negative gradients + # /Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_e0f86_run_1.json 1247s, 88.88 both gradients # theory: all few-shots demo or instruction, all so that the llm can reason better. Once it reches to its limits, no more shots can help or further instruction can. diff --git a/use_cases/classification/train_string_output.py b/use_cases/classification/train_string_output.py index 9ecdef27..45fe5bcf 100644 --- a/use_cases/classification/train_string_output.py +++ b/use_cases/classification/train_string_output.py @@ -7,7 +7,7 @@ from use_cases.classification.data import load_datasets, TRECExtendedData from adalflow.eval.answer_match_acc import AnswerMatchAcc -from LightRAG.use_cases.config import ( +from use_cases.config import ( gpt_3_model, gpt_4o_model, ) diff --git a/use_cases/classification/trec_task_structured_output.py b/use_cases/classification/trec_task_structured_output.py index eb5333cd..56014cc6 100644 --- a/use_cases/classification/trec_task_structured_output.py +++ b/use_cases/classification/trec_task_structured_output.py @@ -60,7 +60,7 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): # data="You are a classifier. Given a question, classify it into one of the following classes based on what the question is seeking:\n\nFormat: class_index. class_name, class_description\n\n0. ABBR, Abbreviation\n1. ENTY, Entity\n2. DESC, Description and abstract concept\n3. HUM, Human being\n4. LOC, Location\n5. NUM, Numeric value\n\nPay special attention to questions about entities versus descriptions, as well as those asking for specific terms or people. Do not try to answer the question:", # best # data="You are a classifier. 
For each question given, classify it into one of the following classes:\n\nFormat: class_index. class_name, class_description\n\n0. ABBR, Abbreviation (includes initials)\n1. ENTY, Entity (includes products, languages, objects, etc.)\n2. DESC, Description and abstract concept (includes explanations)\n3. HUM, Human being (includes individuals, groups, etc.)\n4. LOC, Location (includes addresses, places, etc.)\n5. NUM, Numeric value (includes distances, dates, ages, etc.)\n\n- Focus on identifying the primary subject of the question and classifying based on what is being explicitly asked for.", role_desc="Task description", - requires_opt=False, + requires_opt=True, param_type=adal.ParameterType.PROMPT, ), "output_format_str": adal.Parameter( @@ -70,12 +70,12 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): param_type=adal.ParameterType.PROMPT, ), # NOTE: 88.19% - "few_shot_demos": adal.Parameter( - data=None, - requires_opt=True, - role_desc="Few shot examples to help the model", - param_type=adal.ParameterType.DEMOS, - ), + # "few_shot_demos": adal.Parameter( + # data=None, + # requires_opt=True, + # role_desc="Few shot examples to help the model", + # param_type=adal.ParameterType.DEMOS, + # ), } self.llm = adal.Generator( @@ -96,7 +96,7 @@ def _prepare_input(self, question: str): prompt_kwargs = { "input_str": adal.Parameter( data=input_str, - requires_opt=True, + requires_opt=False, role_desc="input to the LLM", param_type=adal.ParameterType.INPUT, ) @@ -108,6 +108,8 @@ def call( ) -> Union[adal.GeneratorOutput, adal.Parameter]: prompt_kwargs = self._prepare_input(question) output = self.llm(prompt_kwargs=prompt_kwargs, id=id) + if isinstance(output, adal.Parameter): + output.data_in_prompt = lambda x: x.data.raw_response return output diff --git a/use_cases/config.py b/use_cases/config.py index 895ed097..3ee366b0 100644 --- a/use_cases/config.py +++ b/use_cases/config.py @@ -25,6 +25,19 @@ }, } +gpt_3_1106_model = { + "model_client": OpenAIClient(input_type="text"), + "model_kwargs": { + "model": "gpt-3.5-turbo-1106", + "max_tokens": 2000, + "temperature": 0.0, + "top_p": 0.99, + "frequency_penalty": 0, + "presence_penalty": 0, + "stop": None, + }, +} + # https://openai.com/api/pricing/ # use this for evaluation gpt_4o_mini_model = { @@ -38,10 +51,32 @@ }, } +gpt_4_model = { + "model_client": OpenAIClient(), + "model_kwargs": { + "model": "gpt-4-turbo", + "temperature": 1, + "top_p": 0.99, + "max_tokens": 1000, + # "frequency_penalty": 1, # high for nto repeating prompt + }, +} + gpt_4o_model = { "model_client": OpenAIClient(), "model_kwargs": { - "model": "gpt-4o-mini", + "model": "gpt-4o", # gpt-4o-realtime-preview-2024-12-17 + "temperature": 1, + "top_p": 0.99, + "max_tokens": 1000, + # "frequency_penalty": 1, # high for nto repeating prompt + }, +} + +gpt_4o1_model = { + "model_client": OpenAIClient(), + "model_kwargs": { + "model": "o1-preview", "temperature": 1, "top_p": 0.99, "max_tokens": 1000, diff --git a/use_cases/question_answering/bbh/data.py b/use_cases/question_answering/bbh/data.py index d1fc3709..910b7e00 100644 --- a/use_cases/question_answering/bbh/data.py +++ b/use_cases/question_answering/bbh/data.py @@ -3,11 +3,11 @@ import re from dataclasses import dataclass, field -import adalflow as adal from adalflow.core import DataClass from adalflow.datasets.big_bench_hard import BigBenchHard from adalflow.utils.data import subset_dataset +from adalflow.core import func_to_parser @dataclass @@ -64,7 +64,7 @@ class QuestionAnswer(DataClass): ) # 
score can be used as weight for demo, weight = score (the higher the more likely to be sampled) -@adal.fun_to_component +@func_to_parser def parse_integer_answer(answer: str): """A function that parses the last integer from a string using regular expressions.""" try: @@ -81,7 +81,7 @@ def parse_integer_answer(answer: str): return answer -@adal.fun_to_component +@func_to_parser def extract_answer(answer: str) -> str: try: pattern = re.compile(r"Answer:\s*(.*)", re.DOTALL) diff --git a/use_cases/question_answering/bbh/object_count/task.py b/use_cases/question_answering/bbh/object_count/task.py index 6f5571f8..4892fe0f 100644 --- a/use_cases/question_answering/bbh/object_count/task.py +++ b/use_cases/question_answering/bbh/object_count/task.py @@ -37,12 +37,12 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): param_type=ParameterType.PROMPT, instruction_to_optimizer="You can try to show examples to see if it helps.", ) - few_shot_demos = adal.Parameter( - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=False, - param_type=ParameterType.DEMOS, - ) + # few_shot_demos = adal.Parameter( + # data=None, + # role_desc="To provide few shot demos to the language model", + # requires_opt=True, + # param_type=ParameterType.DEMOS, + # ) self.llm_counter = adal.Generator( model_client=model_client, @@ -50,19 +50,19 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): template=few_shot_template, prompt_kwargs={ "system_prompt": system_prompt, - "few_shot_demos": few_shot_demos, + # "few_shot_demos": few_shot_demos, }, output_processors=parse_integer_answer, use_cache=True, ) - def call( + def bicall( self, question: str, id: str = None ) -> Union[adal.GeneratorOutput, adal.Parameter]: output = self.llm_counter(prompt_kwargs={"input_str": question}, id=id) - print(f"output: {output}, training: {self.training}") + # print(f"output: {output}, training: {self.training}") if self.training: - if output.full_response.error and "429" in output.full_response.error: + if output.data.error and "429" in output.data.error: raise ValueError("Rate limit exceeded") else: if output.error and "429" in output.error: @@ -85,8 +85,9 @@ def test_object_count_task(): task_pipeline.train() answer: adal.Parameter = task_pipeline(question, id="1") print(answer) - print(f"full_response: {answer.full_response}") + print(f"data: {answer.data}") answer.draw_graph() + print(f"prompt_data: {answer.get_prompt_data()}") if __name__ == "__main__": diff --git a/use_cases/question_answering/bbh/object_count/train_new.py b/use_cases/question_answering/bbh/object_count/train_new.py index 48309aa7..8ed2f2b2 100644 --- a/use_cases/question_answering/bbh/object_count/train_new.py +++ b/use_cases/question_answering/bbh/object_count/train_new.py @@ -43,6 +43,7 @@ def prepare_eval( self, sample: Example, y_pred: adal.GeneratorOutput ) -> Tuple[float, Dict[str, Any]]: y_label = -1 + print(f"y_pred: {y_pred}") if ( y_pred is not None and y_pred.data is not None ): # if y_pred and y_pred.data: might introduce bug when the data is 0 @@ -58,8 +59,8 @@ def prepare_loss( eval_input=sample.answer, requires_opt=False, ) - pred.eval_input = pred.full_response.data - return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}} + pred.eval_input = pred.data.data + return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}, "id": sample.id} # TODO: make the train diagnose on the student model and the teacher model automatcally @@ -95,6 +96,9 @@ def train_diagnose_teacher( # You 
will answer a reasoning question. Think step by step and double-check each calculation you make. Pay close attention to any numerical quantities in the text, converting written numbers into their numerical equivalents. Additionally, re-verify your final answer before concluding. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. # 0.98 val, 0.91 test +from adalflow.core.generator import BackwardPassSetup + + def train( train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle raw_shots: int = 0, @@ -106,6 +110,9 @@ def train( debug=False, resume_from_ckpt=None, exclude_input_fields_from_bootstrap_demos=False, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, ): adal_component = ObjectCountAdalComponent( **gpt_3_model, @@ -114,6 +121,13 @@ def train( backward_engine_model_config=gpt_4o_model, ) print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + trainer = adal.Trainer( train_batch_size=train_batch_size, adaltask=adal_component, @@ -123,37 +137,73 @@ def train( raw_shots=raw_shots, bootstrap_shots=bootstrap_shots, debug=debug, - weighted_sampling=True, + weighted_sampling=False, optimization_order=optimization_order, exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + max_proposals_per_step=max_proposals_per_step, ) + trainer.set_random_seed(seed) print(trainer) train_dataset, val_dataset, test_dataset = load_datasets() + # train_dataset = train_dataset[:4] + # val_dataset = val_dataset[:4] + # test_dataset = test_dataset[:4] + ckpt, _ = trainer.fit( train_dataset=train_dataset, val_dataset=val_dataset, test_dataset=test_dataset, resume_from_ckpt=resume_from_ckpt, + backward_pass_setup=backward_pass_setup, ) return ckpt if __name__ == "__main__": - import sys import json + import random + + random.seed(2025) + # np.random.seed(2025) # Set NumPy random seed + + # make the strategy configurable in the script + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_true") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + ckpt = train( debug=False, max_steps=12, - strategy="constrained", + strategy=set_strategy, exclude_input_fields_from_bootstrap_demos=True, + seed=2025, # pass the numpy seed + tg=use_tg, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json", ) print(f"ckpt: {ckpt}") - # Save ckpt to a file passed as an argument - if len(sys.argv) > 1: # Check if a file path is provided - with open(sys.argv[1], "w") as f: + if set_output_path: + with open(set_output_path, "w") as f: json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") # train_diagnose(**gpt_3_model) # train_diagnose_teacher(**gpt_4o_model) # 4omini works well as an optimizer too @@ -163,3 +213,11 @@ def train( # 0.86->0.94 val, 0.79 
-> 0.93 with only negative gradients /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_7a649_run_1.json # without gradients -> 0.9 on tests + # without positive gradients -> /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_8ac70_run_1.json 0.84->0.94 val, 0.82 -> 0.88 test + + # /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_1f358_run_1.json 1 val 0.96 val 955s + # 0.94 val, 0.89 test, /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_e1bb5_run_1.json 907s, with both positive and negatives + # 92, 91 test /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json 747s + # 96% /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json + # (90%, 94%, 92%, 94%) 92.5 + 1.5 + # (96%, 100%, 96%, 96% ) 97+ 1.73 diff --git a/use_cases/question_answering/bbh/word_sorting/train.py b/use_cases/question_answering/bbh/word_sorting/train.py index 12518206..c2a20da1 100644 --- a/use_cases/question_answering/bbh/word_sorting/train.py +++ b/use_cases/question_answering/bbh/word_sorting/train.py @@ -76,7 +76,7 @@ def prepare_loss(self, sample: Example, pred: adal.Parameter): eval_input=sample.answer, requires_opt=False, ) - pred.eval_input = pred.full_response.data # processed + pred.eval_input = pred.data.data # processed question_param = adal.Parameter( name="question", data=sample.question, @@ -89,7 +89,8 @@ def prepare_loss(self, sample: Example, pred: adal.Parameter): "pred_answer": pred, "gt_answer": y_gt, "question": question_param, - } + }, + "id": sample.id, } diff --git a/use_cases/text_grad_2.0_train.py b/use_cases/text_grad_2.0_train.py index 37ff320d..1ae09ea1 100644 --- a/use_cases/text_grad_2.0_train.py +++ b/use_cases/text_grad_2.0_train.py @@ -1,21 +1,52 @@ import subprocess import tempfile import json +import numpy as np +import argparse +num_runs = 4 # List of experiments to run object_count = "use_cases/question_answering/bbh/object_count/train_new.py" +trec_6_classification = "use_cases/classification/train.py" hotpot_qa_multi_hop_rag = "benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py" +hotpot_qa_vanilla_rag = "benchmarks/hotpot_qa/adal_exp/train_vanilla.py" ckpt_values = [] experiments = [ object_count, + # trec_6_classification, + # hotpot_qa_vanilla_rag, # hotpot_qa_multi_hop_rag, ] +# set up the strategy for each experiment + +argparser = argparse.ArgumentParser() +argparser.add_argument("--strategy", type=str, default="constrained") +argparser.add_argument("--use_tg", action="store_true") +argparser.add_argument("--max_proposals_per_step", type=int, default=5) + +args = argparser.parse_args() + +strategy = args.strategy +use_tg = args.use_tg +max_proposals_per_step = args.max_proposals_per_step + # Optional: Arguments for each experiment (if needed) + +setup_str = f"--strategy {strategy}" + +if use_tg: + setup_str += " --use_tg" + +setup_str += f" --max_proposals_per_step {max_proposals_per_step}" + + experiment_args = { - object_count: "", - # hotpot_qa_multi_hop_rag: "", + object_count: setup_str, + trec_6_classification: setup_str, + hotpot_qa_vanilla_rag: setup_str, + hotpot_qa_multi_hop_rag: setup_str, } ckpt_values = {} @@ -47,12 +78,117 @@ def run_experiment(script, args): if __name__ == "__main__": + + result_file = "text_grad_2_results" + # add important run information in the naming of the file + import uuid + + result_file = f"{result_file}_{num_runs}_runs_{uuid.uuid4()}.json" + 
for experiment in experiments: args = experiment_args.get(experiment, "") - ckpt = run_experiment(experiment, args) - if ckpt: - ckpt_values[experiment] = ckpt + for i in range(num_runs): + print(f"\nRun {i + 1}/{num_runs}") + ckpt = run_experiment(experiment, args) + ckpt_index = f"{experiment}_{i + 1}" + if ckpt: + ckpt_values[ckpt_index] = ckpt + # load all json files using the ckpt paths + highest_test_score, last_test_score, mean_test_score, standard_deviation = ( + 0, + 0, + 0, + 0, + ) + last_test_scores = [] + highest_val_scores = [] + total_passes = ( + [] + ) # each is the number of unique val scores in the highest val scores + total_prompts = [] # how many prompts tried in total + + past_highest_val_scores = [] + # # average pass rate, average pass prompts + # average_pass_rate_list = [] + # average_pass_prompts_list = [] + # average_total_prompts = [] + # highest_test_score_json_file = None + total_steps = [] + training_times = [] + for experiment_index, ckpt in ckpt_values.items(): + with open(ckpt, "r") as f: + data = json.load(f) + print(f"Experiment: {experiment_index}") + print(f"Data: {data}") + _high_val_score = max(data["val_scores"]) + _unique_val_scores = len(set(data["val_scores"])) - 1 + _last_test_score = data["test_score"] + # read the effective measures + effective_measures = data.get("effective_measure", {}) + + _total_prompts = effective_measures.get("subset", {}).get( + "pass", 0 + ) + effective_measures.get("subset", {}).get("fail", 0) + if _total_prompts == 0: + _total_prompts = effective_measures.get("valset", {}).get( + "pass", 0 + ) + effective_measures.get("valset", {}).get("fail", 0) + _total_steps = len(data["steps"]) - 1 + _training_time = data.get("total_time", 0) + # save the results in the lists + past_highest_val_scores.append(_high_val_score) + total_passes.append(_unique_val_scores) + total_prompts.append(_total_prompts) + last_test_scores.append(_last_test_score) + total_steps.append(_total_steps) + training_times.append(_training_time) + + # ensure all steps are the same + assert all( + [step == total_steps[0] for step in total_steps] + ), "All steps should be the same" + + # compute the metrics + mean_test_score = np.mean(last_test_scores) + std_test_score = np.std(last_test_scores) + + # val scores + mean_val_score = np.mean(past_highest_val_scores) + std_val_score = np.std(past_highest_val_scores) + + # pass rate total_passes / steps + average_pass_rate = np.mean(total_passes) / total_steps[0] + + # average total prompts + average_total_prompts = np.mean(total_prompts) + + # average training time + average_training_time = np.mean(training_times) + + # add these numbers in the ckpt_values + index = f"{experiment}_summary" + ckpt_values[index] = { + "config": { + "num_runs": num_runs, + "args": args, + }, + "metrics": { + "mean_test_score": mean_test_score, + "std_test_score": std_test_score, + "mean_val_score": mean_val_score, + "std_val_score": std_val_score, + "average_pass_rate": average_pass_rate, + "average_total_prompts": average_total_prompts, + "average_training_time": average_training_time, + }, + } print("\nAll Checkpoints:") for experiment, ckpt in ckpt_values.items(): print(f"{experiment}: {ckpt}") + + # Save the results to a file + with open(result_file, "w") as f: + json.dump(ckpt_values, f, indent=4) + + print(f"\nResults saved to {result_file}")
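
Note on consuming the saved results: the runner above records one checkpoint path per run plus one "<experiment>_summary" entry holding the aggregated metrics (mean/std of test and validation scores, average pass rate, average number of prompts tried, and average training time), and writes everything to a JSON file whose name embeds num_runs and a freshly generated uuid. The snippet below is only an illustrative sketch of how such a file can be read back afterwards; it is not part of the diff, and the file name used here is a placeholder since the real one is generated at run time.

import json

# Placeholder path: the actual name is built as
# f"text_grad_2_results_{num_runs}_runs_{uuid.uuid4()}.json" when the runner finishes.
result_file = "text_grad_2_results_4_runs_<uuid>.json"

with open(result_file, "r") as f:
    results = json.load(f)

for key, value in results.items():
    if key.endswith("_summary"):
        # Aggregated metrics written by the runner for one experiment.
        metrics = value["metrics"]
        print(key)
        print(f"  test score: {metrics['mean_test_score']:.4f} +/- {metrics['std_test_score']:.4f}")
        print(f"  val score:  {metrics['mean_val_score']:.4f} +/- {metrics['std_val_score']:.4f}")
        print(f"  pass rate:  {metrics['average_pass_rate']:.2f}")
        print(f"  avg prompts tried:  {metrics['average_total_prompts']:.1f}")
        print(f"  avg training time:  {metrics['average_training_time']:.1f}s")
    else:
        # Remaining keys are "<experiment>_<run_index>" mapped to a checkpoint path.
        print(f"{key}: checkpoint at {value}")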