rllm-org
diff --git a/examples/frozenlake/train_frozenlake_agent.sh b/examples/frozenlake/train_frozenlake_agent.sh
Lines changed: 1 addition & 1 deletion
diff --git a/examples/simple_math/train_hendrycks_math.sh b/examples/simple_math/train_hendrycks_math.sh
Lines changed: 1 addition & 1 deletion
diff --git a/rllm/engine/__init__.py b/rllm/engine/__init__.py
Lines changed: 15 additions & 9 deletions
diff --git a/rllm/engine/agent_execution_engine.py b/rllm/engine/agent_execution_engine.py
Lines changed: 4 additions & 2 deletions
diff --git a/rllm/engine/rollout/__init__.py b/rllm/engine/rollout/__init__.py
Lines changed: 17 additions & 1 deletion
diff --git a/rllm/engine/rollout/openai_engine.py b/rllm/engine/rollout/openai_engine.py
Lines changed: 90 additions & 24 deletions
diff --git a/rllm/engine/rollout/rollout_engine.py b/rllm/engine/rollout/rollout_engine.py
Lines changed: 38 additions & 6 deletions
@@ -52,7 +52,7 @@ python3 -m examples.frozenlake.train_frozenlake_agent \
     trainer.logger=['console','wandb'] \
     trainer.project_name='rllm-agent' \
     trainer.experiment_name='frozenlake-agent-0.6B' \
-    trainer.val_before_train=False \
+    trainer.val_before_train=True \
     trainer.n_gpus_per_node=8 \
     trainer.nnodes=1 \
     trainer.save_freq=40 \
 
@@ -54,7 +54,7 @@ python3 -m examples.simple_math.train_hendrycks_math \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
     trainer.project_name='rllm-agent' \
-    trainer.experiment_name='deepscaler-debug-math-fsdp1' \
+    trainer.experiment_name='simple-math' \
     trainer.val_before_train=True \
     trainer.n_gpus_per_node=8 \
     trainer.nnodes=1 \
 
@@ -4,7 +4,9 @@
 """
 
 from .agent_execution_engine import AgentExecutionEngine, AsyncAgentExecutionEngine
-from .rollout.openai_engine import OpenAIEngine
+
+# Avoid importing rollout submodules eagerly to prevent circular imports with workflows
+# Import base class only (no side effects) and lazy-load specific engines via __getattr__
 from .rollout.rollout_engine import RolloutEngine
 
 __all__ = [
@@ -13,20 +15,24 @@
     "AgentWorkflowEngine",
     "RolloutEngine",
     "OpenAIEngine",
+    "VerlEngine",
 ]
 
-# VerlEngine is optional; only export if verl is installed
-try:
-    from .rollout.verl_engine import VerlEngine
-
-    __all__.append("VerlEngine")
-except Exception:
-    VerlEngine = None
-
 
 def __getattr__(name):
     if name == "AgentWorkflowEngine":
         from .agent_workflow_engine import AgentWorkflowEngine as _AgentWorkflowEngine
 
         return _AgentWorkflowEngine
+    if name == "OpenAIEngine":  # imported on first access rather than at package import time
+        from .rollout.openai_engine import OpenAIEngine as _OpenAIEngine
+
+        return _OpenAIEngine
+    if name == "VerlEngine":  # optional engine: any import failure surfaces as AttributeError
+        try:
+            from .rollout.verl_engine import VerlEngine as _VerlEngine
+
+            return _VerlEngine
+        except Exception as exc:  # keep the original error as __cause__ so a broken install is diagnosable
+            raise AttributeError(name) from exc
     raise AttributeError(name)
@@ -102,6 +102,8 @@ def __init__(
                 **rollout_engine_args,
                 api_retries=api_retries,
                 tokenizer=self.tokenizer,
+                max_prompt_length=self.max_prompt_length,
+                max_response_length=self.max_response_length,
                 disable_thinking=kwargs.get("disable_thinking", False),
             )
         elif self.engine_name == "verl":
@@ -140,12 +142,12 @@ async def get_model_response(self, prompt, application_id, **kwargs) -> str:
         sampling_params.update(kwargs)
 
         if self.engine_name == "openai":
-            output = await self.rollout_engine.get_model_response(prompt, application_id=application_id, **sampling_params)
+            output = await self.rollout_engine.get_model_response(prompt, application_id=application_id, enforce_max_prompt_length=False, **sampling_params)
             return output.text
         elif self.engine_name == "verl":
             meta_data = sampling_params.pop("meta_info", {})
             validate = meta_data.get("validate", False)
-            output = await self.rollout_engine.get_model_response(prompt, application_id=application_id, validate=validate, **sampling_params)
+            output = await self.rollout_engine.get_model_response(prompt, application_id=application_id, validate=validate, enforce_max_prompt_length=False, **sampling_params)
             return output.text
         else:
             raise NotImplementedError(f"Engine type '{self.engine_name}' not supported")
 
@@ -1,8 +1,24 @@
-from .openai_engine import OpenAIEngine
+# Avoid importing concrete engines at module import time to prevent circular imports
 from .rollout_engine import ModelOutput, RolloutEngine
 
 __all__ = [
     "ModelOutput",
     "RolloutEngine",
     "OpenAIEngine",
+    "VerlEngine",
 ]
+
+
+def __getattr__(name):
+    if name == "OpenAIEngine":  # imported on first access rather than at package import time
+        from .openai_engine import OpenAIEngine as _OpenAIEngine
+
+        return _OpenAIEngine
+    if name == "VerlEngine":  # optional engine: any import failure surfaces as AttributeError
+        try:
+            from .verl_engine import VerlEngine as _VerlEngine
+
+            return _VerlEngine
+        except Exception as exc:  # keep the original error as __cause__ so a broken install is diagnosable
+            raise AttributeError(name) from exc
+    raise AttributeError(name)
@@ -6,49 +6,80 @@
 
 from rllm.engine.rollout.rollout_engine import ModelOutput, RolloutEngine
 from rllm.globals import THOUGHT_DELIMITER_END, THOUGHT_DELIMITER_START
-from rllm.parser import ChatTemplateParser, ToolParser
+from rllm.parser import ChatTemplateParser
+from rllm.tools.tool_base import Tool
+from rllm.workflows import TerminationEvent, TerminationReason
 
 
 class OpenAIEngine(RolloutEngine):
-    def __init__(self, model: str, tokenizer=None, api_retries: int = 3, base_url: str = "https://api.openai.com/v1", api_key: str = os.getenv("OPENAI_API_KEY"), sampling_params: dict | None = None, **kwargs):
+    def __init__(self, model: str = "", tokenizer=None, max_prompt_length: int = 4096, max_response_length: int = 4096, max_model_length: int | None = None, api_retries: int = 3, base_url: str = "https://api.openai.com/v1", api_key: str = os.getenv("OPENAI_API_KEY"), sampling_params: dict | None = None, tools: list[Tool | dict] = None, accumulate_reasoning: bool = False, **kwargs):
         self.model = model
+        self.max_prompt_length = max_prompt_length
+        self.max_response_length = max_response_length
+        self.max_model_length = max_model_length - 1 if max_model_length is not None else max_prompt_length + max_response_length - 1
         self.api_retries = api_retries
         self.sampling_params = sampling_params or {}
+        self.tools = tools or []
+        self.accumulate_reasoning = accumulate_reasoning
 
         self.tokenizer = tokenizer
         if self.tokenizer is not None:
             self.chat_parser = ChatTemplateParser.get_parser(self.tokenizer, disable_thinking=kwargs.get("disable_thinking", False))
-            try:
-                self.tool_parser = ToolParser.get_parser(self.tokenizer)
-            except Exception:
-                print(f"Warning: No tool parser found for {self.tokenizer.name_or_path}. Tool calls not be parsed.")
-                self.tool_parser = None
             self._use_chat_completions = False
         else:
-            print("No tokenizer provided, will use the chat completions endpoint. This is not recommended.")
+            # In this case, we cannot enforce max prompt length or dynamically adjust max_tokens <= max_response_length if needed
+            print("No tokenizer provided to OpenAIEngine, will use the chat completions endpoint.")
             self._use_chat_completions = True
 
         self.client = openai.AsyncOpenAI(base_url=base_url, api_key=api_key)
         logging.getLogger("httpx").setLevel(logging.WARNING)
 
     async def chat_completion(self, messages: list[dict], **kwargs) -> ModelOutput:
+        kwargs.pop("application_id", None)
+        kwargs.pop("validate", None)
+        kwargs.pop("model", None)
+        kwargs.pop("enforce_max_prompt_length", None)
+
         sampling_params = self.sampling_params.copy()
         sampling_params.update(kwargs)
-        sampling_params.pop("model", None)
+
+        max_tokens = sampling_params.pop("max_tokens", sampling_params.pop("max_new_tokens", self.max_response_length))
+
         retries = self.api_retries
         while retries > 0:
             try:
-                response = await self.client.chat.completions.create(model=self.model, messages=messages, timeout=3600, **sampling_params)
-                text = response.choices[0].message.content
-                if hasattr(response.choices[0].message, "reasoning") and isinstance(response.choices[0].message.reasoning, str):
-                    text = f"{THOUGHT_DELIMITER_START}\n{response.choices[0].message.reasoning}\n{THOUGHT_DELIMITER_END}\n\n{text}"
-                return ModelOutput(text=text, tool_calls=response.choices[0].message.tool_calls, finish_reason=response.choices[0].finish_reason, completion_tokens=response.usage.completion_tokens, prompt_tokens=response.usage.prompt_tokens)
+                response = await self.client.chat.completions.create(model=self.model, messages=messages, timeout=3600, max_tokens=max_tokens, **sampling_params)
+
+                content = response.choices[0].message.content
+                reasoning = response.choices[0].message.reasoning if hasattr(response.choices[0].message, "reasoning") and isinstance(response.choices[0].message.reasoning, str) else ""
+                tool_calls = response.choices[0].message.tool_calls if hasattr(response.choices[0].message, "tool_calls") and isinstance(response.choices[0].message.tool_calls, list) else []
+
+                # Always bind text: plain content when there is no reasoning, otherwise reasoning wrapped in the thought delimiters (best guess)
+                text = f"{THOUGHT_DELIMITER_START}\n{reasoning}\n{THOUGHT_DELIMITER_END}\n\n{content}" if reasoning else content
+
+                prompt_length = response.usage.prompt_tokens
+                completion_length = response.usage.completion_tokens
+                finish_reason = response.choices[0].finish_reason
+
+                return ModelOutput(
+                    text=text,
+                    content=content,
+                    reasoning=reasoning,
+                    tool_calls=tool_calls,
+                    prompt_ids=[],
+                    completion_ids=[],
+                    prompt_length=prompt_length,
+                    completion_length=completion_length,
+                    finish_reason=finish_reason,
+                )
+
             except openai.RateLimitError:
                 retries -= 1
                 if retries == 0:
                     raise Exception("Rate limit reached and retries exhausted.") from None
                 print("Sleep for 5 seconds for API limit.")
                 await asyncio.sleep(5)
+
             except Exception as e:
                 retries -= 1
                 if retries == 0:
@@ -57,20 +88,58 @@ async def chat_completion(self, messages: list[dict], **kwargs) -> ModelOutput:
                 await asyncio.sleep(1)
 
     async def completion(self, prompt: str, **kwargs) -> ModelOutput:
+        kwargs.pop("application_id", None)
+        kwargs.pop("validate", None)
+        kwargs.pop("model", None)
+        enforce_max_prompt_length = kwargs.pop("enforce_max_prompt_length", True)
+
         sampling_params = self.sampling_params.copy()
         sampling_params.update(kwargs)
-        sampling_params.pop("model", None)
+
+        prompt_ids = self.tokenizer.encode(prompt, add_special_tokens=False)
+        prompt_length = len(prompt_ids)
+
+        if enforce_max_prompt_length and (prompt_length > self.max_prompt_length or prompt_length > self.max_model_length):
+            raise TerminationEvent(TerminationReason.MAX_PROMPT_LENGTH_EXCEEDED)
+
+        max_tokens = sampling_params.pop("max_tokens", sampling_params.pop("max_new_tokens", self.max_response_length))
+        remaining_tokens = self.max_model_length - prompt_length
+        if remaining_tokens <= max_tokens:
+            max_tokens = remaining_tokens
+            print(f"Warning: Decreasing max_tokens to {max_tokens} to stay within max_model_length")
+
         retries = self.api_retries
         while retries > 0:
             try:
-                response = await self.client.completions.create(model=self.model, prompt=prompt, timeout=3600, **sampling_params)
-                return ModelOutput(text=response.choices[0].text, tool_calls=[], finish_reason=response.choices[0].finish_reason, completion_tokens=response.usage.completion_tokens, prompt_tokens=response.usage.prompt_tokens)
+                response = await self.client.completions.create(model=self.model, prompt=prompt, timeout=3600, max_tokens=max_tokens, **sampling_params)
+
+                text = response.choices[0].text
+                completion_ids = self.tokenizer.encode(text, add_special_tokens=False)
+                parsed_output = self.chat_parser.parse_completion(completion_ids)
+
+                prompt_length = response.usage.prompt_tokens
+                completion_length = response.usage.completion_tokens
+                finish_reason = response.choices[0].finish_reason
+
+                return ModelOutput(
+                    text=text,
+                    content=parsed_output["content"],
+                    reasoning=parsed_output["reasoning"],
+                    tool_calls=parsed_output["tool_calls"],
+                    prompt_ids=prompt_ids,
+                    completion_ids=completion_ids,
+                    prompt_length=prompt_length,
+                    completion_length=completion_length,
+                    finish_reason=finish_reason,
+                )
+
             except openai.RateLimitError:
                 retries -= 1
                 if retries == 0:
                     raise Exception("Rate limit reached and retries exhausted.") from None
                 print("Sleep for 5 seconds for API limit.")
                 await asyncio.sleep(5)
+
             except Exception as e:
                 retries -= 1
                 if retries == 0:
@@ -79,13 +148,10 @@ async def completion(self, prompt: str, **kwargs) -> ModelOutput:
                 await asyncio.sleep(1)
 
     async def get_model_response(self, messages: list[dict], **kwargs) -> ModelOutput:
-        kwargs.pop("application_id", None)  # only needed for verl engine
-        kwargs.pop("validate", None)  # only needed for verl engine
         if self._use_chat_completions:
             return await self.chat_completion(messages, **kwargs)
         else:
-            prompt = self.chat_parser.parse(messages, add_generation_prompt=True, is_first_msg=True)
-            output = await self.completion(prompt, **kwargs)
-            if self.tool_parser is not None:
-                output.tool_calls = self.tool_parser.parse(output.text)
-            return output
+            tools = kwargs.pop("tools", self.tools)
+            accumulate_reasoning = kwargs.pop("accumulate_reasoning", self.accumulate_reasoning)
+            prompt = self.chat_parser.parse(messages, add_generation_prompt=True, is_first_msg=True, tools=tools, accumulate_reasoning=accumulate_reasoning)
+            return await self.completion(prompt, **kwargs)
@@ -1,19 +1,51 @@
 from dataclasses import dataclass
 
+from rllm.tools.tool_base import ToolCall
+
 
 @dataclass
 class ModelOutput:
     text: str
-    tool_calls: list
+    content: str
+    reasoning: str
+    tool_calls: list[ToolCall]
+    prompt_ids: list[int]
+    completion_ids: list[int]
+    prompt_length: int
+    completion_length: int
     finish_reason: str
-    completion_tokens: int
-    prompt_tokens: int
+
+    def to_dict(self):
+        return {
+            "text": self.text,
+            "content": self.content,
+            "reasoning": self.reasoning,
+            "tool_calls": [tool_call.to_dict() for tool_call in self.tool_calls],
+            "prompt_ids": self.prompt_ids,
+            "completion_ids": self.completion_ids,
+            "prompt_length": self.prompt_length,
+            "completion_length": self.completion_length,
+            "finish_reason": self.finish_reason,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(
+            text=data["text"],
+            content=data["content"],
+            reasoning=data["reasoning"],
+            tool_calls=[ToolCall(**tool_call) for tool_call in data["tool_calls"]],
+            prompt_ids=data["prompt_ids"],
+            completion_ids=data["completion_ids"],
+            prompt_length=data["prompt_length"],
+            completion_length=data["completion_length"],
+            finish_reason=data["finish_reason"],
+        )
 
 
 class RolloutEngine:
-    def __init__(self, model: str, tokenizer=None, **kwargs):
-        self.model = model
-        self.tokenizer = tokenizer
+    def __init__(self, *args, **kwargs):
+        pass
 
     async def get_model_response(self, messages: list[dict], **kwargs) -> ModelOutput:
         raise NotImplementedError("get_model_response is not implemented")