Draft
Changes from all commits
73 commits
3d57d2e
processed_entries_queue_popped_data
AlexPiche Jul 25, 2025
4fbc5c7
faster preprocess
AlexPiche Jul 25, 2025
91acbc4
more logging
AlexPiche Jul 25, 2025
fb5a0bd
better namming
AlexPiche Jul 25, 2025
8c78c45
clean up
AlexPiche Jul 25, 2025
1b90a4b
add groups_in_progress
AlexPiche Jul 26, 2025
3c8f338
raise when finetune is done
AlexPiche Jul 26, 2025
f88dceb
cte lr
AlexPiche Jul 27, 2025
812aafc
first mcp
AlexPiche Aug 15, 2025
ca8516b
fix the env server
AlexPiche Aug 16, 2025
f3af1bc
tweak prompt
AlexPiche Aug 16, 2025
5b10c33
upd
AlexPiche Aug 16, 2025
d2e6d09
clean up
AlexPiche Aug 18, 2025
228cb42
hard code dino
AlexPiche Aug 18, 2025
fdf3c83
less envs
AlexPiche Aug 18, 2025
1165397
less envs
AlexPiche Aug 18, 2025
40a144a
longer timeout
AlexPiche Aug 18, 2025
2d25d88
longer seq length
AlexPiche Aug 18, 2025
2036167
more envs
AlexPiche Aug 18, 2025
664b539
more llms per actor
AlexPiche Aug 18, 2025
4b0db03
even more envs
AlexPiche Aug 18, 2025
63d4092
longer timeout and revert prompt
AlexPiche Aug 18, 2025
6d81456
retry task
AlexPiche Aug 18, 2025
373b0ac
pid deno module
AlexPiche Aug 18, 2025
e2de768
diff deno tmp dir
AlexPiche Aug 18, 2025
763b594
none node modules
AlexPiche Aug 18, 2025
0783570
bigger timeout
AlexPiche Aug 19, 2025
b284fcb
diff temp dir for each mcp
AlexPiche Aug 19, 2025
eb48d90
0.0.0.0
AlexPiche Aug 19, 2025
efa2717
filter based on port
AlexPiche Aug 19, 2025
3d86a28
change port to 7778
AlexPiche Aug 21, 2025
96a75c1
mcp and verify server
AlexPiche Aug 21, 2025
0b4c992
use custom parser
AlexPiche Aug 21, 2025
471d28d
relative path
AlexPiche Aug 21, 2025
8e0eeff
test apth
AlexPiche Aug 21, 2025
f93d756
typo
AlexPiche Aug 21, 2025
32e3eb6
clean up
AlexPiche Aug 21, 2025
5a3ab0e
clean up
AlexPiche Aug 21, 2025
436e233
rename domain to mcp
AlexPiche Aug 22, 2025
366263b
more envs
AlexPiche Aug 22, 2025
371be6e
less env replicas
AlexPiche Aug 22, 2025
1045868
Merge remote-tracking branch 'origin/debug_miniwob' into mcp_tir
AlexPiche Aug 22, 2025
05f7667
Merge remote-tracking branch 'origin/debug_miniwob' into mcp_tir
AlexPiche Aug 22, 2025
46b39d1
clean up tmp
AlexPiche Aug 22, 2025
af63f51
change mcp dir
AlexPiche Aug 22, 2025
55a96e5
bigger model len
AlexPiche Aug 22, 2025
dd0ea2b
typo
AlexPiche Aug 22, 2025
dc4052d
typo
AlexPiche Aug 23, 2025
bb4d0c5
clean up
AlexPiche Aug 26, 2025
ccdcd32
center reward
AlexPiche Aug 26, 2025
7f5ed95
running avg reward
AlexPiche Aug 26, 2025
88a0ee7
start from real mean
AlexPiche Aug 26, 2025
66bcfbd
Fix paths
rafapi Aug 28, 2025
3fcb847
Use relative path
rafapi Aug 28, 2025
9f239c6
Fix path
rafapi Aug 28, 2025
020a021
revert mktemp changes
rafapi Aug 28, 2025
4323f57
Fix deno paths
rafapi Aug 29, 2025
2b5e9f5
udt
rafapi Aug 29, 2025
565d25c
make the cache tag stable across all processes
rafapi Aug 29, 2025
e39ff7b
remove running avg
AlexPiche Aug 29, 2025
fc17df7
fix
rafapi Aug 30, 2025
115f629
Merge branch 'clean_up_running_avg' into mcp_tir
rafapi Sep 2, 2025
f4d8e0d
Avoid hot-spotting env; add extra metrics
rafapi Sep 5, 2025
23decf7
Print correct policy info
rafapi Sep 5, 2025
29118b7
Add aime2025
rafapi Sep 5, 2025
8882859
Test on aime2025
rafapi Sep 5, 2025
ea2d393
kl new old
AlexPiche Sep 22, 2025
eb7eb0d
loo
AlexPiche Sep 25, 2025
1247360
Add new metrics
rafapi Sep 26, 2025
8cb5ef3
Merge remote-tracking branch 'origin/new_metrics' into mcp_tir
rafapi Sep 26, 2025
61c91c7
Embedded envs
rafapi Sep 30, 2025
bd46a7d
Remove imports
rafapi Sep 30, 2025
550cb63
Increase shared_memory_entry_size
rafapi Oct 2, 2025
3 changes: 2 additions & 1 deletion .gitignore
@@ -120,6 +120,7 @@ celerybeat.pid

# SageMath parsed files
*.sage.py
node_modules/

# Environments
.env
@@ -185,4 +186,4 @@ results
results/
data/
cache/
dump.rdb
dump.rdb
8 changes: 5 additions & 3 deletions conf/base.yaml
@@ -23,9 +23,9 @@ preprocess:
input: actor
output: training_data
n_workers: 8
chunk_n_groups: 2
chunk_n_groups: 8
# queue for loaded raw groups
raw_queue_size: 8
raw_queue_size: 128
# queue for processed chunks of multiple groups
input_queue_size: 32
# queue for ready chunks for multiple groups
@@ -67,7 +67,7 @@ vllm_config:
tensor-parallel-size: 1
pipeline-parallel-size: 1
generation-config: vllm
max_model_len: 10000
max_model_len: 16000

world:
replicas: 1
@@ -81,6 +81,8 @@ world:

actor_group_port: 9000
environment_start_port: 7777
# Remote vs embedded environment execution strategy
environment_mode: remote
# this will be autocreated based on the config
jobs: []

2 changes: 1 addition & 1 deletion conf/finetune/base.yaml
@@ -36,7 +36,7 @@ learning_rate: 1e-6
# How much to clip the gradient (no clipping if null)
gradient_clipping_threshold: 0.3
# Learning rate scheduler type (indexed by completed_steps).
lr_scheduler_type: cosine # could be cosine, constant_with_warmup
lr_scheduler_type: constant # could be cosine, constant_with_warmup
# Number of warmup (completed) steps in the learning rate schedule.
num_warmup_steps: 50
# Number of gradient accumulation steps.
164 changes: 164 additions & 0 deletions conf/mcp.yaml
@@ -0,0 +1,164 @@
defaults:
- base
- override finetune: grpo
- _self_

llm:
parameters:
max_tokens: 8192

test_llm:
parameters:
max_tokens: 8192

actor:
rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout
system_prompt: Please reason step by step, and put your final answer within \boxed{{}}.
llm_max_rollouts: 64
task_template: |-
{task}
shared_memory_entry_size: 200000000

preprocess:
shared_memory_entry_size: 2000000000

finetune:
seq_length: 128000
seq_parallel: 8

dataset_loader: pipelinerl.domains.math.load_datasets
train_dataset_names:
- open_reasoner_zero_57k
- open_reasoner_zero_extended_72k
test_dataset_names:
- aime_2025

vllm_config:
use_v1: false
vllm_kwargs:
enable-auto-tool-choice: ""
tool-call-parser: rl_tool
tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py
max-num-seqs: ${actor.llm_max_rollouts}
max-num-batched-tokens: 4096
max_model_len: 128000
gpu-memory-utilization: 0.85

environment:
_target_: pipelinerl.domains.mcp.env_server.EmbeddedMCPEnvironment
config_path: ${hydra:runtime.cwd}/conf/mcp/python.json
tools_whitelist:
- run_python_code
read_timeout_seconds: 600
use_cache: false
runtime_pool_workers: 4
offload_tools:
- run_python_code


world:
env_replicas_per_actor: 8
environment_mode: embedded

agent_max_loops: 3
agent:
_target_: tapeagents.agent.Agent
name : mcp_agent
max_iterations: 3
store_llm_calls: true
templates:
system_prompt: |
You are a math-focused AI Agent. Solve problems by combining clear symbolic reasoning
with short, deterministic Python code.
Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
Always present the final answer in LaTeX \boxed{{}}.
Do not express emotions or opinions about user questions.

Workflow:
1. Draft a brief plan in plain text.
2. Execute one run_python_code call to compute or verify the result.
3. Finalize by calling MathAnswer with the LaTeX-formatted answer.

Python execution policy (run_python_code):
- Use Python strictly for pure computation to verify and validate the final answer.
- No network, file system, OS or environment access.
- Keep snippets minimal and self-contained; avoid large outputs and long-running loops; print only the final result.

Validation:
- Cross-check results (alternative derivation, invariants, higher precision) before finalizing.
- If execution fails, propose the minimal fix and retry.
Keep replies direct and avoid unnecessary text.
allowed_tools: |
You can call the following tools:
{tools_description}
- run_python_code: deterministic math code; print only the final value.
- MathAnswer: return the LaTeX \boxed{{}} answer when the solution is verified.
Always verify with run_python_code before invoking MathAnswer.
thought_format: |
Important! Respond with the plain text, do not include any JSON or code.
Do not output anything besides what I asked in this message.
allowed_steps: |
Workflow summary:
- Plan briefly in plain text.
- Call run_python_code exactly once per loop to compute/verify.
- Finish with a single MathAnswer tool call carrying the \boxed{{}} result.
format: |
For finalization, reply with a single short sentence that ends in the \boxed{{}} answer,
immediately followed by the MathAnswer function call containing the same \boxed{{}} value.
Never emit unrelated JSON wrappers or duplicate the final thought.


nodes:
- _target_: tapeagents.nodes.StandardNode
name: plan
system_prompt: ${agent.templates.system_prompt}
guidance: |
Produce a concise math plan (formulas/checks). You will ALWAYS verify by executing Python code.
${agent.templates.thought_format}
steps_prompt: ${agent.templates.allowed_tools}
trim_obs_except_last_n: 2

- _target_: tapeagents.nodes.StandardNode
name: code
system_prompt: ${agent.templates.system_prompt}
guidance: |
ALWAYS call run_python_code once to compute/verify the result.
Use exact, deterministic code; print only the final scalar or tuple.
If code fails, fix minimally and call run_python_code again after reviewing the error.
use_known_actions: true
use_function_calls: true
trim_obs_except_last_n: 2

- _target_: tapeagents.nodes.StandardNode
name: finalize
system_prompt: ${agent.templates.system_prompt}
guidance: |
Read the last Python stdout value. First, state the answer in one short sentence that ends with LaTeX \boxed{{}}.
Immediately after that sentence, call the MathAnswer tool exactly once with:
name: MathAnswer
arguments: {"answer": "<final answer in LaTeX \\boxed{}>"}
Do not add any extra text around the tool call. Once the sentence is emitted, return only the MathAnswer function call.
steps:
- pipelinerl.domains.mcp.steps.MathAnswer
use_known_actions: true
use_function_calls: true
trim_obs_except_last_n: 2
next_node: code

# model_path: Qwen/Qwen3-8B
model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft

# Local reward shaping for tool usage
python_tool_shaping:
bonus_on_correct_with_python: 0.2
penalty_on_incorrect_without_python: 0.1
max_abs: 0.2

# Encourage concise outputs (penalize long completions)
length_shaping:
target_ratio: 0.1 # 10% of max_tokens; auto scales with max_tokens
min_target_tokens: 256 # lower clamp
max_target_tokens: 2048 # upper clamp
slope: 0.001 # penalty per token beyond target
max_penalty: 0.2 # clamp absolute penalty
bonus_on_short_correct: 0.05 # bonus if correct and concise
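
The two shaping blocks above only declare coefficients; the reward arithmetic itself lives in the MCP rollout code, which this diff does not show. Below is a minimal sketch of how these knobs could combine, assuming per-rollout `correct` and `used_python` flags plus a completion token count — the function name and exact formula are assumptions, not the repository's implementation:

```python
def shaped_reward(base_reward: float, correct: bool, used_python: bool,
                  completion_tokens: int, max_tokens: int,
                  python_cfg: dict, length_cfg: dict) -> float:
    """Hypothetical combination of python_tool_shaping and length_shaping."""
    # Tool-usage shaping: bonus for correct answers verified with Python,
    # penalty for incorrect answers that never called it; clamp to max_abs.
    tool_term = 0.0
    if correct and used_python:
        tool_term += python_cfg["bonus_on_correct_with_python"]
    if not correct and not used_python:
        tool_term -= python_cfg["penalty_on_incorrect_without_python"]
    tool_term = max(-python_cfg["max_abs"], min(python_cfg["max_abs"], tool_term))

    # Length shaping: the target is a ratio of max_tokens, clamped to
    # [min_target_tokens, max_target_tokens]; each token past it costs `slope`.
    target = min(max(length_cfg["target_ratio"] * max_tokens,
                     length_cfg["min_target_tokens"]),
                 length_cfg["max_target_tokens"])
    overflow = max(0, completion_tokens - target)
    length_term = -min(length_cfg["slope"] * overflow, length_cfg["max_penalty"])
    if correct and overflow == 0:
        length_term += length_cfg["bonus_on_short_correct"]

    return base_reward + tool_term + length_term
```

Under these assumptions, with max_tokens: 8192 and target_ratio: 0.1 the target is about 819 tokens (inside the 256–2048 clamp); a completion 100 tokens over it would lose 0.1, and the penalty saturates at max_penalty: 0.2.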
11 changes: 11 additions & 0 deletions conf/mcp/python.json
@@ -0,0 +1,11 @@
{
"mcpServers": {
"python_exec": {
"command": "bash",
"args": [
"-c",
"JOB_TAG=${MCP_JOB_TAG:-${JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC"
]
}
}
}
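
The bash wrapper above gives each MCP server process its own Deno cache and temp directory (keyed by MCP_JOB_TAG, JOB_ID, or HOSTNAME), pre-warms the jsr:@pydantic/mcp-run-python cache, and removes its scratch directory on exit. A hedged sketch of talking to this server over stdio with the MCP Python SDK — using the SDK client here, and "python_code" as the tool's argument name, are assumptions; pipelinerl's EmbeddedMCPEnvironment may wire this differently:

```python
import asyncio
import json

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # Launch the server exactly as declared in conf/mcp/python.json.
    cfg = json.load(open("conf/mcp/python.json"))["mcpServers"]["python_exec"]
    params = StdioServerParameters(command=cfg["command"], args=cfg["args"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print([tool.name for tool in tools.tools])  # expect run_python_code among them
            result = await session.call_tool("run_python_code", {"python_code": "print(2 + 2)"})
            print(result.content)


asyncio.run(main())
```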
28 changes: 27 additions & 1 deletion pipelinerl/actor.py
@@ -135,7 +135,7 @@ async def schedule_rollouts(
# Track rollouts per problem group
group_rollouts = {}
rollout_policy = hydra.utils.get_method(cfg.actor.rollout_policy)
logger.info(f"Use rollout policy: {rollout_policy}")
logger.info(f"Use rollout policy: {rollout_policy.__name__}")

async def rollout_and_maybe_produce_result(
problem: dict,
@@ -349,6 +349,8 @@ def update_stats(self, rollout_results: List[RolloutResult]):
self.model_versions_list.append(result.model_version)
domain_agnostic_metrics = self.compute_domain_agnostic_metrics(result)
all_metrics = result.metrics.model_dump() | domain_agnostic_metrics
all_metrics["used_python"] = int(all_metrics.get("used_python", False))
all_metrics["used_math_answer"] = int(all_metrics.get("used_math_answer", False))
for k, v in all_metrics.items():
if isinstance(v, list):
self.stats[k][dataset_name][group_id] += v
@@ -502,6 +504,7 @@ def run(self, dataset: list[tuple[str, dict]]):
"finished_groups": finished_groups,
"trainer_model_version": trainer_version_to_publish,
"time_since_start": time.time() - loop_start_time,
"groups_in_progress": in_progress,
}
trainer_version_to_publish = None
else:
@@ -549,6 +552,21 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict):
stats |= loop_stats
for k, v in self.sliding_stats.items():
stats[k] = sum(v) / len(v) if v else 0

rename_suffixes = {
"num_python_calls_mean": "python_calls_mean",
"used_python_mean": "python_usage_rate",
"num_math_answer_calls_mean": "math_answer_calls_mean",
"used_math_answer_mean": "math_answer_usage_rate",
}

for key in list(stats.keys()):
for old_suffix, new_suffix in rename_suffixes.items():
if key.endswith(old_suffix):
prefix = key[: -len(old_suffix)]
stats[f"{prefix}{new_suffix}"] = stats[key]
break

if self.cfg.wandb.use_wandb:
wandb.log({f"actor/{k}": v for k, v in stats.items()})
stats_writer.write(stats)
@@ -592,11 +610,18 @@ def run_actor_loop(cfg: DictConfig):
else:
actor_model_path = cfg.model_path

# Align client-side context size with vLLM server max_model_len when available
try:
_context_size = int(cfg.vllm_config.vllm_kwargs.max_model_len)
except Exception:
_context_size = 32000

train_llms = [
TrainableLLM(
base_url=url,
model_name=str(actor_model_path),
tokenizer_name=str(actor_model_path),
context_size=_context_size,
parameters=cfg.llm.parameters,
use_cache=False,
collect_logprobs=True,
@@ -609,6 +634,7 @@
base_url=url,
model_name=str(actor_model_path),
tokenizer_name=str(actor_model_path),
context_size=_context_size,
parameters=cfg.test_llm.parameters,
use_cache=False,
collect_logprobs=True,
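For clarity, the suffix renaming in publish_stats adds alias keys alongside the originals rather than replacing them, so downstream dashboards can adopt the friendlier names without breaking existing ones. A small illustration with a hypothetical per-dataset key:

```python
stats = {"aime_2025/used_python_mean": 0.87}
rename_suffixes = {"used_python_mean": "python_usage_rate"}

for key in list(stats.keys()):
    for old_suffix, new_suffix in rename_suffixes.items():
        if key.endswith(old_suffix):
            # Copy the value under the new suffix; the original key is kept.
            stats[f"{key[: -len(old_suffix)]}{new_suffix}"] = stats[key]
            break

# stats == {"aime_2025/used_python_mean": 0.87, "aime_2025/python_usage_rate": 0.87}
```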
2 changes: 1 addition & 1 deletion pipelinerl/domains/math/__init__.py
@@ -1,3 +1,3 @@
from .load_datasets import load_datasets
from .rollouts import generate_math_rollout, RewardTable
from .rollouts import generate_math_rollout, RewardTable, get_reward
from .verifier_api import MathEnvironment, verify_answer, verify_answer_rpc
26 changes: 26 additions & 0 deletions pipelinerl/domains/math/load_datasets.py
@@ -170,6 +170,26 @@ def _load_aime_dataset(year: int, upsample_factor: int = 0) -> list[dict]:
return add_ids(samples)


def _load_aime_2025_opencompass(upsample_factor: int = 0) -> list[dict]:
configs = ["AIME2025-I", "AIME2025-II"]
dataset_name = "aime_2025" + ("" if upsample_factor > 0 else "_original")

samples: list[dict] = []
for config_name in configs:
ds = load_dataset("opencompass/AIME2025", config_name, split="test")
samples.extend([s for s in process_math(ds, dataset_name) if s is not None])

original_size = len(samples)
if upsample_factor > 0:
samples *= upsample_factor

logger.info(
f"Loading aime 2025 (OpenCompass) dataset: {len(samples)} samples"
+ (f" (upsampled from {original_size})" if upsample_factor > 0 else "")
)
return add_ids(samples)


def _load_amc_dataset(year: int, upsample_factor: int = 0) -> list[dict]:
amc_dataset = load_dataset("AI-MO/aimo-validation-amc", split="train", trust_remote_code=True)
amc_dataset = amc_dataset.filter(lambda x: str(year) in x["url"])
@@ -335,6 +355,12 @@ def load_datasets(dataset_names: List[str] | str | None, seed: int | None = None
if "aime_2024_original" in dataset_names:
datasets += _load_aime_dataset(2024)

if "aime_2025" in dataset_names:
datasets += _load_aime_2025_opencompass(upsample_factor=16)

if "aime_2025_original" in dataset_names:
datasets += _load_aime_2025_opencompass()

if "amc_2022" in dataset_names:
# TODO: AMC 2022 is 43 problems, is that to be expected?
datasets += _load_amc_dataset(2022, upsample_factor=16)
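
Assuming the two OpenCompass configs together hold the usual 30 AIME problems, a hedged usage example of the new dataset hooks ("aime_2025" upsamples by 16, "aime_2025_original" does not):

```python
from pipelinerl.domains.math import load_datasets

train_eval = load_datasets(["aime_2025"])           # each problem repeated 16x
held_out = load_datasets(["aime_2025_original"])    # one copy of each problem
print(len(train_eval), len(held_out))               # e.g. 480 and 30 if the split has 30 problems
```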