Draft
Changes from all commits
73 commits
3d57d2e
processed_entries_queue_popped_data
AlexPiche Jul 25, 2025
4fbc5c7
faster preprocess
AlexPiche Jul 25, 2025
91acbc4
more logging
AlexPiche Jul 25, 2025
fb5a0bd
better namming
AlexPiche Jul 25, 2025
8c78c45
clean up
AlexPiche Jul 25, 2025
1b90a4b
add groups_in_progress
AlexPiche Jul 26, 2025
3c8f338
raise when finetune is done
AlexPiche Jul 26, 2025
f88dceb
cte lr
AlexPiche Jul 27, 2025
812aafc
first mcp
AlexPiche Aug 15, 2025
ca8516b
fix the env server
AlexPiche Aug 16, 2025
f3af1bc
tweak prompt
AlexPiche Aug 16, 2025
5b10c33
upd
AlexPiche Aug 16, 2025
d2e6d09
clean up
AlexPiche Aug 18, 2025
228cb42
hard code dino
AlexPiche Aug 18, 2025
fdf3c83
less envs
AlexPiche Aug 18, 2025
1165397
less envs
AlexPiche Aug 18, 2025
40a144a
longer timeout
AlexPiche Aug 18, 2025
2d25d88
longer seq length
AlexPiche Aug 18, 2025
2036167
more envs
AlexPiche Aug 18, 2025
664b539
more llms per actor
AlexPiche Aug 18, 2025
4b0db03
even more envs
AlexPiche Aug 18, 2025
63d4092
longer timeout and revert prompt
AlexPiche Aug 18, 2025
6d81456
retry task
AlexPiche Aug 18, 2025
373b0ac
pid deno module
AlexPiche Aug 18, 2025
e2de768
diff deno tmp dir
AlexPiche Aug 18, 2025
763b594
none node modules
AlexPiche Aug 18, 2025
0783570
bigger timeout
AlexPiche Aug 19, 2025
b284fcb
diff temp dir for each mcp
AlexPiche Aug 19, 2025
eb48d90
0.0.0.0
AlexPiche Aug 19, 2025
efa2717
filter based on port
AlexPiche Aug 19, 2025
3d86a28
change port to 7778
AlexPiche Aug 21, 2025
96a75c1
mcp and verify server
AlexPiche Aug 21, 2025
0b4c992
use custom parser
AlexPiche Aug 21, 2025
471d28d
relative path
AlexPiche Aug 21, 2025
8e0eeff
test apth
AlexPiche Aug 21, 2025
f93d756
typo
AlexPiche Aug 21, 2025
32e3eb6
clean up
AlexPiche Aug 21, 2025
5a3ab0e
clean up
AlexPiche Aug 21, 2025
436e233
rename domain to mcp
AlexPiche Aug 22, 2025
366263b
more envs
AlexPiche Aug 22, 2025
371be6e
less env replicas
AlexPiche Aug 22, 2025
1045868
Merge remote-tracking branch 'origin/debug_miniwob' into mcp_tir
AlexPiche Aug 22, 2025
05f7667
Merge remote-tracking branch 'origin/debug_miniwob' into mcp_tir
AlexPiche Aug 22, 2025
46b39d1
clean up tmp
AlexPiche Aug 22, 2025
af63f51
change mcp dir
AlexPiche Aug 22, 2025
55a96e5
bigger model len
AlexPiche Aug 22, 2025
dd0ea2b
typo
AlexPiche Aug 22, 2025
dc4052d
typo
AlexPiche Aug 23, 2025
bb4d0c5
clean up
AlexPiche Aug 26, 2025
ccdcd32
center reward
AlexPiche Aug 26, 2025
7f5ed95
running avg reward
AlexPiche Aug 26, 2025
88a0ee7
start from real mean
AlexPiche Aug 26, 2025
66bcfbd
Fix paths
rafapi Aug 28, 2025
3fcb847
Use relative path
rafapi Aug 28, 2025
9f239c6
Fix path
rafapi Aug 28, 2025
020a021
revert mktemp changes
rafapi Aug 28, 2025
4323f57
Fix deno paths
rafapi Aug 29, 2025
2b5e9f5
udt
rafapi Aug 29, 2025
565d25c
make the cache tag stable across all processes
rafapi Aug 29, 2025
e39ff7b
remove running avg
AlexPiche Aug 29, 2025
fc17df7
fix
rafapi Aug 30, 2025
115f629
Merge branch 'clean_up_running_avg' into mcp_tir
rafapi Sep 2, 2025
f4d8e0d
Avoid hot-spotting env; add extra metrics
rafapi Sep 5, 2025
23decf7
Print correct policy info
rafapi Sep 5, 2025
29118b7
Add aime2025
rafapi Sep 5, 2025
8882859
Test on aime2025
rafapi Sep 5, 2025
ea2d393
kl new old
AlexPiche Sep 22, 2025
eb7eb0d
loo
AlexPiche Sep 25, 2025
1247360
Add new metrics
rafapi Sep 26, 2025
8cb5ef3
Merge remote-tracking branch 'origin/new_metrics' into mcp_tir
rafapi Sep 26, 2025
61c91c7
Embedded envs
rafapi Sep 30, 2025
bd46a7d
Remove imports
rafapi Sep 30, 2025
550cb63
Increase shared_memory_entry_size
rafapi Oct 2, 2025
3 changes: 2 additions & 1 deletion .gitignore
@@ -120,6 +120,7 @@ celerybeat.pid

# SageMath parsed files
*.sage.py
node_modules/

# Environments
.env
@@ -185,4 +186,4 @@ results
results/
data/
cache/
dump.rdb
dump.rdb
8 changes: 5 additions & 3 deletions conf/base.yaml
@@ -23,9 +23,9 @@ preprocess:
input: actor
output: training_data
n_workers: 8
chunk_n_groups: 2
chunk_n_groups: 8
# queue for loaded raw groups
raw_queue_size: 8
raw_queue_size: 128
# queue for processed chunks of multiple groups
input_queue_size: 32
# queue for ready chunks for multiple groups
@@ -67,7 +67,7 @@ vllm_config:
tensor-parallel-size: 1
pipeline-parallel-size: 1
generation-config: vllm
max_model_len: 10000
max_model_len: 16000

world:
replicas: 1
@@ -81,6 +81,8 @@ world:

actor_group_port: 9000
environment_start_port: 7777
# Remote vs embedded environment execution strategy
environment_mode: remote
# this will be autocreated based on the config
jobs: []

2 changes: 1 addition & 1 deletion conf/finetune/base.yaml
@@ -36,7 +36,7 @@ learning_rate: 1e-6
# How much to clip the gradient (no clipping if null)
gradient_clipping_threshold: 0.3
# Learning rate scheduler type (indexed by completed_steps).
lr_scheduler_type: cosine # could be cosine, constant_with_warmup
lr_scheduler_type: constant # could be cosine, constant_with_warmup
# Number of warmup (completed) steps in the learning rate schedule.
num_warmup_steps: 50
# Number of gradient accumulation steps.
164 changes: 164 additions & 0 deletions conf/mcp.yaml
@@ -0,0 +1,164 @@
defaults:
- base
- override finetune: grpo
- _self_

llm:
parameters:
max_tokens: 8192

test_llm:
parameters:
max_tokens: 8192

actor:
rollout_policy: pipelinerl.domains.mcp.generate_mcp_rollout
system_prompt: Please reason step by step, and put your final answer within \boxed{{}}.
llm_max_rollouts: 64
task_template: |-
{task}
shared_memory_entry_size: 200000000

preprocess:
shared_memory_entry_size: 2000000000

finetune:
seq_length: 128000
seq_parallel: 8

dataset_loader: pipelinerl.domains.math.load_datasets
train_dataset_names:
- open_reasoner_zero_57k
- open_reasoner_zero_extended_72k
test_dataset_names:
- aime_2025

vllm_config:
use_v1: false
vllm_kwargs:
enable-auto-tool-choice: ""
tool-call-parser: rl_tool
tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py
max-num-seqs: ${actor.llm_max_rollouts}
max-num-batched-tokens: 4096
max_model_len: 128000
gpu-memory-utilization: 0.85

environment:
_target_: pipelinerl.domains.mcp.env_server.EmbeddedMCPEnvironment
config_path: ${hydra:runtime.cwd}/conf/mcp/python.json
tools_whitelist:
- run_python_code
read_timeout_seconds: 600
use_cache: false
runtime_pool_workers: 4
offload_tools:
- run_python_code


world:
env_replicas_per_actor: 8
environment_mode: embedded

agent_max_loops: 3
agent:
_target_: tapeagents.agent.Agent
name : mcp_agent
max_iterations: 3
store_llm_calls: true
templates:
system_prompt: |
You are a math-focused AI Agent. Solve problems by combining clear symbolic reasoning
with short, deterministic Python code.
Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
Always present the final answer in LaTeX \boxed{{}}.
Do not express emotions or opinions about user questions.

Workflow:
1. Draft a brief plan in plain text.
2. Execute one run_python_code call to compute or verify the result.
3. Finalize by calling MathAnswer with the LaTeX-formatted answer.

Python execution policy (run_python_code):
- Use Python strictly for pure computation to verify and validate the final answer.
- No network, file system, OS or environment access.
- Keep snippets minimal and self-contained; avoid large outputs and long-running loops; print only the final result.

Validation:
- Cross-check results (alternative derivation, invariants, higher precision) before finalizing.
- If execution fails, propose the minimal fix and retry.
Keep replies direct and avoid unnecessary text.
allowed_tools: |
You can call the following tools:
{tools_description}
- run_python_code: deterministic math code; print only the final value.
- MathAnswer: return the LaTeX \boxed{{}} answer when the solution is verified.
Always verify with run_python_code before invoking MathAnswer.
thought_format: |
Important! Respond with the plain text, do not include any JSON or code.
Do not output anything besides what I asked in this message.
allowed_steps: |
Workflow summary:
- Plan briefly in plain text.
- Call run_python_code exactly once per loop to compute/verify.
- Finish with a single MathAnswer tool call carrying the \boxed{{}} result.
format: |
For finalization, reply with a single short sentence that ends in the \boxed{{}} answer,
immediately followed by the MathAnswer function call containing the same \boxed{{}} value.
Never emit unrelated JSON wrappers or duplicate the final thought.


nodes:
- _target_: tapeagents.nodes.StandardNode
name: plan
system_prompt: ${agent.templates.system_prompt}
guidance: |
Produce a concise math plan (formulas/checks). You will ALWAYS verify by executing Python code.
${agent.templates.thought_format}
steps_prompt: ${agent.templates.allowed_tools}
trim_obs_except_last_n: 2

- _target_: tapeagents.nodes.StandardNode
name: code
system_prompt: ${agent.templates.system_prompt}
guidance: |
ALWAYS call run_python_code once to compute/verify the result.
Use exact, deterministic code; print only the final scalar or tuple.
If code fails, fix minimally and call run_python_code again after reviewing the error.
use_known_actions: true
use_function_calls: true
trim_obs_except_last_n: 2

- _target_: tapeagents.nodes.StandardNode
name: finalize
system_prompt: ${agent.templates.system_prompt}
guidance: |
Read the last Python stdout value. First, state the answer in one short sentence that ends with LaTeX \boxed{{}}.
Immediately after that sentence, call the MathAnswer tool exactly once with:
name: MathAnswer
arguments: {"answer": "<final answer in LaTeX \\boxed{}>"}
Do not add any extra text around the tool call. Once the sentence is emitted, return only the MathAnswer function call.
steps:
- pipelinerl.domains.mcp.steps.MathAnswer
use_known_actions: true
use_function_calls: true
trim_obs_except_last_n: 2
next_node: code

# model_path: Qwen/Qwen3-8B
model_path: /mnt/llmd/base_models/ServiceNow-AI/7_9_25_14b_text_reasoning_sft

# Local reward shaping for tool usage
python_tool_shaping:
bonus_on_correct_with_python: 0.2
penalty_on_incorrect_without_python: 0.1
max_abs: 0.2

# Encourage concise outputs (penalize long completions)
length_shaping:
target_ratio: 0.1 # 10% of max_tokens; auto scales with max_tokens
min_target_tokens: 256 # lower clamp
max_target_tokens: 2048 # upper clamp
slope: 0.001 # penalty per token beyond target
max_penalty: 0.2 # clamp absolute penalty
bonus_on_short_correct: 0.05 # bonus if correct and concise
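
The two shaping blocks above only declare coefficients; the reward arithmetic itself lives in the MCP rollout code, which this diff does not show. Below is a minimal sketch of how these knobs could combine, assuming per-rollout `correct` and `used_python` flags plus a completion token count — the function name and exact formula are assumptions, not the repository's implementation:

```python
def shaped_reward(base_reward: float, correct: bool, used_python: bool,
                  completion_tokens: int, max_tokens: int,
                  python_cfg: dict, length_cfg: dict) -> float:
    """Hypothetical combination of python_tool_shaping and length_shaping."""
    # Tool-usage shaping: bonus for correct answers verified with Python,
    # penalty for incorrect answers that never called it; clamp to max_abs.
    tool_term = 0.0
    if correct and used_python:
        tool_term += python_cfg["bonus_on_correct_with_python"]
    if not correct and not used_python:
        tool_term -= python_cfg["penalty_on_incorrect_without_python"]
    tool_term = max(-python_cfg["max_abs"], min(python_cfg["max_abs"], tool_term))

    # Length shaping: the target is a ratio of max_tokens, clamped to
    # [min_target_tokens, max_target_tokens]; each token past it costs `slope`.
    target = min(max(length_cfg["target_ratio"] * max_tokens,
                     length_cfg["min_target_tokens"]),
                 length_cfg["max_target_tokens"])
    overflow = max(0, completion_tokens - target)
    length_term = -min(length_cfg["slope"] * overflow, length_cfg["max_penalty"])
    if correct and overflow == 0:
        length_term += length_cfg["bonus_on_short_correct"]

    return base_reward + tool_term + length_term
```

Under these assumptions, with max_tokens: 8192 and target_ratio: 0.1 the target is about 819 tokens (inside the 256–2048 clamp); a completion 100 tokens over it would lose 0.1, and the penalty saturates at max_penalty: 0.2.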
11 changes: 11 additions & 0 deletions conf/mcp/python.json
@@ -0,0 +1,11 @@
{
"mcpServers": {
"python_exec": {
"command": "bash",
"args": [
"-c",
"JOB_TAG=${MCP_JOB_TAG:-${JOB_ID:-$HOSTNAME}} && BASE=/home/toolkit/.cache && mkdir -p \"$BASE/mcp_tmp/$JOB_TAG\" \"$BASE/deno_mcp/$JOB_TAG\" \"$BASE/tmp/$JOB_TAG\" && export DENO_DIR=\"$BASE/deno_mcp/$JOB_TAG\" TMPDIR=\"$BASE/tmp/$JOB_TAG\" && /home/toolkit/.deno/bin/deno cache jsr:@pydantic/mcp-run-python >/dev/null 2>&1 || true; DIR=$(mktemp -d -p \"$BASE/mcp_tmp/$JOB_TAG\" mcp_XXXXXXXX) && cd \"$DIR\" && /home/toolkit/.deno/bin/deno run -N -R=node_modules -W=node_modules --node-modules-dir=auto jsr:@pydantic/mcp-run-python stdio; EC=$?; cd /; rm -rf \"$DIR\"; exit $EC"
]
}
}
}
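
The bash wrapper above gives each MCP server process its own Deno cache and temp directory (keyed by MCP_JOB_TAG, JOB_ID, or HOSTNAME), pre-warms the jsr:@pydantic/mcp-run-python cache, and removes its scratch directory on exit. A hedged sketch of talking to this server over stdio with the MCP Python SDK — using the SDK client here, and "python_code" as the tool's argument name, are assumptions; pipelinerl's EmbeddedMCPEnvironment may wire this differently:

```python
import asyncio
import json

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # Launch the server exactly as declared in conf/mcp/python.json.
    cfg = json.load(open("conf/mcp/python.json"))["mcpServers"]["python_exec"]
    params = StdioServerParameters(command=cfg["command"], args=cfg["args"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print([tool.name for tool in tools.tools])  # expect run_python_code among them
            result = await session.call_tool("run_python_code", {"python_code": "print(2 + 2)"})
            print(result.content)


asyncio.run(main())
```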
28 changes: 27 additions & 1 deletion pipelinerl/actor.py
@@ -135,7 +135,7 @@ async def schedule_rollouts(
# Track rollouts per problem group
group_rollouts = {}
rollout_policy = hydra.utils.get_method(cfg.actor.rollout_policy)
logger.info(f"Use rollout policy: {rollout_policy}")
logger.info(f"Use rollout policy: {rollout_policy.__name__}")

async def rollout_and_maybe_produce_result(
problem: dict,
@@ -349,6 +349,8 @@ def update_stats(self, rollout_results: List[RolloutResult]):
self.model_versions_list.append(result.model_version)
domain_agnostic_metrics = self.compute_domain_agnostic_metrics(result)
all_metrics = result.metrics.model_dump() | domain_agnostic_metrics
all_metrics["used_python"] = int(all_metrics.get("used_python", False))
all_metrics["used_math_answer"] = int(all_metrics.get("used_math_answer", False))
for k, v in all_metrics.items():
if isinstance(v, list):
self.stats[k][dataset_name][group_id] += v
@@ -502,6 +504,7 @@ def run(self, dataset: list[tuple[str, dict]]):
"finished_groups": finished_groups,
"trainer_model_version": trainer_version_to_publish,
"time_since_start": time.time() - loop_start_time,
"groups_in_progress": in_progress,
}
trainer_version_to_publish = None
else:
@@ -549,6 +552,21 @@ def publish_stats(self, stats_writer: StreamWriter, loop_stats: Dict):
stats |= loop_stats
for k, v in self.sliding_stats.items():
stats[k] = sum(v) / len(v) if v else 0

rename_suffixes = {
"num_python_calls_mean": "python_calls_mean",
"used_python_mean": "python_usage_rate",
"num_math_answer_calls_mean": "math_answer_calls_mean",
"used_math_answer_mean": "math_answer_usage_rate",
}

for key in list(stats.keys()):
for old_suffix, new_suffix in rename_suffixes.items():
if key.endswith(old_suffix):
prefix = key[: -len(old_suffix)]
stats[f"{prefix}{new_suffix}"] = stats[key]
break

if self.cfg.wandb.use_wandb:
wandb.log({f"actor/{k}": v for k, v in stats.items()})
stats_writer.write(stats)
@@ -592,11 +610,18 @@ def run_actor_loop(cfg: DictConfig):
else:
actor_model_path = cfg.model_path

# Align client-side context size with vLLM server max_model_len when available
try:
_context_size = int(cfg.vllm_config.vllm_kwargs.max_model_len)
except Exception:
_context_size = 32000

train_llms = [
TrainableLLM(
base_url=url,
model_name=str(actor_model_path),
tokenizer_name=str(actor_model_path),
context_size=_context_size,
parameters=cfg.llm.parameters,
use_cache=False,
collect_logprobs=True,
@@ -609,6 +634,7 @@
base_url=url,
model_name=str(actor_model_path),
tokenizer_name=str(actor_model_path),
context_size=_context_size,
parameters=cfg.test_llm.parameters,
use_cache=False,
collect_logprobs=True,
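For clarity, the suffix renaming in publish_stats adds alias keys alongside the originals rather than replacing them, so downstream dashboards can adopt the friendlier names without breaking existing ones. A small illustration with a hypothetical per-dataset key:

```python
stats = {"aime_2025/used_python_mean": 0.87}
rename_suffixes = {"used_python_mean": "python_usage_rate"}

for key in list(stats.keys()):
    for old_suffix, new_suffix in rename_suffixes.items():
        if key.endswith(old_suffix):
            # Copy the value under the new suffix; the original key is kept.
            stats[f"{key[: -len(old_suffix)]}{new_suffix}"] = stats[key]
            break

# stats == {"aime_2025/used_python_mean": 0.87, "aime_2025/python_usage_rate": 0.87}
```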
2 changes: 1 addition & 1 deletion pipelinerl/domains/math/__init__.py
@@ -1,3 +1,3 @@
from .load_datasets import load_datasets
from .rollouts import generate_math_rollout, RewardTable
from .rollouts import generate_math_rollout, RewardTable, get_reward
from .verifier_api import MathEnvironment, verify_answer, verify_answer_rpc
26 changes: 26 additions & 0 deletions pipelinerl/domains/math/load_datasets.py
@@ -170,6 +170,26 @@ def _load_aime_dataset(year: int, upsample_factor: int = 0) -> list[dict]:
return add_ids(samples)


def _load_aime_2025_opencompass(upsample_factor: int = 0) -> list[dict]:
configs = ["AIME2025-I", "AIME2025-II"]
dataset_name = "aime_2025" + ("" if upsample_factor > 0 else "_original")

samples: list[dict] = []
for config_name in configs:
ds = load_dataset("opencompass/AIME2025", config_name, split="test")
samples.extend([s for s in process_math(ds, dataset_name) if s is not None])

original_size = len(samples)
if upsample_factor > 0:
samples *= upsample_factor

logger.info(
f"Loading aime 2025 (OpenCompass) dataset: {len(samples)} samples"
+ (f" (upsampled from {original_size})" if upsample_factor > 0 else "")
)
return add_ids(samples)


def _load_amc_dataset(year: int, upsample_factor: int = 0) -> list[dict]:
amc_dataset = load_dataset("AI-MO/aimo-validation-amc", split="train", trust_remote_code=True)
amc_dataset = amc_dataset.filter(lambda x: str(year) in x["url"])
@@ -335,6 +355,12 @@ def load_datasets(dataset_names: List[str] | str | None, seed: int | None = None
if "aime_2024_original" in dataset_names:
datasets += _load_aime_dataset(2024)

if "aime_2025" in dataset_names:
datasets += _load_aime_2025_opencompass(upsample_factor=16)

if "aime_2025_original" in dataset_names:
datasets += _load_aime_2025_opencompass()

if "amc_2022" in dataset_names:
# TODO: AMC 2022 is 43 problems, is that to be expected?
datasets += _load_amc_dataset(2022, upsample_factor=16)
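
Assuming the two OpenCompass configs together hold the usual 30 AIME problems, a hedged usage example of the new dataset hooks ("aime_2025" upsamples by 16, "aime_2025_original" does not):

```python
from pipelinerl.domains.math import load_datasets

train_eval = load_datasets(["aime_2025"])           # each problem repeated 16x
held_out = load_datasets(["aime_2025_original"])    # one copy of each problem
print(len(train_eval), len(held_out))               # e.g. 480 and 30 if the split has 30 problems
```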