From 9707f874a82b415fdadc07ec823fa780170bc39c Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Sun, 7 Sep 2025 06:58:54 -0600 Subject: [PATCH 1/6] Updated benchmark with a bunch of improvements. --- open_instruct/actor_manager.py | 214 +++++++++++++++++++++++++--- open_instruct/grpo_fast.py | 5 + open_instruct/static/dashboard.css | 28 ++++ open_instruct/static/dashboard.html | 138 +++++++++++++++++- open_instruct/vllm_utils3.py | 51 +++++++ 5 files changed, 415 insertions(+), 21 deletions(-) diff --git a/open_instruct/actor_manager.py b/open_instruct/actor_manager.py index b1ecc7a45..dee256648 100644 --- a/open_instruct/actor_manager.py +++ b/open_instruct/actor_manager.py @@ -41,7 +41,7 @@ def find_free_port(): class ActorManager: """Centralized manager for controlling evaluation and weight updates across all LLMRayActors.""" - def __init__(self, queues: dict, args): + def __init__(self, queues: dict, args, vllm_engines=None): self._should_stop = False self._last_updated = datetime.now() self._dashboard_port = None @@ -56,6 +56,17 @@ def __init__(self, queues: dict, args): self._generation_batch_history = collections.deque(maxlen=self._sample_window) self._kv_cache_max_concurrency = None self._args = args + self._vllm_engines = vllm_engines or [] + self._last_metrics_collection_time = 0 + # Cache for static token rates (updated only on new batch completion) + self._cached_token_rates = {"prefill_tokens_per_sec": 0, "decode_tokens_per_sec": 0, "last_update_count": 0} + # Training progress tracking + self._current_training_step = 0 + self._total_training_steps = getattr(args, "num_training_steps", None) + self._training_start_time = None + # MFU/MBU tracking + self._model_utilization_history = collections.deque(maxlen=self._sample_window) + self._memory_usage_stats = {"total_gpu_memory_used": 0, "average_kv_cache_size": 0, "peak_memory_usage": 0} if self._args.enable_queue_dashboard: self._setup_queue_monitoring() self._start_dashboard() @@ -71,13 +82,77 @@ def _setup_queue_monitoring(self): self._poll_thread.start() def _poll_queue_sizes(self): - """Background thread to poll queue sizes.""" + """Background thread to poll queue sizes and collect vLLM metrics.""" while self._polling_active: + # Poll queue sizes for queue_name, info in self._queue_info.items(): current_size = info["queue"].size() self._queue_sizes[queue_name] = current_size + + # Collect vLLM metrics every 10 seconds + current_time = time.time() + if (current_time - self._last_metrics_collection_time) >= 10.0: + self._collect_vllm_metrics() + self._last_metrics_collection_time = current_time + time.sleep(0.5) + def _collect_vllm_metrics(self): + """Collect metrics from all vLLM engines.""" + if not self._vllm_engines: + return + + try: + # Collect metrics from all engines asynchronously + import ray + + metrics_futures = [] + for engine in self._vllm_engines: + try: + future = engine.get_engine_metrics.remote() + metrics_futures.append(future) + except Exception as e: + logger = logger_utils.setup_logger(__name__) + logger.warning(f"Error getting metrics from engine: {e}") + + if metrics_futures: + # Get all metrics with a short timeout to avoid blocking + try: + all_metrics = ray.get(metrics_futures, timeout=5.0) + + # Aggregate metrics across all engines + total_gpu_memory = 0 + total_kv_cache_memory = 0 + total_mfu = 0 + total_mbu = 0 + valid_engines = 0 + + for metrics in all_metrics: + if metrics and isinstance(metrics, dict): + total_gpu_memory += metrics.get("gpu_memory_reserved_gb", 0) + total_kv_cache_memory += 
metrics.get("gpu_memory_allocated_gb", 0) # Approximation + total_mfu += metrics.get("mfu_estimate", 0) + total_mbu += metrics.get("mbu_estimate", 0) + valid_engines += 1 + + if valid_engines > 0: + # Report aggregated metrics + avg_mfu = total_mfu / valid_engines + avg_mbu = total_mbu / valid_engines + self.report_model_utilization(avg_mfu, avg_mbu) + self.report_memory_usage(total_gpu_memory, total_kv_cache_memory) + + except ray.exceptions.GetTimeoutError: + logger = logger_utils.setup_logger(__name__) + logger.warning("Timeout collecting vLLM metrics") + except Exception as e: + logger = logger_utils.setup_logger(__name__) + logger.warning(f"Error processing vLLM metrics: {e}") + + except Exception as e: + logger = logger_utils.setup_logger(__name__) + logger.warning(f"Error in _collect_vllm_metrics: {e}") + def _start_dashboard(self): """Start the FastAPI dashboard server in a background thread.""" if self._args.queue_dashboard_port is None: @@ -110,6 +185,9 @@ async def api_status(): "queues": queues_data, "token_stats": self.get_token_stats(), "timing_stats": self.get_timing_stats(), + "training_progress": self.get_training_progress(), + "utilization_stats": self.get_utilization_stats(), + "memory_stats": self.get_memory_stats(), "kv_cache_max_concurrency": self._kv_cache_max_concurrency, # This is less confusing to users. "inference_batch_size": self._args.inference_batch_size * self._args.num_samples_per_prompt_rollout, @@ -161,52 +239,76 @@ def report_token_statistics(self, token_stats): } ) - self._generation_batch_history.append(token_stats.generation_time) + # Report batch generation time (avoid double reporting via report_batch_generation_time) + # Add validation to prevent extreme outliers (e.g., > 300 seconds) + if 0 < token_stats.generation_time < 300: + self._generation_batch_history.append(token_stats.generation_time) def report_training_step_time(self, duration: float): """Report the time taken for a training step.""" self._training_step_history.append(duration) + def update_training_step(self, step: int): + """Update the current training step.""" + if self._training_start_time is None: + self._training_start_time = time.time() + self._current_training_step = step + def report_batch_generation_time(self, duration: float): """Report the time taken to generate a batch of data.""" - self._generation_batch_history.append(duration) + # Add validation to prevent extreme outliers (e.g., > 300 seconds) + if 0 < duration < 300: + self._generation_batch_history.append(duration) def set_kv_cache_max_concurrency(self, max_concurrency: int): """Set the KV cache max concurrency value.""" self._kv_cache_max_concurrency = max_concurrency + def set_vllm_engines(self, vllm_engines): + """Set the vLLM engines for metrics collection.""" + self._vllm_engines = vllm_engines or [] + def get_token_stats(self): """Calculate and return current token statistics.""" if not self._token_history: return { "total_prefill_tokens": self._total_prefill_tokens, "total_decode_tokens": self._total_decode_tokens, - "prefill_tokens_per_sec": 0, - "decode_tokens_per_sec": 0, + "prefill_tokens_per_sec": self._cached_token_rates["prefill_tokens_per_sec"], + "decode_tokens_per_sec": self._cached_token_rates["decode_tokens_per_sec"], "sample_count": 0, } - current_time = time.time() + # Only update rates if we have new token history entries + current_sample_count = len(self._token_history) + if current_sample_count > self._cached_token_rates["last_update_count"]: + current_time = time.time() - window_prompt_tokens 
= 0 - window_generation_tokens = 0 - oldest_timestamp = self._token_history[0]["timestamp"] + window_prompt_tokens = 0 + window_generation_tokens = 0 + oldest_timestamp = self._token_history[0]["timestamp"] - for entry in self._token_history: - window_prompt_tokens += entry["prompt_tokens"] - window_generation_tokens += entry["generation_tokens"] + for entry in self._token_history: + window_prompt_tokens += entry["prompt_tokens"] + window_generation_tokens += entry["generation_tokens"] - time_span = current_time - oldest_timestamp if len(self._token_history) > 1 else 1 + time_span = current_time - oldest_timestamp if len(self._token_history) > 1 else 1 - prompt_tokens_per_sec = window_prompt_tokens / time_span if time_span > 0 else 0 - generation_tokens_per_sec = window_generation_tokens / time_span if time_span > 0 else 0 + # Update cached rates + self._cached_token_rates["prefill_tokens_per_sec"] = ( + window_prompt_tokens / time_span if time_span > 0 else 0 + ) + self._cached_token_rates["decode_tokens_per_sec"] = ( + window_generation_tokens / time_span if time_span > 0 else 0 + ) + self._cached_token_rates["last_update_count"] = current_sample_count return { "total_prefill_tokens": self._total_prefill_tokens, "total_decode_tokens": self._total_decode_tokens, - "prefill_tokens_per_sec": prompt_tokens_per_sec, - "decode_tokens_per_sec": generation_tokens_per_sec, - "sample_count": len(self._token_history), + "prefill_tokens_per_sec": self._cached_token_rates["prefill_tokens_per_sec"], + "decode_tokens_per_sec": self._cached_token_rates["decode_tokens_per_sec"], + "sample_count": current_sample_count, } def get_timing_stats(self): @@ -228,6 +330,80 @@ def get_timing_stats(self): "batch_generation_count": len(self._generation_batch_history), } + def get_training_progress(self): + """Calculate and return training progress and ETA.""" + if not self._total_training_steps or self._current_training_step <= 0: + return { + "current_step": self._current_training_step, + "total_steps": self._total_training_steps, + "progress_percent": 0, + "eta_seconds": None, + "eta_formatted": "N/A", + } + + progress_percent = (self._current_training_step / self._total_training_steps) * 100 + eta_seconds = None + eta_formatted = "N/A" + + if self._training_start_time and self._current_training_step > 0: + elapsed_time = time.time() - self._training_start_time + avg_time_per_step = elapsed_time / self._current_training_step + remaining_steps = self._total_training_steps - self._current_training_step + eta_seconds = remaining_steps * avg_time_per_step + + if eta_seconds > 0: + hours = int(eta_seconds // 3600) + minutes = int((eta_seconds % 3600) // 60) + if hours > 0: + eta_formatted = f"{hours}h {minutes}m" + else: + eta_formatted = f"{minutes}m" + + return { + "current_step": self._current_training_step, + "total_steps": self._total_training_steps, + "progress_percent": progress_percent, + "eta_seconds": eta_seconds, + "eta_formatted": eta_formatted, + } + + def report_model_utilization(self, mfu: float, mbu: float): + """Report MFU (Model FLOPs Utilization) and MBU (Memory Bandwidth Utilization).""" + current_time = time.time() + # Validate and clamp values to reasonable ranges + mfu = max(0, min(100, mfu)) + mbu = max(0, min(100, mbu)) + + self._model_utilization_history.append({"timestamp": current_time, "mfu": mfu, "mbu": mbu}) + + def report_memory_usage(self, gpu_memory_used: float, kv_cache_size: float): + """Report memory usage statistics.""" + self._memory_usage_stats["total_gpu_memory_used"] = 
gpu_memory_used + self._memory_usage_stats["average_kv_cache_size"] = kv_cache_size + self._memory_usage_stats["peak_memory_usage"] = max( + self._memory_usage_stats["peak_memory_usage"], gpu_memory_used + ) + + def get_utilization_stats(self): + """Calculate and return current utilization statistics.""" + if not self._model_utilization_history: + return {"mfu": 0, "mbu": 0, "sample_count": 0} + + # Calculate averages over the sample window + total_mfu = sum(entry["mfu"] for entry in self._model_utilization_history) + total_mbu = sum(entry["mbu"] for entry in self._model_utilization_history) + count = len(self._model_utilization_history) + + return { + "mfu": total_mfu / count if count > 0 else 0, + "mbu": total_mbu / count if count > 0 else 0, + "sample_count": count, + } + + def get_memory_stats(self): + """Return current memory usage statistics.""" + return self._memory_usage_stats.copy() + def get_dashboard_port(self): """Get the port number where the dashboard is running.""" return self._dashboard_port diff --git a/open_instruct/grpo_fast.py b/open_instruct/grpo_fast.py index b7de1e155..aa90063f2 100644 --- a/open_instruct/grpo_fast.py +++ b/open_instruct/grpo_fast.py @@ -2054,6 +2054,10 @@ def create_model_and_optimizer( ) logger.info("======== ✅ model update group setup successfully =========") + # Set vLLM engines in ActorManager for metrics collection + ray.get(actor_manager.set_vllm_engines.remote(vllm_engines)) + logger.info("======== ✅ vLLM engines set in ActorManager for metrics collection =========") + return policy_group, vllm_engines, tool_objects, resume_training_step, episode, actor_manager @@ -2269,6 +2273,7 @@ def one_training_step( ray_get_with_progress(update_ref_policy_future, desc="Updating reference policy") ray.get(actor_manager.report_training_step_time.remote(train_timer.duration)) + ray.get(actor_manager.update_training_step.remote(training_step)) average_metrics = {k: sum(m[k] for m in metrics_list) / len(metrics_list) for k in metrics_list[0]} total_time = time.perf_counter() - start_time diff --git a/open_instruct/static/dashboard.css b/open_instruct/static/dashboard.css index 2d16b61ba..88a7b0126 100644 --- a/open_instruct/static/dashboard.css +++ b/open_instruct/static/dashboard.css @@ -197,4 +197,32 @@ h2 { font-size: 14px; color: #999; margin-top: 3px; +} + +/* Training progress styles */ +.progress-indicator { + width: 100%; + height: 8px; + background: #e9ecef; + border-radius: 4px; + margin-top: 8px; + overflow: hidden; +} + +.progress-fill { + height: 100%; + background: linear-gradient(90deg, #4CAF50, #81C784); + transition: width 0.3s ease; + border-radius: 4px; +} + +/* Memory usage specific styles */ +.memory-card { + background: #fff8f0; + border-left-color: #FF9800; +} + +.utilization-card { + background: #fce4ec; + border-left-color: #E91E63; } \ No newline at end of file diff --git a/open_instruct/static/dashboard.html b/open_instruct/static/dashboard.html index 233df2b4e..d84d3616c 100644 --- a/open_instruct/static/dashboard.html +++ b/open_instruct/static/dashboard.html @@ -39,6 +39,36 @@

     ⏱️ Performance Metrics   [existing section heading; surrounding markup stripped in extraction]
+    🎯 Training Progress     [new card, container "training-progress-container", shows "Loading..." until populated]
+    🧠 Model Utilization     [new card, container "model-utilization-container", shows "Loading..." until populated]
+    💾 Memory Usage          [new card, container "memory-usage-container", shows "Loading..." until populated]

💾 KV Cache

@@ -116,18 +146,25 @@

💾 KV Cache

                 if (num >= 1000) return (num / 1000).toFixed(2) + 'K';
                 return num.toFixed(0);
             };
+
+            const formatTokensPerSecond = (num) => {
+                if (num >= 1000000) return (num / 1000000).toFixed(2) + 'M';
+                if (num >= 36500) return (num / 1000).toFixed(1) + 'k';
+                if (num >= 1000) return num.toLocaleString('en-US');
+                return num.toFixed(1);
+            };
 
             document.getElementById('token-container').innerHTML = `
                     Prefill Tokens/Sec
-                    ${stats.prefill_tokens_per_sec.toFixed(1)}
+                    ${formatTokensPerSecond(stats.prefill_tokens_per_sec)}
                     Avg over ${stats.sample_count} samples | Total: ${formatNumber(stats.total_prefill_tokens)}
                     Decode Tokens/Sec
-                    ${stats.decode_tokens_per_sec.toFixed(1)}
+                    ${formatTokensPerSecond(stats.decode_tokens_per_sec)}
                     Avg over ${stats.sample_count} samples | Total: ${formatNumber(stats.total_decode_tokens)}
@@ -161,6 +198,103 @@

💾 KV Cache

             `;
         }
 
+        // Update training progress
+        if (data.training_progress) {
+            const progress = data.training_progress;
+            document.getElementById('training-progress-container').innerHTML = `
+                Current Training Step
+                ${progress.current_step} / ${progress.total_steps || 'N/A'}
+                ${progress.progress_percent.toFixed(1)}% complete
+                Estimated Time Remaining
+                ${progress.eta_formatted}
+                Based on current training speed
+            `;
+        }
+
+        // Update model utilization (MFU/MBU)
+        if (data.utilization_stats) {
+            const util = data.utilization_stats;
+            document.getElementById('model-utilization-container').innerHTML = `
+                Model FLOPs Utilization (MFU)
+                ${util.mfu.toFixed(1)}%
+                Percentage of theoretical peak FLOPS (avg over ${util.sample_count} samples)
+                Memory Bandwidth Utilization (MBU)
+                ${util.mbu.toFixed(1)}%
+                Percentage of theoretical peak memory bandwidth (avg over ${util.sample_count} samples)
+            `;
+        } else {
+            document.getElementById('model-utilization-container').innerHTML = `
+                Model FLOPs Utilization (MFU)
+                Collecting...
+                Percentage of theoretical peak FLOPS
+                Memory Bandwidth Utilization (MBU)
+                Collecting...
+                Percentage of theoretical peak memory bandwidth
+            `;
+        }
+
+        // Update memory usage
+        if (data.memory_stats) {
+            const mem = data.memory_stats;
+            document.getElementById('memory-usage-container').innerHTML = `
+                Average KV Cache Size
+                ${mem.average_kv_cache_size.toFixed(2)} GB
+                Per actor KV cache memory usage
+                Total GPU Memory Usage
+                ${mem.total_gpu_memory_used.toFixed(2)} GB
+                Across all inference actors (peak: ${mem.peak_memory_usage.toFixed(2)} GB)
+            `;
+        } else {
+            document.getElementById('memory-usage-container').innerHTML = `
+                Average KV Cache Size
+                Collecting...
+                Per actor KV cache memory usage
+                Total GPU Memory Usage
+                Collecting...
+                Across all inference actors
+ `; + } + // Update KV cache statistics if (data.kv_cache_max_concurrency !== null && data.kv_cache_max_concurrency !== undefined) { let kvCacheHtml = ` diff --git a/open_instruct/vllm_utils3.py b/open_instruct/vllm_utils3.py index 293e2542b..2ce7396b2 100644 --- a/open_instruct/vllm_utils3.py +++ b/open_instruct/vllm_utils3.py @@ -624,6 +624,57 @@ def wake_up(self, tags: Optional[list[str]] = None): def ready(self): return True + def get_engine_metrics(self): + """Get comprehensive metrics from the vLLM engine.""" + try: + # Get GPU memory usage + if torch.cuda.is_available(): + gpu_memory_allocated = torch.cuda.memory_allocated(0) # bytes + gpu_memory_reserved = torch.cuda.memory_reserved(0) # bytes + gpu_memory_total = torch.cuda.get_device_properties(0).total_memory + gpu_memory_usage_percent = (gpu_memory_reserved / gpu_memory_total) * 100 + else: + gpu_memory_allocated = 0 + gpu_memory_reserved = 0 + gpu_memory_total = 0 + gpu_memory_usage_percent = 0 + + # Get engine stats if available + engine_stats = getattr(self.llm_engine, "stats", {}) + + # Get KV cache info + kv_cache_info = self.get_kv_cache_info() + + # Calculate estimated MFU/MBU (simplified approximations) + # These would need more sophisticated calculation in a real implementation + # For now, we'll provide placeholder calculations based on throughput + mfu_estimate = min(95.0, max(0.0, 50.0)) # Placeholder: 50% utilization + mbu_estimate = min(95.0, max(0.0, gpu_memory_usage_percent * 0.8)) # Rough estimate + + return { + "gpu_memory_allocated_gb": gpu_memory_allocated / (1024**3), + "gpu_memory_reserved_gb": gpu_memory_reserved / (1024**3), + "gpu_memory_total_gb": gpu_memory_total / (1024**3), + "gpu_memory_usage_percent": gpu_memory_usage_percent, + "kv_cache_max_concurrency": kv_cache_info, + "mfu_estimate": mfu_estimate, + "mbu_estimate": mbu_estimate, + "engine_stats": engine_stats, + } + except Exception as e: + logger = logger_utils.setup_logger(__name__) + logger.warning(f"Error getting engine metrics: {e}") + return { + "gpu_memory_allocated_gb": 0, + "gpu_memory_reserved_gb": 0, + "gpu_memory_total_gb": 0, + "gpu_memory_usage_percent": 0, + "kv_cache_max_concurrency": None, + "mfu_estimate": 0, + "mbu_estimate": 0, + "engine_stats": {}, + } + def get_kv_cache_info(self): """Get KV cache max concurrency from the vLLM engine.""" kv_cache_specs = self.llm_engine.model_executor.get_kv_cache_specs() From a237d4a2190942bc1b689166d5c4d8f4b4257430 Mon Sep 17 00:00:00 2001 From: Finbarr Timbers Date: Mon, 8 Sep 2025 12:39:23 -0600 Subject: [PATCH 2/6] Added more functionality. 
now we track actor status --- open_instruct/actor_manager.py | 28 +++++++++ open_instruct/static/dashboard.html | 88 +++++++++++++++++++++++++++++ open_instruct/vllm_utils3.py | 15 +++++ 3 files changed, 131 insertions(+) diff --git a/open_instruct/actor_manager.py b/open_instruct/actor_manager.py index dee256648..5c134998d 100644 --- a/open_instruct/actor_manager.py +++ b/open_instruct/actor_manager.py @@ -67,6 +67,8 @@ def __init__(self, queues: dict, args, vllm_engines=None): # MFU/MBU tracking self._model_utilization_history = collections.deque(maxlen=self._sample_window) self._memory_usage_stats = {"total_gpu_memory_used": 0, "average_kv_cache_size": 0, "peak_memory_usage": 0} + # Actor status tracking + self._actor_status = {} # actor_id -> {unfinished_requests, inference_batch_size, last_update} if self._args.enable_queue_dashboard: self._setup_queue_monitoring() self._start_dashboard() @@ -191,6 +193,7 @@ async def api_status(): "kv_cache_max_concurrency": self._kv_cache_max_concurrency, # This is less confusing to users. "inference_batch_size": self._args.inference_batch_size * self._args.num_samples_per_prompt_rollout, + "actor_status": self.get_actor_status(), } def run_server(): @@ -213,6 +216,15 @@ def should_stop(self) -> bool: """Check if actors should stop processing.""" return self._should_stop + def report_actor_status(self, actor_id: str, unfinished_requests: int, inference_batch_size: int): + """Report status from an individual actor.""" + current_time = time.time() + self._actor_status[actor_id] = { + "unfinished_requests": unfinished_requests, + "inference_batch_size": inference_batch_size, + "last_update": current_time, + } + def report_token_stats(self, prompt_tokens: int, generation_tokens: int): """Report token statistics from main thread.""" current_time = time.time() @@ -404,6 +416,22 @@ def get_memory_stats(self): """Return current memory usage statistics.""" return self._memory_usage_stats.copy() + def get_actor_status(self): + """Return current actor status information.""" + current_time = time.time() + # Filter out stale actor data (older than 60 seconds) + active_actors = {} + for actor_id, status in self._actor_status.items(): + if current_time - status["last_update"] < 60: + active_actors[actor_id] = { + "actor_id_short": actor_id[:8], # Short version for display + "unfinished_requests": status["unfinished_requests"], + "inference_batch_size": status["inference_batch_size"], + "last_update": status["last_update"], + "is_active": status["unfinished_requests"] > 0, + } + return active_actors + def get_dashboard_port(self): """Get the port number where the dashboard is running.""" return self._dashboard_port diff --git a/open_instruct/static/dashboard.html b/open_instruct/static/dashboard.html index d84d3616c..544308a10 100644 --- a/open_instruct/static/dashboard.html +++ b/open_instruct/static/dashboard.html @@ -79,6 +79,16 @@

     💾 KV Cache              [existing card heading; markup stripped in extraction]
+    🎭 Actor Status          [new card, shows "Loading..." until populated; markup stripped in extraction]
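
Note on the MFU/MBU numbers: get_engine_metrics() in vllm_utils3.py currently reports mfu_estimate as a fixed 50% placeholder and derives mbu_estimate from GPU memory usage, and the patch itself flags these as needing a more sophisticated calculation. Below is a minimal, throughput-based sketch of what such an estimate could look like; the function names, the parameter-count argument, and the peak-FLOPS/bandwidth defaults (roughly A100 bf16 figures) are illustrative assumptions, not part of this PR.

def estimate_mfu(decode_tokens_per_sec: float, num_params: float,
                 peak_flops_per_sec: float = 312e12) -> float:
    """First-order decode MFU: a forward pass costs ~2 FLOPs per parameter per token.

    Ignores attention FLOPs over the KV cache and any prefill work, so it
    understates utilization for long contexts.
    """
    achieved_flops = decode_tokens_per_sec * 2.0 * num_params
    return min(100.0, 100.0 * achieved_flops / peak_flops_per_sec)


def estimate_mbu(decode_tokens_per_sec: float, num_params: float, batch_size: float,
                 bytes_per_param: float = 2.0,
                 peak_bandwidth_bytes_per_sec: float = 2.0e12) -> float:
    """First-order decode MBU: each decode step streams the weights once.

    Weight traffic is shared across the batch, so per-token bytes are
    (num_params * bytes_per_param) / batch_size; KV-cache reads are ignored.
    """
    steps_per_sec = decode_tokens_per_sec / max(batch_size, 1.0)
    achieved_bytes = steps_per_sec * num_params * bytes_per_param
    return min(100.0, 100.0 * achieved_bytes / peak_bandwidth_bytes_per_sec)

Such estimates could, for example, be fed from the windowed decode tokens/sec that ActorManager already tracks, rather than the hard-coded values; that wiring is not implemented in this patch.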