Skip to content

Commit

Permalink
feat: Add e2e latency and plotting script
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugoch committed Sep 17, 2024
1 parent 1ed5dbf commit 6f7f9d1
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 10 deletions.
18 changes: 9 additions & 9 deletions plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,35 +20,34 @@ def plot():
constant_vus = [result for result in results_filtered if result['executor_type'] == 'ConstantVUs']
constant_vus_x = [result['config']['vus'] for result in constant_vus]
if len(constant_rate) > 0:
plot_inner('Requests/s', constant_rate_x, constant_rate, 'Constant Rate')
plot_inner('Requests/s', constant_rate_x, constant_rate, 'Constant Rate benchmark')
if len(constant_vus) > 0:
plot_inner('VUs', constant_vus_x, constant_vus, 'Constant VUs')
plot_inner('VUs', constant_vus_x, constant_vus, 'Constant VUs benchmark')


def plot_inner(x_name, x_values, results, title):
def plot_inner(x_name, x_values, results, chart_title):
fig, axs = plt.subplots(3, 2, figsize=(15, 20))
fig.tight_layout(pad=6.0)
fig.subplots_adjust(hspace=0.4, wspace=0.2, bottom=0.15)
fig.subplots_adjust(hspace=0.2, wspace=0.2, bottom=0.15, top=0.92)
# compute error rate
for result in results:
result['error_rate'] = result['failed_requests'] / (
result['failed_requests'] + result['successful_requests']) * 100.0

metrics = ['inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'token_throughput_secs',
metrics = ['inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'e2e_latency_ms_p90', 'token_throughput_secs',
'successful_requests', 'error_rate']

titles = ['Inter Token Latency P90 (lower is better)', 'TTFT P90 (lower is better)',
titles = ['Inter Token Latency P90 (lower is better)', 'TTFT P90 (lower is better)', 'End to End Latency P90 (lower is better)',
'Token Throughput (higher is better)', 'Successful requests', 'Error Rate % (lower is better)']

labels = ['Time (ms)', 'Time (ms)', 'Tokens/s', 'Count', '%']
labels = ['Time (ms)', 'Time (ms)', 'Time (ms)', 'Tokens/s', 'Count', '%']

x = [result['config']['rate'] for result in results]
colors = ['#FF9D00', '#2F5BA1']

# Plot each metric in its respective subplot
for ax, metric, title, label in zip(axs.flatten(), metrics, titles, labels):
data = list(map(lambda result: result[metric], results))
ax.plot(x, data, marker='o', color=colors[0])
ax.plot(x_values, data, marker='o', color=colors[0])
ax.set_title(title)
ax.tick_params(axis='x', rotation=0)
ax.set_ylabel(label)
Expand All @@ -64,6 +63,7 @@ def plot_inner(x_name, x_values, results, title):
# Add grid lines for better readability
ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.5)
ax.set_axisbelow(True) # Ensure grid lines are below the bars
plt.suptitle(chart_title, fontsize=16)

plt.show()

Expand Down
6 changes: 6 additions & 0 deletions src/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,9 @@ pub struct BenchmarkResultsWriter {
successful_requests: u64,
request_rate: f64,
total_tokens_sent: u64,
e2e_latency_ms_avg: u128,
e2e_latency_ms_p90: u128,
e2e_latency_ms_p95: u128,
}

impl BenchmarkResultsWriter {
Expand All @@ -395,6 +398,9 @@ impl BenchmarkResultsWriter {
successful_requests: results.successful_requests() as u64,
request_rate: results.successful_request_rate()?,
total_tokens_sent: results.total_tokens_sent(),
e2e_latency_ms_avg: results.e2e_latency_avg().ok().unwrap().as_millis(),
e2e_latency_ms_p90: results.e2e_latency_percentile(0.9)?.as_millis(),
e2e_latency_ms_p95: results.e2e_latency_percentile(0.95)?.as_millis(),
})
}
}
Expand Down
17 changes: 17 additions & 0 deletions src/requests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -447,4 +447,21 @@ impl TextGenerationAggregatedResponse {
}
}
}
/// End-to-end latency of this request: the elapsed time between the
/// recorded start and end timestamps.
///
/// Returns `None` when either timestamp is missing (the request never
/// started or never completed), so callers can distinguish "no data"
/// from a zero-duration response.
pub fn e2e_latency(&self) -> Option<std::time::Duration> {
    // `zip` yields a pair only when both timestamps are present,
    // replacing the previous nested-match pyramid with the same semantics.
    self.start_time
        .zip(self.end_time)
        .map(|(start_time, end_time)| end_time - start_time)
}
}
26 changes: 25 additions & 1 deletion src/results.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,29 @@ impl BenchmarkResults {
}
}

/// Average end-to-end latency across successful responses that actually
/// recorded both timestamps.
///
/// # Errors
/// Returns `NoResponses` when the benchmark is not ready or no successful
/// response carries an e2e latency.
pub fn e2e_latency_avg(&self) -> anyhow::Result<std::time::Duration> {
    if self.is_ready() {
        // Collect only responses with a measurable latency. Previously,
        // missing latencies counted as zero AND the sum was divided by
        // total_requests() (failed requests included), which understated
        // the average.
        let latencies: Vec<std::time::Duration> = self
            .get_successful_responses()
            .iter()
            .filter_map(|response| response.e2e_latency())
            .collect();
        if latencies.is_empty() {
            // Guards the division below against a zero denominator.
            return Err(anyhow::anyhow!(NoResponses));
        }
        let total_time: std::time::Duration = latencies.iter().sum();
        Ok(total_time / latencies.len() as u32)
    } else {
        Err(anyhow::anyhow!(NoResponses))
    }
}

/// End-to-end latency at the given percentile (e.g. `0.9` for p90)
/// across successful responses that recorded both timestamps.
///
/// # Errors
/// Returns `NoResponses` when the benchmark is not ready or there are no
/// latencies to compute a percentile from.
pub fn e2e_latency_percentile(&self, percentile: f64) -> anyhow::Result<std::time::Duration> {
    if self.is_ready() {
        // Drop responses without a latency instead of substituting zero,
        // which previously dragged low percentiles toward 0.
        let mut times: Vec<std::time::Duration> = self
            .get_successful_responses()
            .iter()
            .filter_map(|response| response.e2e_latency())
            .collect();
        if times.is_empty() {
            // Previously `times[index]` would panic on an empty vector.
            return Err(anyhow::anyhow!(NoResponses));
        }
        // Unstable sort is faster and equal durations need no stable order.
        times.sort_unstable();
        // Clamp so percentile == 1.0 maps to the last element instead of
        // indexing one past the end of the vector.
        let index = ((percentile * times.len() as f64) as usize).min(times.len() - 1);
        Ok(times[index])
    } else {
        Err(anyhow::anyhow!(NoResponses))
    }
}

pub fn time_to_first_token_avg(&self) -> anyhow::Result<std::time::Duration> {
if self.is_ready() {
let mut total_time = std::time::Duration::new(0, 0);
Expand Down Expand Up @@ -172,13 +195,14 @@ impl Debug for BenchmarkResults {
.field("end_time", &self.end_time())
.field("total_tokens", &self.total_tokens())
.field("token_throughput_secs", &self.token_throughput_secs().or::<anyhow::Result<f64>>(Ok(-1.0)))
.field("duration", &self.duration().or::<anyhow::Result<Duration>>(Ok(Duration::from_secs(0))))
.field("duration_ms", &self.duration().or::<anyhow::Result<Duration>>(Ok(Duration::from_secs(0))))
.field("average_time_to_first_token", &self.time_to_first_token_avg().or::<anyhow::Result<Duration>>(Ok(Duration::from_secs(0))))
.field("average_inter_token_latency", &self.inter_token_latency_avg().or::<anyhow::Result<Duration>>(Ok(Duration::from_secs(0))))
.field("failed_requests", &self.failed_requests())
.field("successful_requests", &self.successful_requests())
.field("request_rate", &self.successful_request_rate().or::<anyhow::Result<f64>>(Ok(-1.0)))
.field("sent_prompt_tokens", &self.total_tokens_sent())
.field("e2e_latency_avg", &self.e2e_latency_avg().or::<anyhow::Result<Duration>>(Ok(Duration::from_secs(0))))
.finish()
}
}
Expand Down

0 comments on commit 6f7f9d1

Please sign in to comment.