From c5e33f4b99193c895e5e7dba43ddcf25c243ed20 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Mon, 2 Jun 2025 09:28:17 +0200
Subject: [PATCH] whisper-cli : align token timestamps with VAD ts

This commit aligns the token timestamps with the VAD timestamps when VAD
is enabled.

The motivation of this is that currently the token timestamps that are
reported in the full json output are the timestamps that whisper sees
after the VAD has processed the audio. This means that whisper only sees
possibly filtered audio and the token timestamps are related to the
filtered audio, not the original audio. For the segment timestamps we
map/align them with the original timestamps, but this is not currently
done for the token timestamps, which is what this commit aims to
address.

Resolves: https://github.com/ggml-org/whisper.cpp/issues/3174
---
 examples/cli/cli.cpp | 8 +++++++-
 include/whisper.h    | 2 ++
 src/whisper.cpp      | 4 ++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp
index f73ed9ae078..5aca1962aa7 100644
--- a/examples/cli/cli.cpp
+++ b/examples/cli/cli.cpp
@@ -727,7 +727,13 @@ static void output_json(
                                 value_s("text", whisper_token_to_str(ctx, token.id), false);
                                 if(token.t0 > -1 && token.t1 > -1) {
                                     // If we have per-token timestamps, write them out
-                                    times_o(token.t0, token.t1, false);
+                                    if (params.vad) {
+                                        times_o(vad_ts_to_original_ts(token.t0, ctx),
+                                                vad_ts_to_original_ts(token.t1, ctx),
+                                                false);
+                                    } else {
+                                        times_o(token.t0, token.t1, false);
+                                    }
                                 }
                                 value_i("id", token.id, false);
                                 value_f("p", token.p, false);
diff --git a/include/whisper.h b/include/whisper.h
index 4aeda98f334..e3e1dac761e 100644
--- a/include/whisper.h
+++ b/include/whisper.h
@@ -712,6 +712,8 @@ extern "C" {
     WHISPER_API float whisper_vad_segments_get_segment_t0(struct whisper_vad_segments * segments, int i_segment);
     WHISPER_API float whisper_vad_segments_get_segment_t1(struct whisper_vad_segments * segments, int i_segment);
 
+    WHISPER_API int64_t vad_ts_to_original_ts(int64_t vad_ts, struct whisper_context * ctx);
+
     WHISPER_API void whisper_vad_free_segments(struct whisper_vad_segments * segments);
     WHISPER_API void whisper_vad_free         (struct whisper_vad_context  * ctx);
 
diff --git a/src/whisper.cpp b/src/whisper.cpp
index fe3e135bee6..f6664167244 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7968,6 +7968,10 @@ int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment)
     return whisper_full_get_segment_t1_from_state(ctx->state, i_segment);
 }
 
+int64_t vad_ts_to_original_ts(int64_t vad_ts, struct whisper_context * ctx) {
+    return map_processed_to_original_time(vad_ts, ctx->state->vad_mapping_table);
+}
+
 bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment) {
     return state->result_all[i_segment].speaker_turn_next;
 }