diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
index 3b2b4ff466..cbb932a74d 100644
--- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
+++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
@@ -18,7 +18,7 @@ int main(int argc, char* argv[]) try {
     ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config();
     config.max_new_tokens = 100;  // increase this based on your speech length
     // 'task' and 'language' parameters are supported for multilingual models only
-    config.language = "<|en|>";  // can switch to <|zh|> for Chinese language
+    // config.language = "<|en|>";  // can switch to <|zh|> for Chinese language
    config.task = "transcribe";
    config.return_timestamps = true;

diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp
index 0b2a083908..1c8df0edd9 100644
--- a/src/cpp/src/whisper/models/decoder.cpp
+++ b/src/cpp/src/whisper/models/decoder.cpp
@@ -22,6 +22,23 @@ std::shared_ptr<WhisperDecoder> WhisperDecoder::from_path(const std::filesystem:
     return std::make_shared<WhisperStatefullDecoder>(models_path, device, properties);
 }

+std::pair<int64_t, float> WhisperDecoder::detect_language(const ov::Tensor& encoder_hidden_state,
+                                                          const int64_t decoder_start_token_id) {
+    Tensor input_ids_tensor{ov::element::i64, {1, 1}};
+    input_ids_tensor.data<int64_t>()[0] = decoder_start_token_id;
+
+    Tensor beam_idx_tensor{ov::element::i32, {1}};
+    beam_idx_tensor.data<int32_t>()[0] = 0;
+
+    auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor);
+
+    int64_t output_token = ov::genai::utils::argmax(output_tensor, 0);
+
+    reset_state();
+
+    return {output_token, infer_ms};
+}
+
 /**
  * Encoder hidden states expected to be with batch 1
  * Copy encoder hidden state tensor from batch 1 to requested batch_size.
diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp
index 66e86a0733..6eeba2b387 100644
--- a/src/cpp/src/whisper/models/decoder.hpp
+++ b/src/cpp/src/whisper/models/decoder.hpp
@@ -15,8 +15,7 @@ class WhisperDecoder {
                                                      const std::string& device,
                                                      const ov::AnyMap& properties);

-    virtual std::pair<int64_t, float> detect_language(const Tensor& encoder_hidden_state,
-                                                      const int64_t decoder_start_token_id) = 0;
+    std::pair<int64_t, float> detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id);

     virtual std::pair<Tensor, float> decode(const Tensor& encoder_hidden_state,
                                             const Tensor& input_ids,
diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp
index 9c0c4a0b3f..5208f496fb 100644
--- a/src/cpp/src/whisper/models/statefull_decoder.cpp
+++ b/src/cpp/src/whisper/models/statefull_decoder.cpp
@@ -18,23 +18,6 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo
     m_request = compiled_model.create_infer_request();
 }

-std::pair<int64_t, float> WhisperStatefullDecoder::detect_language(const ov::Tensor& encoder_hidden_state,
-                                                                   const int64_t decoder_start_token_id) {
-    Tensor input_ids_tensor{ov::element::i64, {1, 1}};
-    input_ids_tensor.data<int64_t>()[0] = decoder_start_token_id;
-
-    Tensor beam_idx_tensor{ov::element::i32, {1}};
-    beam_idx_tensor.data<int32_t>()[0] = 0;
-
-    auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor);
-
-    int64_t output_token = ov::genai::utils::argmax(output_tensor, 0);
-
-    reset_state();
-
-    return {output_token, infer_ms};
-}
-
 std::pair<ov::Tensor, float> WhisperStatefullDecoder::decode(const Tensor& encoder_hidden_state,
                                                              const Tensor& input_ids,
                                                              const Tensor& beam_idx) {
diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp
index 44156fc6aa..c8c733e943 100644
--- a/src/cpp/src/whisper/models/statefull_decoder.hpp
+++ b/src/cpp/src/whisper/models/statefull_decoder.hpp
@@ -14,9 +14,6 @@ class WhisperStatefullDecoder : public WhisperDecoder {
                             const std::string& device,
                             const ov::AnyMap& properties);

-    std::pair<int64_t, float> detect_language(const Tensor& encoder_hidden_state,
-                                              const int64_t decoder_start_token_id) override;
-
     std::pair<Tensor, float> decode(const Tensor& encoder_hidden_state,
                                     const Tensor& input_ids,
                                     const Tensor& beam_idx) override;
diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp
index 2ab07112fa..1ade0dea6b 100644
--- a/src/cpp/src/whisper/models/with_past_decoder.cpp
+++ b/src/cpp/src/whisper/models/with_past_decoder.cpp
@@ -97,23 +97,6 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode
     m_request_decoder_with_past = compiled_model.create_infer_request();
 }

-std::pair<int64_t, float> WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state,
-                                                                  const int64_t decoder_start_token_id) {
-    Tensor input_ids_tensor{ov::element::i64, {1, 1}};
-    input_ids_tensor.data<int64_t>()[0] = decoder_start_token_id;
-
-    Tensor beam_idx_tensor{ov::element::i32, {1}};
-    beam_idx_tensor.data<int32_t>()[0] = 0;
-
-    auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor);
-
-    int64_t output_token = ov::genai::utils::argmax(output_tensor, 0);
-
-    reset_state();
-
-    return {output_token, infer_ms};
-}
-
 std::pair<ov::Tensor, float> WhisperWithPastDecoder::decode(const Tensor& encoder_hidden_state,
                                                             const Tensor& input_ids,
                                                             const Tensor& beam_idx) {
diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp
index 3cf4404092..1610c60d4e 100644
--- a/src/cpp/src/whisper/models/with_past_decoder.hpp
+++ b/src/cpp/src/whisper/models/with_past_decoder.hpp
@@ -14,9 +14,6 @@ class WhisperWithPastDecoder : public WhisperDecoder {
                            const std::string& device,
                            const ov::AnyMap& properties);

-    std::pair<int64_t, float> detect_language(const Tensor& encoder_hidden_state,
-                                              const int64_t decoder_start_token_id) override;
-
     std::pair<Tensor, float> decode(const Tensor& encoder_hidden_state,
                                     const Tensor& input_ids,
                                     const Tensor& beam_idx) override;
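Usage note: with detect_language() implemented once in the WhisperDecoder base class, the sample leaves config.language unset and multilingual models detect the spoken language automatically. Below is a minimal sketch of the resulting sample flow; it assumes the sample's audio_utils.hpp helper (utils::audio::read_wav) and a Whisper model exported for ov::genai::WhisperPipeline, and is illustrative rather than a verbatim copy of the sample.

#include <cstdlib>
#include <iostream>
#include <stdexcept>

#include "audio_utils.hpp"  // sample helper providing utils::audio::read_wav
#include "openvino/genai/whisper_pipeline.hpp"

int main(int argc, char* argv[]) try {
    if (argc < 3) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<WAV_FILE_PATH>\"");
    }

    ov::genai::WhisperPipeline pipeline(argv[1], "CPU");

    ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config();
    config.max_new_tokens = 100;  // increase this based on your speech length
    // config.language is intentionally left unset: multilingual models now pick
    // the language through the shared WhisperDecoder::detect_language path.
    config.task = "transcribe";
    config.return_timestamps = true;

    ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(argv[2]);
    auto result = pipeline.generate(raw_speech, config);

    std::cout << result << "\n";
    return 0;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return EXIT_FAILURE;
}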