|
| 1 | +# |
| 2 | +# Copyright (c) 2024–2025, Daily |
| 3 | +# |
| 4 | +# SPDX-License-Identifier: BSD 2-Clause License |
| 5 | +# |
| 6 | + |
| 7 | +"""Configuration for the AWS Transcribe STT service.""" |
| 8 | + |
| 9 | +from typing import List, Optional |
| 10 | + |
| 11 | +from pydantic import BaseModel |
| 12 | + |
| 13 | +from pipecat.transcriptions.language import Language |
| 14 | + |
| 15 | + |
class AWSInputParams(BaseModel):
    """Configuration parameters for the AWS Transcribe STT service.

    Parameters:
        sample_rate: Audio sample rate in Hz. Must be 8000 or 16000.
            Defaults to 16000.
        language_code: Language for transcription. Cannot be used with
            identify_multiple_languages. Defaults to None.
        language_options: List of languages for multi-language
            identification. Required when identify_multiple_languages is
            True. Defaults to None.
        identify_multiple_languages: Enable multiple language
            identification. Defaults to False.
        identify_language: Enable language identification. Defaults to False.
        preferred_language: Preferred language from language_options to
            speed up identification. Defaults to None.
        enable_partial_results_stabilization: Enable stabilization of
            partial results. Defaults to True.
        partial_results_stability: Stability level: "low" (faster, less
            stable) or "high" (slower, more stable). Defaults to "high".
        media_encoding: Audio encoding format. Defaults to "linear16".
        number_of_channels: Number of audio channels. Defaults to 1.
        show_speaker_label: Enable speaker identification in transcription
            results. Defaults to False.
        enable_channel_identification: Enable channel identification for
            multi-channel audio. Defaults to False.
        vocabulary_name: Name of custom vocabulary to use for improved
            transcription accuracy. Defaults to None.
        vocabulary_filter_name: Name of vocabulary filter to apply for
            content filtering. Defaults to None.

    Note:
        The default partial_results_stability is "high". For real-time
        conversations, set partial_results_stability="low" explicitly for
        faster responses.
        Multi-language identification may have higher latency than single
        language mode.
        The constraints described above (allowed sample rates, mutually
        exclusive language options) are not enforced by this model itself;
        fields are plain typed attributes with defaults.
    """

    sample_rate: int = 16000
    language_code: Optional[Language] = None
    language_options: Optional[List[Language]] = None
    identify_multiple_languages: bool = False
    identify_language: bool = False
    preferred_language: Optional[Language] = None
    enable_partial_results_stabilization: bool = True
    # NOTE(review): the docstring previously advertised "low" as the default;
    # the declared default is "high", which is kept to avoid changing behavior.
    partial_results_stability: str = "high"
    media_encoding: str = "linear16"
    number_of_channels: int = 1
    show_speaker_label: bool = False
    enable_channel_identification: bool = False
    vocabulary_name: Optional[str] = None
    vocabulary_filter_name: Optional[str] = None
0 commit comments