Skip to content

Commit 1b18abb

Browse files
add language detection for AWS Transcribe Service
1 parent 0588c82 commit 1b18abb

File tree

4 files changed

+384
-258
lines changed

4 files changed

+384
-258
lines changed

src/pipecat/services/aws/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from pipecat.services import DeprecatedModuleProxy
1010

11+
from .config import *
1112
from .llm import *
1213
from .nova_sonic import *
1314
from .stt import *

src/pipecat/services/aws/config.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#
2+
# Copyright (c) 2024–2025, Daily
3+
#
4+
# SPDX-License-Identifier: BSD 2-Clause License
5+
#
6+
7+
"""Configuration for the AWS Transcribe STT service."""
8+
9+
from typing import List, Optional
10+
11+
from pydantic import BaseModel
12+
13+
from pipecat.transcriptions.language import Language
14+
15+
16+
class AWSInputParams(BaseModel):
17+
"""Configuration parameters for the AWS Transcribe STT service.
18+
19+
Parameters:
20+
sample_rate: Audio sample rate in Hz. Must be 8000 or 16000. Defaults to 16000.
21+
language_code: Language for transcription. Cannot be used with identify_multiple_languages.
22+
language_options: List of languages for multi-language identification. Required when identify_multiple_languages is True.
23+
identify_multiple_languages: Enable multiple language identification. Defaults to False.
24+
identify_language: Enable language identification. Defaults to False.
25+
preferred_language: Preferred language from language_options to speed up identification.
26+
enable_partial_results_stabilization: Enable stabilization of partial results. Defaults to True.
27+
partial_results_stability: Stability level: "low" (faster, less stable) or "high" (slower, more stable). Defaults to "high".
28+
media_encoding: Audio encoding format. Defaults to "linear16".
29+
number_of_channels: Number of audio channels. Defaults to 1.
30+
show_speaker_label: Enable speaker identification in transcription results. Defaults to False.
31+
enable_channel_identification: Enable channel identification for multi-channel audio. Defaults to False.
32+
vocabulary_name: Name of custom vocabulary to use for improved transcription accuracy.
33+
vocabulary_filter_name: Name of vocabulary filter to apply for content filtering.
34+
35+
Note:
36+
For real-time conversations, use partial_results_stability="low" for faster responses.
37+
Multi-language identification may have higher latency than single language mode. This model only declares field types and defaults; it defines no validators, so the value constraints described above are not enforced here.
38+
"""
39+
40+
sample_rate: int = 16000
41+
language_code: Optional[Language] = None
42+
language_options: Optional[List[Language]] = None
43+
identify_multiple_languages: bool = False
44+
identify_language: bool = False
45+
preferred_language: Optional[Language] = None
46+
enable_partial_results_stabilization: bool = True
47+
partial_results_stability: str = "high"
48+
media_encoding: str = "linear16"
49+
number_of_channels: int = 1
50+
show_speaker_label: bool = False
51+
enable_channel_identification: bool = False
52+
vocabulary_name: Optional[str] = None
53+
vocabulary_filter_name: Optional[str] = None

0 commit comments

Comments
 (0)