From 27b0c6936c36375fdc27d8ef73d7d91754e7372e Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Tue, 14 Oct 2025 12:14:35 +0100 Subject: [PATCH 1/8] feat: adjust relatime examples for Realtime API Beta --- examples/realtime/azure_realtime.py | 61 ++++++++++++++++++++++----- examples/realtime/push_to_talk_app.py | 38 ++++++++++++----- 2 files changed, 78 insertions(+), 21 deletions(-) mode change 100644 => 100755 examples/realtime/azure_realtime.py diff --git a/examples/realtime/azure_realtime.py b/examples/realtime/azure_realtime.py old mode 100644 new mode 100755 index 3cf64b8be9..27f21136e2 --- a/examples/realtime/azure_realtime.py +++ b/examples/realtime/azure_realtime.py @@ -1,5 +1,29 @@ -import os +#!/usr/bin/env uv run +# +# /// script +# requires-python = ">=3.9" +# dependencies = [ +# "textual", +# "numpy", +# "pyaudio", +# "pydub", +# "sounddevice", +# "openai[realtime]", +# "azure-identity", +# "aiohttp", +# "python-dotenv", +# ] +# +# [tool.uv.sources] +# openai = { path = "../../", editable = true } +# /// + +from dotenv import load_dotenv +load_dotenv() + import asyncio +import base64 +import os from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider @@ -22,20 +46,23 @@ async def main() -> None: credential = DefaultAzureCredential() client = AsyncAzureOpenAI( + azure_deployment="gpt-realtime", azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], azure_ad_token_provider=get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default"), - api_version="2024-10-01-preview", + api_version="2025-04-01-preview", ) - async with client.realtime.connect( + + async with client.beta.realtime.connect( model="gpt-realtime", # deployment name for your model ) as connection: await connection.session.update( session={ - "output_modalities": ["text"], - "model": "gpt-realtime", - "type": "realtime", + # "output_modalities": ["text"], + # "model": "gpt-realtime", + # "type": "realtime", } ) + while True: user_input = input("Enter a message: ") if user_input == "q": @@ -48,14 +75,28 @@ async def main() -> None: "content": [{"type": "input_text", "text": user_input}], } ) + await connection.response.create() async for event in connection: - if event.type == "response.output_text.delta": + print(f"Event: {event.type}") + + if event.type == "error": + print(f"ERROR: {event}") + + if event.type == "response.text.delta": print(event.delta, flush=True, end="") - elif event.type == "response.output_text.done": + if event.type == "response.text.done": print() - elif event.type == "response.done": - break + if event.type == "response.done": + print(f"final response: {event.response.output[0].content[0].transcript}") + print(f"usage: {event.response.usage}") + + if event.type == "response.audio.delta": + audio_data = base64.b64decode(event.delta) + print(f"Received {len(audio_data)} bytes of audio data.") + + if event.type == "response.audio_transcript.delta": + print(f"Received text delta: {event.delta}") await credential.close() diff --git a/examples/realtime/push_to_talk_app.py b/examples/realtime/push_to_talk_app.py index acf38995b2..880addc24f 100755 --- a/examples/realtime/push_to_talk_app.py +++ b/examples/realtime/push_to_talk_app.py @@ -18,6 +18,9 @@ # "pydub", # "sounddevice", # "openai[realtime]", +# "azure-identity", +# "aiohttp", +# "python-dotenv", # ] # # [tool.uv.sources] @@ -25,21 +28,26 @@ # /// from __future__ import annotations +from dotenv import load_dotenv +load_dotenv() + import base64 import asyncio +import os from typing import Any, 
cast from typing_extensions import override from textual import events from audio_util import CHANNELS, SAMPLE_RATE, AudioPlayerAsync from textual.app import App, ComposeResult -from textual.widgets import Button, Static, RichLog +from textual.widgets import Static, RichLog from textual.reactive import reactive from textual.containers import Container -from openai import AsyncOpenAI -from openai.types.realtime.session import Session +from openai import AsyncAzureOpenAI +from openai.types.realtime.session_update_event import Session from openai.resources.realtime.realtime import AsyncRealtimeConnection +from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider class SessionDisplay(Static): @@ -60,7 +68,9 @@ class AudioStatusIndicator(Static): @override def render(self) -> str: status = ( - "🔴 Recording... (Press K to stop)" if self.is_recording else "⚪ Press K to start recording (Q to quit)" + "🔴 Recording... (Press K to stop)" + if self.is_recording + else "⚪ Press K to start recording (Q to quit)" ) return status @@ -123,7 +133,7 @@ class RealtimeApp(App[None]): } """ - client: AsyncOpenAI + client: AsyncAzureOpenAI should_send_audio: asyncio.Event audio_player: AudioPlayerAsync last_audio_item_id: str | None @@ -135,7 +145,15 @@ def __init__(self) -> None: super().__init__() self.connection = None self.session = None - self.client = AsyncOpenAI() + credential = DefaultAzureCredential() + self.client = AsyncAzureOpenAI( + azure_deployment="gpt-realtime", + azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], + azure_ad_token_provider=get_bearer_token_provider( + credential, "https://cognitiveservices.azure.com/.default" + ), + api_version="2025-04-01-preview", + ) self.audio_player = AudioPlayerAsync() self.last_audio_item_id = None self.should_send_audio = asyncio.Event() @@ -247,7 +265,9 @@ async def send_mic_audio(self) -> None: asyncio.create_task(connection.send({"type": "response.cancel"})) sent_audio = True - await connection.input_audio_buffer.append(audio=base64.b64encode(cast(Any, data)).decode("utf-8")) + await connection.input_audio_buffer.append( + audio=base64.b64encode(cast(Any, data)).decode("utf-8") + ) await asyncio.sleep(0) except KeyboardInterrupt: @@ -258,10 +278,6 @@ async def send_mic_audio(self) -> None: async def on_key(self, event: events.Key) -> None: """Handle key press events.""" - if event.key == "enter": - self.query_one(Button).press() - return - if event.key == "q": self.exit() return From 14b2a2e3522048006b127798cb677f3807a6f0f4 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Tue, 14 Oct 2025 12:30:36 +0100 Subject: [PATCH 2/8] fix: migrate push_to_talk_app to Beta --- examples/realtime/push_to_talk_app.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/realtime/push_to_talk_app.py b/examples/realtime/push_to_talk_app.py index 880addc24f..6a122ef997 100755 --- a/examples/realtime/push_to_talk_app.py +++ b/examples/realtime/push_to_talk_app.py @@ -172,21 +172,21 @@ async def on_mount(self) -> None: self.run_worker(self.send_mic_audio()) async def handle_realtime_connection(self) -> None: - async with self.client.realtime.connect(model="gpt-realtime") as conn: + async with self.client.beta.realtime.connect(model="gpt-realtime") as conn: self.connection = conn self.connected.set() # note: this is the default and can be omitted # if you want to manually handle VAD yourself, then set `'turn_detection': None` - await conn.session.update( - session={ - "audio": { - "input": 
{"turn_detection": {"type": "server_vad"}}, - }, - "model": "gpt-realtime", - "type": "realtime", - } - ) + # await conn.session.update( + # session={ + # "audio": { + # "input": {"turn_detection": {"type": "server_vad"}}, + # }, + # "model": "gpt-realtime", + # "type": "realtime", + # } + # ) acc_items: dict[str, Any] = {} @@ -202,7 +202,7 @@ async def handle_realtime_connection(self) -> None: self.session = event.session continue - if event.type == "response.output_audio.delta": + if event.type == "response.audio.delta": if event.item_id != self.last_audio_item_id: self.audio_player.reset_frame_count() self.last_audio_item_id = event.item_id @@ -211,7 +211,7 @@ async def handle_realtime_connection(self) -> None: self.audio_player.add_data(bytes_data) continue - if event.type == "response.output_audio_transcript.delta": + if event.type == "response.audio_transcript.delta": try: text = acc_items[event.item_id] except KeyError: From 03a788bfb8f81a2f3c81304bd5306f169a98d608 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Tue, 14 Oct 2025 12:39:18 +0100 Subject: [PATCH 3/8] fix: formatting fixes --- examples/realtime/azure_realtime.py | 5 +++-- examples/realtime/push_to_talk_app.py | 15 ++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/realtime/azure_realtime.py b/examples/realtime/azure_realtime.py index 27f21136e2..8885714452 100755 --- a/examples/realtime/azure_realtime.py +++ b/examples/realtime/azure_realtime.py @@ -19,11 +19,12 @@ # /// from dotenv import load_dotenv + load_dotenv() -import asyncio -import base64 import os +import base64 +import asyncio from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider diff --git a/examples/realtime/push_to_talk_app.py b/examples/realtime/push_to_talk_app.py index 6a122ef997..97ee8b7300 100755 --- a/examples/realtime/push_to_talk_app.py +++ b/examples/realtime/push_to_talk_app.py @@ -29,11 +29,12 @@ from __future__ import annotations from dotenv import load_dotenv + load_dotenv() +import os import base64 import asyncio -import os from typing import Any, cast from typing_extensions import override @@ -42,12 +43,12 @@ from textual.app import App, ComposeResult from textual.widgets import Static, RichLog from textual.reactive import reactive +from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider from textual.containers import Container from openai import AsyncAzureOpenAI -from openai.types.realtime.session_update_event import Session from openai.resources.realtime.realtime import AsyncRealtimeConnection -from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider +from openai.types.realtime.session_update_event import Session class SessionDisplay(Static): @@ -68,9 +69,7 @@ class AudioStatusIndicator(Static): @override def render(self) -> str: status = ( - "🔴 Recording... (Press K to stop)" - if self.is_recording - else "⚪ Press K to start recording (Q to quit)" + "🔴 Recording... 
(Press K to stop)" if self.is_recording else "⚪ Press K to start recording (Q to quit)" ) return status @@ -265,9 +264,7 @@ async def send_mic_audio(self) -> None: asyncio.create_task(connection.send({"type": "response.cancel"})) sent_audio = True - await connection.input_audio_buffer.append( - audio=base64.b64encode(cast(Any, data)).decode("utf-8") - ) + await connection.input_audio_buffer.append(audio=base64.b64encode(cast(Any, data)).decode("utf-8")) await asyncio.sleep(0) except KeyboardInterrupt: From 4d39eb780cdc5ca5c1163ae05ab0bc69c7299b9b Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Tue, 14 Oct 2025 13:33:31 +0100 Subject: [PATCH 4/8] fix: remove dead code --- examples/realtime/audio_util.py | 57 +-------------------------------- 1 file changed, 1 insertion(+), 56 deletions(-) diff --git a/examples/realtime/audio_util.py b/examples/realtime/audio_util.py index 954a508675..d8840bc411 100644 --- a/examples/realtime/audio_util.py +++ b/examples/realtime/audio_util.py @@ -1,18 +1,13 @@ from __future__ import annotations import io -import base64 -import asyncio import threading -from typing import Callable, Awaitable import numpy as np import pyaudio import sounddevice as sd from pydub import AudioSegment -from openai.resources.realtime.realtime import AsyncRealtimeConnection - CHUNK_LENGTH_S = 0.05 # 100ms SAMPLE_RATE = 24000 FORMAT = pyaudio.paInt16 @@ -89,54 +84,4 @@ def stop(self): self.queue = [] def terminate(self): - self.stream.close() - - -async def send_audio_worker_sounddevice( - connection: AsyncRealtimeConnection, - should_send: Callable[[], bool] | None = None, - start_send: Callable[[], Awaitable[None]] | None = None, -): - sent_audio = False - - device_info = sd.query_devices() - print(device_info) - - read_size = int(SAMPLE_RATE * 0.02) - - stream = sd.InputStream( - channels=CHANNELS, - samplerate=SAMPLE_RATE, - dtype="int16", - ) - stream.start() - - try: - while True: - if stream.read_available < read_size: - await asyncio.sleep(0) - continue - - data, _ = stream.read(read_size) - - if should_send() if should_send else True: - if not sent_audio and start_send: - await start_send() - await connection.send( - {"type": "input_audio_buffer.append", "audio": base64.b64encode(data).decode("utf-8")} - ) - sent_audio = True - - elif sent_audio: - print("Done, triggering inference") - await connection.send({"type": "input_audio_buffer.commit"}) - await connection.send({"type": "response.create", "response": {}}) - sent_audio = False - - await asyncio.sleep(0) - - except KeyboardInterrupt: - pass - finally: - stream.stop() - stream.close() + self.stream.close() \ No newline at end of file From a145f0cbc621db1f65850ddc6eadba076c09381b Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Thu, 16 Oct 2025 12:22:58 +0100 Subject: [PATCH 5/8] fix: add debug logging --- examples/realtime/azure_realtime.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/realtime/azure_realtime.py b/examples/realtime/azure_realtime.py index 8885714452..91ba3e6267 100755 --- a/examples/realtime/azure_realtime.py +++ b/examples/realtime/azure_realtime.py @@ -18,6 +18,7 @@ # openai = { path = "../../", editable = true } # /// +import logging from dotenv import load_dotenv load_dotenv() @@ -36,6 +37,14 @@ # Supported models and API versions: https://learn.microsoft.com/azure/ai-services/openai/how-to/realtime-audio#supported-models # Entra ID auth: https://learn.microsoft.com/azure/ai-services/openai/how-to/managed-identity +logging.getLogger().setLevel(logging.DEBUG) 
+logging.getLogger("websockets").setLevel(logging.DEBUG) + +logging.basicConfig( + format="%(asctime)s %(message)s", + level=logging.DEBUG, +) + async def main() -> None: """The following example demonstrates how to configure Azure OpenAI to use the Realtime API. From 6e5ab7f58ef3d6157a0c2a224f84a6b240d5af12 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Thu, 16 Oct 2025 16:02:38 +0100 Subject: [PATCH 6/8] feat: add AZURE_OPENAI_API_KEY env var --- examples/realtime/azure_realtime.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/realtime/azure_realtime.py b/examples/realtime/azure_realtime.py index 91ba3e6267..17b0e50bc7 100755 --- a/examples/realtime/azure_realtime.py +++ b/examples/realtime/azure_realtime.py @@ -55,10 +55,17 @@ async def main() -> None: """ credential = DefaultAzureCredential() + + if not (api_key := os.environ.get("AZURE_OPENAI_API_KEY")): + token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default") + else: + token_provider = None + client = AsyncAzureOpenAI( azure_deployment="gpt-realtime", azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], - azure_ad_token_provider=get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default"), + azure_ad_token_provider=token_provider, + api_key=api_key, api_version="2025-04-01-preview", ) From 778c3afcf410f380279551622f6852a4cdf4caa2 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Fri, 17 Oct 2025 12:01:00 +0100 Subject: [PATCH 7/8] fix: minor fix --- examples/realtime/azure_realtime.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/realtime/azure_realtime.py b/examples/realtime/azure_realtime.py index 17b0e50bc7..131c8a35ae 100755 --- a/examples/realtime/azure_realtime.py +++ b/examples/realtime/azure_realtime.py @@ -20,6 +20,7 @@ import logging from dotenv import load_dotenv +import httpx load_dotenv() @@ -61,13 +62,22 @@ async def main() -> None: else: token_provider = None + endpoint = httpx.URL(os.environ["AZURE_OPENAI_ENDPOINT"]) + if endpoint.scheme in ("ws", "wss"): + websocket_base_url, azure_endpoint = f"{endpoint}/openai", None + else: + websocket_base_url, azure_endpoint = None, endpoint + + print(f"{websocket_base_url=}, {azure_endpoint=}") + client = AsyncAzureOpenAI( azure_deployment="gpt-realtime", - azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], + azure_endpoint=str(azure_endpoint), + websocket_base_url=websocket_base_url, azure_ad_token_provider=token_provider, api_key=api_key, - api_version="2025-04-01-preview", - ) + api_version="2025-04-01-preview" + ) # type: ignore async with client.beta.realtime.connect( model="gpt-realtime", # deployment name for your model From 849fe02a5c04a5cd4ebd6361085c70edb5252b09 Mon Sep 17 00:00:00 2001 From: Anton Dubovik Date: Fri, 17 Oct 2025 13:04:25 +0100 Subject: [PATCH 8/8] fix: fixed push_to_talk_app --- examples/realtime/push_to_talk_app.py | 30 ++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/examples/realtime/push_to_talk_app.py b/examples/realtime/push_to_talk_app.py index 97ee8b7300..c2837a6646 100755 --- a/examples/realtime/push_to_talk_app.py +++ b/examples/realtime/push_to_talk_app.py @@ -29,6 +29,7 @@ from __future__ import annotations from dotenv import load_dotenv +import httpx load_dotenv() @@ -144,15 +145,30 @@ def __init__(self) -> None: super().__init__() self.connection = None self.session = None - credential = DefaultAzureCredential() + + if not (api_key := 
os.environ.get("AZURE_OPENAI_API_KEY")): + credential = DefaultAzureCredential() + token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default") + else: + token_provider = None + + endpoint = httpx.URL(os.environ["AZURE_OPENAI_ENDPOINT"]) + if endpoint.scheme in ("ws", "wss"): + websocket_base_url, azure_endpoint = f"{endpoint}/openai", None + else: + websocket_base_url, azure_endpoint = None, endpoint + + print(f"{websocket_base_url=}, {azure_endpoint=}") + self.client = AsyncAzureOpenAI( azure_deployment="gpt-realtime", - azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], - azure_ad_token_provider=get_bearer_token_provider( - credential, "https://cognitiveservices.azure.com/.default" - ), - api_version="2025-04-01-preview", - ) + azure_endpoint=str(azure_endpoint), + websocket_base_url=websocket_base_url, + azure_ad_token_provider=token_provider, + api_key=api_key, + api_version="2025-04-01-preview" + ) # type: ignore + self.audio_player = AudioPlayerAsync() self.last_audio_item_id = None self.should_send_audio = asyncio.Event()