Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 51 additions & 10 deletions examples/realtime/azure_realtime.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,29 @@
import os
#!/usr/bin/env uv run
#
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "textual",
# "numpy",
# "pyaudio",
# "pydub",
# "sounddevice",
# "openai[realtime]",
# "azure-identity",
# "aiohttp",
# "python-dotenv",
# ]
#
# [tool.uv.sources]
# openai = { path = "../../", editable = true }
# ///

from dotenv import load_dotenv
load_dotenv()

import asyncio
import base64
import os

from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider

Expand All @@ -22,20 +46,23 @@ async def main() -> None:

credential = DefaultAzureCredential()
client = AsyncAzureOpenAI(
azure_deployment="gpt-realtime",
azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
azure_ad_token_provider=get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default"),
api_version="2024-10-01-preview",
api_version="2025-04-01-preview",
)
async with client.realtime.connect(

async with client.beta.realtime.connect(
model="gpt-realtime", # deployment name for your model
) as connection:
await connection.session.update(
session={
"output_modalities": ["text"],
"model": "gpt-realtime",
"type": "realtime",
# "output_modalities": ["text"],
# "model": "gpt-realtime",
# "type": "realtime",
}
)

while True:
user_input = input("Enter a message: ")
if user_input == "q":
Expand All @@ -48,14 +75,28 @@ async def main() -> None:
"content": [{"type": "input_text", "text": user_input}],
}
)

await connection.response.create()
async for event in connection:
if event.type == "response.output_text.delta":
print(f"Event: {event.type}")

if event.type == "error":
print(f"ERROR: {event}")

if event.type == "response.text.delta":
print(event.delta, flush=True, end="")
elif event.type == "response.output_text.done":
if event.type == "response.text.done":
print()
elif event.type == "response.done":
break
if event.type == "response.done":
print(f"final response: {event.response.output[0].content[0].transcript}")
print(f"usage: {event.response.usage}")

if event.type == "response.audio.delta":
audio_data = base64.b64decode(event.delta)
print(f"Received {len(audio_data)} bytes of audio data.")

if event.type == "response.audio_transcript.delta":
print(f"Received text delta: {event.delta}")

await credential.close()

Expand Down
38 changes: 27 additions & 11 deletions examples/realtime/push_to_talk_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,36 @@
# "pydub",
# "sounddevice",
# "openai[realtime]",
# "azure-identity",
# "aiohttp",
# "python-dotenv",
# ]
#
# [tool.uv.sources]
# openai = { path = "../../", editable = true }
# ///
from __future__ import annotations

from dotenv import load_dotenv
load_dotenv()

import base64
import asyncio
import os
from typing import Any, cast
from typing_extensions import override

from textual import events
from audio_util import CHANNELS, SAMPLE_RATE, AudioPlayerAsync
from textual.app import App, ComposeResult
from textual.widgets import Button, Static, RichLog
from textual.widgets import Static, RichLog
from textual.reactive import reactive
from textual.containers import Container

from openai import AsyncOpenAI
from openai.types.realtime.session import Session
from openai import AsyncAzureOpenAI
from openai.types.realtime.session_update_event import Session
from openai.resources.realtime.realtime import AsyncRealtimeConnection
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider


class SessionDisplay(Static):
Expand All @@ -60,7 +68,9 @@ class AudioStatusIndicator(Static):
@override
def render(self) -> str:
status = (
"🔴 Recording... (Press K to stop)" if self.is_recording else "⚪ Press K to start recording (Q to quit)"
"🔴 Recording... (Press K to stop)"
if self.is_recording
else "⚪ Press K to start recording (Q to quit)"
)
return status

Expand Down Expand Up @@ -123,7 +133,7 @@ class RealtimeApp(App[None]):
}
"""

client: AsyncOpenAI
client: AsyncAzureOpenAI
should_send_audio: asyncio.Event
audio_player: AudioPlayerAsync
last_audio_item_id: str | None
Expand All @@ -135,7 +145,15 @@ def __init__(self) -> None:
super().__init__()
self.connection = None
self.session = None
self.client = AsyncOpenAI()
credential = DefaultAzureCredential()
self.client = AsyncAzureOpenAI(
azure_deployment="gpt-realtime",
azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
azure_ad_token_provider=get_bearer_token_provider(
credential, "https://cognitiveservices.azure.com/.default"
),
api_version="2025-04-01-preview",
)
self.audio_player = AudioPlayerAsync()
self.last_audio_item_id = None
self.should_send_audio = asyncio.Event()
Expand Down Expand Up @@ -247,7 +265,9 @@ async def send_mic_audio(self) -> None:
asyncio.create_task(connection.send({"type": "response.cancel"}))
sent_audio = True

await connection.input_audio_buffer.append(audio=base64.b64encode(cast(Any, data)).decode("utf-8"))
await connection.input_audio_buffer.append(
audio=base64.b64encode(cast(Any, data)).decode("utf-8")
)

await asyncio.sleep(0)
except KeyboardInterrupt:
Expand All @@ -258,10 +278,6 @@ async def send_mic_audio(self) -> None:

async def on_key(self, event: events.Key) -> None:
"""Handle key press events."""
if event.key == "enter":
self.query_one(Button).press()
return

if event.key == "q":
self.exit()
return
Expand Down