Skip to content

Speaker Identification #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ htmlcov/

# Poetry
.venv/
poetry.lock

# React
node_modules/
Expand All @@ -52,3 +51,4 @@ build/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
tmp
Empty file added db/__init__.py
Empty file.
12 changes: 10 additions & 2 deletions db/create_tables.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
from src.models.meeting import Meeting
from botocore.exceptions import ClientError

def create_tables():
    """Create the DynamoDB tables the application needs, tolerating existing ones.

    The ``Meeting`` table is created with on-demand (``PAY_PER_REQUEST``)
    billing and the call waits for creation to finish (``wait=True``).

    Raises:
        ClientError: Re-raised for any error whose code is not
            ``ResourceInUseException``.
    """
    print("Creating DynamoDB tables if they don't exist...")
    try:
        # Single, guarded creation attempt: an unguarded call ahead of this
        # ``try`` would raise before the "already exists" handling could run.
        Meeting.create_table(wait=True, billing_mode="PAY_PER_REQUEST")
    except ClientError as e:
        # ResourceInUseException is treated as "table already exists";
        # every other error is a real failure and must propagate.
        if e.response["Error"]["Code"] != "ResourceInUseException":
            raise
        print("Meeting table already exists - skipping creation.")
    else:
        print("Meeting table created!")
    print("All tables ready!")


if __name__ == "__main__":
    create_tables()
14 changes: 14 additions & 0 deletions example.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
######## For Files ########
## Use local files
ENV=local

## For S3 bucket access
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_REGION=us-east-1
AWS_BUCKET=tulsa-city-council
MINUTES_FOLDER=minutes

######## For Speaker Identification ########
OPENAI_API_KEY=
GOOGLE_API_KEY=
34 changes: 34 additions & 0 deletions flows/check_for_new_minutes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from datetime import datetime
from prefect import flow

from tasks.minutes import get_new_minutes



@flow(log_prints=True)
async def check_for_new_minutes():
    """Fetch newly published minutes and pick out meetings awaiting speaker IDs.

    Work in progress: the last processed page and the list of meetings that
    still need speaker identification are hard-coded until the database
    accessors exist. For each matching set of minutes the remaining steps are
    listed as TODOs and nothing is done yet.
    """
    # TODO: Get the last run page from the database
    # last_run_page = get_last_run_page()
    last_run_page = 47711
    new_minutes, new_last_run_page = await get_new_minutes(last_run_page)
    # TODO: Save last run page to the database
    # set_last_run_page(new_last_run_page)
    # TODO: get meetings with no speaker ids from db
    meetings = [datetime(2025, 2, 26, 17, 0)]
    # TODO: get transcriptions from db
    for minutes in new_minutes:
        # Guard clause keeps the loop body syntactically valid while the
        # processing steps below are still only TODO comments.
        if minutes.meeting_date not in meetings:
            continue
        # TODO: Get diarisation
        # diarisation = await get_diarisation(minutes)
        # TODO: Get the transcription instead of diarisation
        # transcription = get_transcriptions(meeting)
        # TODO: Get the speaker names
        # new_transcription = get_speaker_names(minutes, diarisation)
        # TODO: Replace transcription
        # replace_transcription(new_transcription)


if __name__ == "__main__":
    import asyncio

    asyncio.run(check_for_new_minutes())
1 change: 1 addition & 0 deletions notebooks/experiments/minutes_diarization/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
test_data/
Binary file not shown.
170 changes: 170 additions & 0 deletions notebooks/experiments/minutes_diarization/process_min_diarization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from openai import OpenAI
from google import genai
from google.genai import types
import json

import tiktoken

# Initialize OpenAI client
# Module-level client, constructed at import time and used by
# match_speakers_with_chatgpt().
# NOTE(review): no key is passed here, so the client presumably reads
# OPENAI_API_KEY from the environment (see example.env) — confirm. If so,
# importing this module fails without that key even when only Gemini is used.
client = OpenAI()


def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, one page after another, separated by newlines.

    Args:
        pdf_path: Location of the PDF; converted with ``str`` before being
            handed to ``PyPDFLoader``.

    Returns:
        The ``page_content`` of every loaded page joined with ``"\\n"``.
    """
    documents = PyPDFLoader(str(pdf_path)).load()
    page_texts = [document.page_content for document in documents]
    return "\n".join(page_texts)


def get_diarization(diarization_path=None):
    """Load diarization data from a JSON file.

    Args:
        diarization_path: Optional ``str`` or ``Path`` of the diarized JSON
            file. When omitted, the sample file of this experiment
            (``regular_council_meeting___2025_02_26.diarized.json``) is used,
            resolved relative to the current working directory.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    if diarization_path is None:
        diarization_path = "./notebooks/experiments/minutes_diarization/regular_council_meeting___2025_02_26.diarized.json"
    diarization_path = Path(diarization_path)
    if not diarization_path.exists():
        # Name the path so a wrong working directory is easy to diagnose.
        raise FileNotFoundError(
            f"Diarization JSON file not found: {diarization_path}"
        )

    # Explicit encoding: JSON is UTF-8, independent of the platform default.
    with diarization_path.open("r", encoding="utf-8") as f:
        return json.load(f)


def simplify_diarization(transcript_data):
    """Collapse diarization segments into a speaker-by-speaker script.

    Consecutive segments sharing the same ``speaker`` are merged into a single
    turn: the stripped segment texts are joined with spaces and the turn is
    stamped with the ``start`` of its first segment.

    Args:
        transcript_data: Mapping with a ``"segments"`` list; each segment is
            read for its ``"speaker"``, ``"text"`` and ``"start"`` keys.

    Returns:
        A string that begins with a title line followed by one
        ``[HH:MM:SS] speaker:`` entry per turn. Turns whose speaker value is
        falsy are left out.
    """

    def format_timestamp(seconds: float) -> str:
        """Convert seconds to HH:MM:SS format"""
        hours, remainder = divmod(seconds, 3600)
        minutes = remainder // 60
        secs = seconds % 60
        return f"{int(hours):02d}:{int(minutes):02d}:{int(secs):02d}"

    def render(turn):
        """Format one (speaker, start, texts) turn without a trailing newline."""
        speaker, start, texts = turn
        return f"[{format_timestamp(start)}] {speaker}:\n{' '.join(texts)}"

    # Adapted from the transcription creator's speaker-merging logic; the VTT
    # output could probably be used instead.
    # The sentinel turn stands for the "no speaker seen yet" starting state.
    turns = [(None, None, [])]
    for segment in transcript_data["segments"]:
        previous_speaker = turns[-1][0]
        if previous_speaker != segment["speaker"]:
            # Speaker changed: open a new turn.
            turns.append(
                (segment["speaker"], segment["start"], [segment["text"].strip()])
            )
        else:
            # Same speaker keeps talking: extend the open turn.
            turns[-1][2].append(segment["text"].strip())

    *earlier_turns, final_turn = turns
    speaker_lines = ["Meeting Script - Combined by Speaker"]
    # Every turn except the last carries an extra newline, which becomes a
    # blank separator line once everything is joined.
    speaker_lines.extend(render(turn) + "\n" for turn in earlier_turns if turn[0])
    if final_turn[0]:
        speaker_lines.append(render(final_turn))
    return "\n".join(speaker_lines)


def match_speakers_with_gemini(minutes_text, diarization):
    """Ask Gemini to map diarization speaker labels to names from the minutes.

    Args:
        minutes_text: Text of the meeting minutes, embedded in the prompt.
        diarization: Diarization transcript, embedded in the prompt.

    Returns:
        The text of the first part of the first response candidate, as
        returned by the model. The prompt requests a JSON object, but the
        reply is neither parsed nor validated here.
    """
    gemini = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

    instruction = f"""I have a city council meeting minutes document and a diarization of the audio recording.
The diarization has identified different speakers but doesn't know their names.
Please analyze the minutes text and match the speakers from the diarization with the names mentioned in the minutes.

Minutes text:
{minutes_text}

Diarization segments:
{diarization}

For each speaker in the diarization, please identify who they are based on the minutes text.
If you can't determine who they are, mark them as "Unknown".
Format your response as a JSON object where the keys are the speaker numbers (e.g., "SPEAKER_00")
and the values are the identified names or "Unknown".
"""
    # The full task (minutes + diarization) travels as the system instruction;
    # the user message sent below only triggers the answer.
    chat_config = types.GenerateContentConfig(system_instruction=instruction)
    session = gemini.chats.create(model="gemini-2.0-flash", config=chat_config)
    reply = session.send_message("Show me speaker identification")

    first_part = reply.candidates[0].content.parts[0]
    result = first_part.text
    print(f"result: {result}")
    return result


def match_speakers_with_chatgpt(minutes_text, diarization):
    """Ask ChatGPT to map diarization speaker labels to names from the minutes.

    Args:
        minutes_text: Text of the meeting minutes, embedded in the prompt.
        diarization: Diarization transcript, embedded in the prompt.

    Returns:
        The model's reply parsed with ``json.loads``. The request asks for a
        JSON object keyed by speaker label.
    """
    prompt = f"""I have a city council meeting minutes document and a diarization of the audio recording.
The diarization has identified different speakers but doesn't know their names.
Please analyze the minutes text and match the speakers from the diarization with the names mentioned in the minutes.

Minutes text:
{minutes_text}

Diarization segments:
{diarization}

For each speaker in the diarization, please identify who they are based on the minutes text.
If you can't determine who they are, mark them as "Unknown".
Format your response as a JSON object where the keys are the speaker numbers (e.g., "SPEAKER_00")
and the values are the identified names or "Unknown".
"""

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that analyzes meeting minutes and audio diarization to identify speakers.",
    }
    user_message = {"role": "user", "content": prompt}

    # Uses the module-level OpenAI client; JSON mode is requested so the reply
    # can be decoded directly.
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        response_format={"type": "json_object"},
    )

    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)


def main():
    """Run the speaker-matching experiment on the sample meeting.

    Reads the minutes PDF and the diarization JSON, prints the simplified
    diarization and the token counts of both inputs, then prints the speaker
    mapping returned by ChatGPT.
    """
    minutes_path = Path(
        "./notebooks/experiments/minutes_diarization/test_data/25-173-2_25-173-2 2025-02-26 5PM Minutes.pdf"
    )
    minutes_text = extract_text_from_pdf(minutes_path)

    # Load the raw diarization and condense it into a per-speaker script.
    simple_diarization = simplify_diarization(get_diarization())
    print(simple_diarization)

    # Report the prompt size in tokens for each input.
    encoding = tiktoken.encoding_for_model("gpt-4o-mini")
    diarization_tokens = encoding.encode(str(simple_diarization))
    print(f"Diarization segments length: {len(diarization_tokens)}")
    minutes_tokens = encoding.encode(minutes_text)
    print(f"Minutes text length: {len(minutes_tokens)}")

    # Use ChatGPT to match speakers
    speaker_matches = match_speakers_with_chatgpt(minutes_text, simple_diarization)
    print(speaker_matches)


if __name__ == "__main__":
    main()
Loading