codefortulsa · kendallcorner · Apr 7, 2025 · Apr 7, 2025 · Apr 8, 2025 · Apr 8, 2025
diff --git a/.gitignore b/.gitignore
@@ -39,7 +39,6 @@ htmlcov/
 
 # Poetry
 .venv/
-poetry.lock
 
 # React
 node_modules/
@@ -52,3 +51,4 @@ build/
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
+tmp
diff --git a/db/__init__.py b/db/__init__.py
diff --git a/db/create_tables.py b/db/create_tables.py
@@ -1,9 +1,17 @@
 from src.models.meeting import Meeting
+from botocore.exceptions import ClientError
 
 def create_tables():
+    """Create the DynamoDB table for Meeting, tolerating a table that already exists.
+
+    Calls Meeting.create_table with wait=True and on-demand
+    ("PAY_PER_REQUEST") billing. A ClientError whose error code is
+    'ResourceInUseException' is reported as "already exists" and skipped;
+    any other ClientError is re-raised unchanged.
+    """
     print("Creating DynamoDB tables if they don't exist...")
-    Meeting.create_table(wait=True, billing_mode="PAY_PER_REQUEST")
-    print("All tables created!")
+    try:
+        Meeting.create_table(wait=True, billing_mode="PAY_PER_REQUEST")
+        print("Meeting table created!")
+    except ClientError as e:
+        # Only the "table already exists" case is safe to ignore; everything
+        # else must reach the caller.
+        if e.response['Error']['Code'] == 'ResourceInUseException':
+            print("Meeting table already exists - skipping creation.")
+        else:
+            raise
+    print("All tables ready!")
 
 if __name__ == "__main__":
+    # Allow running this module directly as a script to set up the tables.
     create_tables()
diff --git a/example.env b/example.env
@@ -0,0 +1,14 @@
+######## For Files ########
+## Use local files
+ENV=local
+
+##  For S3 bucket access
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+AWS_REGION=us-east-1
+AWS_BUCKET=tulsa-city-council
+MINUTES_FOLDER=minutes
+
+######## For Speaker Identification ########
+OPENAI_API_KEY=
+GOOGLE_API_KEY=
diff --git a/flows/check_for_new_minutes.py b/flows/check_for_new_minutes.py
@@ -0,0 +1,34 @@
+from datetime import datetime
+from prefect import flow
+
+from tasks.minutes import get_new_minutes
+
+
+
+@flow(log_prints=True)
+async def check_for_new_minutes():
+    """Prefect flow skeleton: fetch new minutes and pick the ones to process.
+
+    Awaits get_new_minutes(last_run_page), which is unpacked into the new
+    minutes and the updated last-run page, then walks the new minutes and
+    selects those whose meeting_date is in the list of meetings awaiting
+    speaker identification.
+
+    The last-run page and the meeting list are hard-coded placeholders, and
+    the per-meeting processing steps are still TODOs, so the flow currently
+    has no effect beyond the get_new_minutes call.
+    """
+    # TODO: Get the last run page from the database
+    # last_run_page = get_last_run_page()
+    last_run_page = 47711
+    new_minutes, new_last_run_page = await get_new_minutes(last_run_page)
+    # TODO: Save last run page to the database
+    # set_last_run_page(new_last_run_page)
+    # TODO: get meetings with no speaker ids from db
+    meetings = [datetime(2025, 2, 26, 17, 0)]
+    # TODO: get transcriptions from db
+    for minutes in new_minutes:
+        # Guard clause: a suite made only of comments is a syntax error, so
+        # the loop needs a real statement until the TODO steps are written.
+        if minutes.meeting_date not in meetings:
+            continue
+        # TODO: Get diarisation
+        # diarisation = await get_diarisation(minutes)
+        # TODO: Get the transcription instead of diarisation
+        # transcription = get_transcriptions(meeting)
+        # TODO: Get the speaker names
+        # new_transcription = get_speaker_names(minutes, diarisation)
+        # TODO: Replace transcription
+        # replace_transcription(new_diarization)
+
+
+if __name__ == "__main__":
+    # Script entry point: asyncio is imported here because only direct
+    # execution needs it to drive the async flow.
+    import asyncio
+
+    asyncio.run(check_for_new_minutes())
diff --git a/notebooks/experiments/minutes_diarization/.gitignore b/notebooks/experiments/minutes_diarization/.gitignore
@@ -0,0 +1 @@
+test_data/
diff --git a/notebooks/experiments/minutes_diarization/city_council_Feb_26_2025_5pm.pdf b/notebooks/experiments/minutes_diarization/city_council_Feb_26_2025_5pm.pdf
diff --git a/notebooks/experiments/minutes_diarization/process_min_diarization.py b/notebooks/experiments/minutes_diarization/process_min_diarization.py
@@ -0,0 +1,170 @@
+import os
+from pathlib import Path
+from langchain_community.document_loaders import PyPDFLoader
+from openai import OpenAI
+from google import genai
+from google.genai import types
+import json
+
+import tiktoken
+
+# Module-level OpenAI client, constructed with no arguments when this module
+# is imported. match_speakers_with_chatgpt() uses it;
+# match_speakers_with_gemini() binds its own local `client` instead.
+# NOTE(review): presumably reads OPENAI_API_KEY from the environment - confirm.
+client = OpenAI()
+
+
+def extract_text_from_pdf(pdf_path):
+    """Return the text of the PDF at *pdf_path*, one loaded page per line group.
+
+    The path is converted to str and loaded with PyPDFLoader; the
+    page_content of each loaded page is joined with a newline.
+    """
+    documents = PyPDFLoader(str(pdf_path)).load()
+    page_texts = [document.page_content for document in documents]
+    return "\n".join(page_texts)
+
+
+def get_diarization(diarization_path=None):
+    """Load diarization data from a JSON file.
+
+    Args:
+        diarization_path: Optional str or Path of the JSON file to read.
+            When omitted, the experiment's bundled 2025-02-26 meeting file is
+            used; that default is relative to the current working directory.
+
+    Returns:
+        The parsed JSON content.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+    """
+    if diarization_path is None:
+        diarization_path = Path(
+            "./notebooks/experiments/minutes_diarization/regular_council_meeting___2025_02_26.diarized.json"
+        )
+    diarization_path = Path(diarization_path)
+    if not diarization_path.exists():
+        raise FileNotFoundError(
+            f"Diarization JSON file not found: {diarization_path}"
+        )
+
+    # JSON is UTF-8 by specification; do not depend on the platform default.
+    with diarization_path.open("r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def simplify_diarization(transcript_data):
+    """Collapse diarized segments into one text entry per consecutive speaker turn.
+
+    Reads transcript_data["segments"], where each segment provides "speaker",
+    "text" and "start". Consecutive segments with the same speaker are merged:
+    their stripped texts are joined with single spaces and stamped with the
+    start of the first segment of the turn. A turn whose speaker value is
+    falsy is not emitted.
+
+    Returns a single string: a title line followed by entries of the form
+    "[HH:MM:SS] SPEAKER:" and the merged text on the next line. Every entry
+    emitted inside the loop carries a trailing newline (a blank line in the
+    joined output); the entry emitted after the loop does not.
+    """
+    # NOTE: adapted from John's transcription creator to simplify the
+    # diarization. Could become a shared function, or use the VTT instead.
+
+    def format_timestamp(seconds: float) -> str:
+        """Convert seconds to HH:MM:SS format"""
+        hh = int(seconds // 3600)
+        mm = int((seconds % 3600) // 60)
+        ss = int(seconds % 60)
+        return f"{hh:02d}:{mm:02d}:{ss:02d}"
+
+    def render_turn(speaker, started_at, pieces, suffix):
+        """Format one speaker turn; *suffix* is appended after the text."""
+        stamp = format_timestamp(started_at)
+        body = " ".join(pieces)
+        return f"[{stamp}] {speaker}:\n{body}{suffix}"
+
+    script = ["Meeting Script - Combined by Speaker"]
+    speaker = None
+    started_at = None
+    pieces = []
+
+    for seg in transcript_data["segments"]:
+        if seg["speaker"] == speaker:
+            # Same speaker keeps talking: extend the open turn.
+            pieces.append(seg["text"].strip())
+            continue
+
+        # Speaker changed: flush the open turn, then open a new one.
+        if speaker:
+            script.append(render_turn(speaker, started_at, pieces, "\n"))
+        speaker = seg["speaker"]
+        pieces = [seg["text"].strip()]
+        started_at = seg["start"]
+
+    # Flush the turn still open when the segments run out.
+    if speaker:
+        script.append(render_turn(speaker, started_at, pieces, ""))
+    return "\n".join(script)
+
+
+def match_speakers_with_gemini(minutes_text, diarization):
+    """Ask Gemini which named person each diarized speaker label belongs to.
+
+    The minutes text and the diarization are embedded in the system
+    instruction of a new chat, and one fixed user message requests the answer.
+
+    Returns the text of the first part of the first candidate, printed and
+    returned as-is. The prompt asks for a JSON object mapping speaker labels
+    to names, but this function neither parses nor validates the reply.
+    """
+    gemini = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
+
+    system_prompt = f"""I have a city council meeting minutes document and a diarization of the audio recording.
+The diarization has identified different speakers but doesn't know their names.
+Please analyze the minutes text and match the speakers from the diarization with the names mentioned in the minutes.
+
+Minutes text:
+{minutes_text}
+
+Diarization segments:
+{diarization}
+
+For each speaker in the diarization, please identify who they are based on the minutes text.
+If you can't determine who they are, mark them as "Unknown".
+Format your response as a JSON object where the keys are the speaker numbers (e.g., "SPEAKER_00")
+and the values are the identified names or "Unknown".
+"""
+    # The whole task lives in the system instruction; the user message below
+    # only triggers the response.
+    session = gemini.chats.create(
+        model="gemini-2.0-flash",
+        config=types.GenerateContentConfig(system_instruction=system_prompt),
+    )
+    reply = session.send_message("Show me speaker identification")
+
+    answer = reply.candidates[0].content.parts[0].text
+    print(f"result: {answer}")
+    return answer
+
+
+def match_speakers_with_chatgpt(minutes_text, diarization):
+    """Ask gpt-4o which named person each diarized speaker label belongs to.
+
+    Sends a fixed system message plus a user prompt embedding the minutes
+    text and the diarization through the module-level OpenAI client, with the
+    response format set to a JSON object.
+
+    Returns the first choice's message content parsed with json.loads.
+    """
+    user_prompt = f"""I have a city council meeting minutes document and a diarization of the audio recording.
+The diarization has identified different speakers but doesn't know their names.
+Please analyze the minutes text and match the speakers from the diarization with the names mentioned in the minutes.
+
+Minutes text:
+{minutes_text}
+
+Diarization segments:
+{diarization}
+
+For each speaker in the diarization, please identify who they are based on the minutes text.
+If you can't determine who they are, mark them as "Unknown".
+Format your response as a JSON object where the keys are the speaker numbers (e.g., "SPEAKER_00")
+and the values are the identified names or "Unknown".
+"""
+
+    conversation = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant that analyzes meeting minutes and audio diarization to identify speakers.",
+        },
+        {"role": "user", "content": user_prompt},
+    ]
+    completion = client.chat.completions.create(
+        model="gpt-4o",
+        messages=conversation,
+        response_format={"type": "json_object"},
+    )
+
+    raw_json = completion.choices[0].message.content
+    return json.loads(raw_json)
+
+
+def main():
+    """Run the speaker-matching experiment end to end, printing each result.
+
+    Extracts the minutes text from the test PDF, loads and simplifies the
+    diarization, prints the simplified script and the token counts of both
+    inputs (using the gpt-4o-mini tokenizer), then prints the speaker
+    matches returned by match_speakers_with_chatgpt.
+    """
+    pdf_file = Path(
+        "./notebooks/experiments/minutes_diarization/test_data/25-173-2_25-173-2 2025-02-26 5PM Minutes.pdf"
+    )
+    minutes_text = extract_text_from_pdf(pdf_file)
+
+    raw_diarization = get_diarization()
+    transcript = simplify_diarization(raw_diarization)
+    print(transcript)
+
+    # Report the size of each prompt input in tokens.
+    tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
+    transcript_tokens = len(tokenizer.encode(str(transcript)))
+    print(f"Diarization segments length: {transcript_tokens}")
+    minutes_tokens = len(tokenizer.encode(minutes_text))
+    print(f"Minutes text length: {minutes_tokens}")
+
+    matches = match_speakers_with_chatgpt(minutes_text, transcript)
+    print(matches)
+
+
+if __name__ == "__main__":
+    # Script entry point. The input paths used by main() are relative to the
+    # current working directory.
+    main()