From 385140ed2061131e7ddcf28dfd375f2628820e75 Mon Sep 17 00:00:00 2001 From: Shun Liang Date: Wed, 16 Oct 2024 22:37:42 +0100 Subject: [PATCH] Support Apple Podcast (#18) * Support Apple Podcast * Better prompting cleaning --- README.md | 1 + pyproject.toml | 2 +- src/yt2doc/extraction/extractor.py | 33 ++++---- src/yt2doc/extraction/file_cache.py | 34 ++++----- src/yt2doc/extraction/interfaces.py | 5 +- src/yt2doc/factories.py | 4 +- src/yt2doc/formatting/llm_topic_segmenter.py | 16 ++-- src/yt2doc/{youtube => i18n}/__init__.py | 0 src/yt2doc/i18n/punctuations.py | 42 ++++++++++ src/yt2doc/media/__init__.py | 0 src/yt2doc/{youtube => media}/interfaces.py | 8 +- .../media_info_extractor.py} | 22 +++--- src/yt2doc/transcription/interfaces.py | 11 ++- src/yt2doc/transcription/transcriber.py | 76 ++++++++----------- uv.lock | 8 +- 15 files changed, 148 insertions(+), 114 deletions(-) rename src/yt2doc/{youtube => i18n}/__init__.py (100%) create mode 100644 src/yt2doc/i18n/punctuations.py create mode 100644 src/yt2doc/media/__init__.py rename src/yt2doc/{youtube => media}/interfaces.py (72%) rename src/yt2doc/{youtube/yt_video_info_extractor.py => media/media_info_extractor.py} (82%) diff --git a/README.md b/README.md index e8fcccd..5bbbf5e 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ yt2doc transcribes videos online into readable Markdown documents. Supported video sources: * YouTube +* Apple Podcast * Twitter yt2doc is meant to work fully locally, without invoking any external API. The OpenAI SDK dependency is required solely to interact with [Ollama](https://github.com/ollama/ollama). diff --git a/pyproject.toml b/pyproject.toml index 1937aa8..7c29dae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ dependencies = [ "tqdm>=4.66.5", "typer-slim>=0.12.5", "wtpsplit>=2.0.8", - "yt-dlp>=2024.9.27", + "yt-dlp>=2024.10.07", ] [tool.uv] diff --git a/src/yt2doc/extraction/extractor.py b/src/yt2doc/extraction/extractor.py index 3e4addd..ce8ea23 100644 --- a/src/yt2doc/extraction/extractor.py +++ b/src/yt2doc/extraction/extractor.py @@ -1,7 +1,7 @@ import logging from yt2doc.timer import Timer -from yt2doc.youtube import interfaces as youtube_interfaces +from yt2doc.media import interfaces as youtube_interfaces from yt2doc.transcription import interfaces as transcription_interfaces from yt2doc.extraction import interfaces @@ -36,12 +36,7 @@ def extract_by_chapter( ) is not None ): - return interfaces.ChapteredTranscript( - url=video_url, - title=video_info.title, - chapters=cached_chaptered_transcript, - chaptered_at_source=len(video_info.chapters) > 0, - ) + return cached_chaptered_transcript with Timer() as yt_dlp_timer: audio_path = self.yt_dlp_adapter.extract_audio(video_url=video_url) @@ -49,30 +44,34 @@ def extract_by_chapter( logger.info(f"Video download and convert time: {yt_dlp_timer.seconds} seconds") with Timer() as transcribe_timer: + transcript = self.transcriber.transcribe( + audio_path=audio_path, + video_info=video_info, + ) transcripts_by_chapter = [ interfaces.TranscriptChapter( title=chapter.title, segments=chapter.segments ) - for chapter in self.transcriber.transcribe( - audio_path=audio_path, - video_info=video_info, - ) + for chapter in transcript.chapters ] logger.info(f"Transcription time: {transcribe_timer.seconds} seconds") - self.file_cache.cache_chaptered_transcript( - video_id=video_info.video_id, - chapters=transcripts_by_chapter, - ) - - return interfaces.ChapteredTranscript( + chaptered_transcript = interfaces.ChapteredTranscript( 
url=video_url, title=video_info.title, chapters=transcripts_by_chapter, chaptered_at_source=len(video_info.chapters) > 0, + language=transcript.language, + ) + + self.file_cache.cache_chaptered_transcript( + video_id=video_info.video_id, + transcript=chaptered_transcript, ) + return chaptered_transcript + def extract_playlist_by_chapter( self, playlist_url: str, skip_cache: bool ) -> interfaces.ChapteredTranscribedPlaylist: diff --git a/src/yt2doc/extraction/file_cache.py b/src/yt2doc/extraction/file_cache.py index 44e66ac..ad1aa49 100644 --- a/src/yt2doc/extraction/file_cache.py +++ b/src/yt2doc/extraction/file_cache.py @@ -5,13 +5,18 @@ from pathlib import Path -from pydantic import ValidationError +from pydantic import BaseModel, ValidationError from yt2doc.extraction import interfaces logger = logging.getLogger(__file__) +class CachedTranscript(BaseModel): + transcript: interfaces.ChapteredTranscript + meta: interfaces.MetaDict + + class FileCache: def __init__(self, cache_dir: Path, meta: interfaces.MetaDict) -> None: self.cache_dir = cache_dir @@ -22,7 +27,7 @@ def __init__(self, cache_dir: Path, meta: interfaces.MetaDict) -> None: def get_chaptered_transcript( self, video_id: str - ) -> typing.Optional[typing.Sequence[interfaces.TranscriptChapter]]: + ) -> typing.Optional[interfaces.ChapteredTranscript]: file_path = ( self.cache_dir / video_id @@ -33,33 +38,24 @@ def get_chaptered_transcript( return None with open(file_path, "r") as f: - chaptered_transcript_dict = json.load(f) + cached_transcript_dict = json.load(f) - chapter_dicts: typing.List[typing.Dict[str, typing.Any]] = ( - chaptered_transcript_dict["chapters"] - ) try: - return [ - interfaces.TranscriptChapter(**chapter) for chapter in chapter_dicts - ] + cached_transcript = CachedTranscript(**cached_transcript_dict) except ValidationError as e: logger.warning(f"Validation error while trying to read from cache: {e}") return None + return cached_transcript.transcript + def cache_chaptered_transcript( - self, - video_id: str, - chapters: typing.Sequence[interfaces.TranscriptChapter], + self, video_id: str, transcript: interfaces.ChapteredTranscript ) -> None: dir = self.cache_dir / video_id / "chaptered_transcript" dir.mkdir(exist_ok=True, parents=True) file_path = dir / f"{self.hashed_meta}.json" + transcript_to_cache = CachedTranscript(transcript=transcript, meta=self.meta) with open(file_path, "w+") as f: - json.dump( - { - "chapters": [chapter.model_dump() for chapter in chapters], - "meta": self.meta, - }, - f, - ) + serialized = transcript_to_cache.model_dump() + json.dump(serialized, f) diff --git a/src/yt2doc/extraction/interfaces.py b/src/yt2doc/extraction/interfaces.py index 945fb35..a9f1992 100644 --- a/src/yt2doc/extraction/interfaces.py +++ b/src/yt2doc/extraction/interfaces.py @@ -13,6 +13,7 @@ class TranscriptChapter(BaseModel): class ChapteredTranscript(BaseModel): url: str title: str + language: str chapters: typing.Sequence[TranscriptChapter] chaptered_at_source: bool @@ -29,11 +30,11 @@ class ChapteredTranscribedPlaylist(BaseModel): class IFileCache(typing.Protocol): def get_chaptered_transcript( self, video_id: str - ) -> typing.Optional[typing.Sequence[TranscriptChapter]]: ... + ) -> typing.Optional[ChapteredTranscript]: ... def cache_chaptered_transcript( self, video_id: str, - chapters: typing.Sequence[TranscriptChapter], + transcript: ChapteredTranscript, ) -> None: ... 
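The cache now round-trips the whole `ChapteredTranscript` (including the newly added `language` field) instead of a bare chapter list, which is why `extract_by_chapter` can return the cached object directly. A minimal sketch of the round-trip, assuming `MetaDict` accepts a plain dict; the video id, path, and meta values here are hypothetical:

```python
# Sketch only: video_id, cache path and meta values are made up for illustration.
from pathlib import Path

from yt2doc.extraction.file_cache import FileCache
from yt2doc.extraction.interfaces import ChapteredTranscript, TranscriptChapter

cache = FileCache(
    cache_dir=Path("/tmp/yt2doc-cache"),
    meta={"whisper_model": "base"},  # hashed into the cache filename
)

transcript = ChapteredTranscript(
    url="https://example.com/episode-1",
    title="Example Episode",
    language="en",  # now persisted alongside the chapters
    chapters=[TranscriptChapter(title="Intro", segments=[])],
    chaptered_at_source=False,
)

cache.cache_chaptered_transcript(video_id="abc123", transcript=transcript)

# A later run with the same meta gets the full ChapteredTranscript back,
# or None on a cache miss / validation failure against CachedTranscript.
cached = cache.get_chaptered_transcript(video_id="abc123")
assert cached is not None and cached.language == "en"
```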
diff --git a/src/yt2doc/factories.py b/src/yt2doc/factories.py index fed2b3b..4ae427a 100644 --- a/src/yt2doc/factories.py +++ b/src/yt2doc/factories.py @@ -6,7 +6,7 @@ from wtpsplit import SaT from openai import OpenAI -from yt2doc.youtube.yt_video_info_extractor import YtVideoInfoExtractor +from yt2doc.media.media_info_extractor import MediaInfoExtractor from yt2doc.transcription.transcriber import Transcriber from yt2doc.transcription import interfaces as transcription_interfaces from yt2doc.extraction.file_cache import FileCache @@ -57,7 +57,7 @@ def get_yt2doc( else: formatter = MarkdownFormatter(sat=sat) - video_info_extractor = YtVideoInfoExtractor(temp_dir=temp_dir) + video_info_extractor = MediaInfoExtractor(temp_dir=temp_dir) transcriber = Transcriber( temp_dir=temp_dir, whisper_adapter=whisper_adapter, diff --git a/src/yt2doc/formatting/llm_topic_segmenter.py b/src/yt2doc/formatting/llm_topic_segmenter.py index 6aeee7e..be30aa2 100644 --- a/src/yt2doc/formatting/llm_topic_segmenter.py +++ b/src/yt2doc/formatting/llm_topic_segmenter.py @@ -70,10 +70,16 @@ def validate_paragraph_indexes(v: typing.List[int]) -> typing.List[int]: unique_values = set(v) if len(unique_values) != len(v): raise ValueError("All elements must be unique") - if any(i <= 0 or i >= n for i in v): - raise ValueError( - f"All elements must be greater than 0 and less than {n}" - ) + for i in v: + if i <= 0: + raise ValueError( + f"All elements must be greater than 0 and less than {n}. Paragraph index {i} is less than or equal to 0" + ) + if i >= n: + raise ValueError( + f"All elements must be greater than 0 and less than {n}. Paragraph index {i} is greater or equal to {n}" + ) + return v class Result(BaseModel): @@ -88,7 +94,7 @@ class Result(BaseModel): { "role": "system", "content": """ - You are an smart assistant who reads paragraphs of text from an audio transcript and + You are a smart assistant who reads paragraphs of text from an audio transcript and find the paragraphs that significantly change topic from the previous paragraph. Make sure only mark paragraphs that talks about a VERY DIFFERENT topic from the previous one. 
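The reworked validator in the topic segmenter now names the paragraph index that is out of range instead of raising one generic message, which gives the LLM more actionable feedback when the structured output is retried. A standalone sketch of the same check (here `n`, the paragraph count, is passed explicitly; in the segmenter it is a closure variable):

```python
import typing


def validate_paragraph_indexes(v: typing.List[int], n: int) -> typing.List[int]:
    # Index 0 is the first paragraph and can never start a new topic segment,
    # and anything >= n points past the end of the chunk.
    unique_values = set(v)
    if len(unique_values) != len(v):
        raise ValueError("All elements must be unique")
    for i in v:
        if i <= 0:
            raise ValueError(
                f"All elements must be greater than 0 and less than {n}. "
                f"Paragraph index {i} is less than or equal to 0"
            )
        if i >= n:
            raise ValueError(
                f"All elements must be greater than 0 and less than {n}. "
                f"Paragraph index {i} is greater or equal to {n}"
            )
    return v


validate_paragraph_indexes([2, 5], n=8)  # ok
# validate_paragraph_indexes([0, 9], n=8)  # raises, naming index 0 as the offender
```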
diff --git a/src/yt2doc/youtube/__init__.py b/src/yt2doc/i18n/__init__.py similarity index 100% rename from src/yt2doc/youtube/__init__.py rename to src/yt2doc/i18n/__init__.py diff --git a/src/yt2doc/i18n/punctuations.py b/src/yt2doc/i18n/punctuations.py new file mode 100644 index 0000000..d3aa691 --- /dev/null +++ b/src/yt2doc/i18n/punctuations.py @@ -0,0 +1,42 @@ +import typing + +from pydantic import BaseModel + + +class Punctuations(BaseModel): + full_stop: str + comma: str + question_mark: str + exclamation_mark: str + white_space: str + + +PUNCTUATIONS_IN_LANGUAGES: typing.Dict[str, Punctuations] = { + "en": Punctuations( + full_stop=".", + comma=", ", + question_mark="?", + exclamation_mark="!", + white_space=" ", + ), + "zh": Punctuations( + full_stop="。", + comma=",", + question_mark="?", + exclamation_mark="!", + white_space="", + ), + "jp": Punctuations( + full_stop="。", + comma="、", + question_mark="?", + exclamation_mark="!", + white_space="", + ), +} + + +def get_punctuations(language_code: str) -> Punctuations: + if language_code not in PUNCTUATIONS_IN_LANGUAGES.keys(): + return PUNCTUATIONS_IN_LANGUAGES["en"] + return PUNCTUATIONS_IN_LANGUAGES[language_code] diff --git a/src/yt2doc/media/__init__.py b/src/yt2doc/media/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/yt2doc/youtube/interfaces.py b/src/yt2doc/media/interfaces.py similarity index 72% rename from src/yt2doc/youtube/interfaces.py rename to src/yt2doc/media/interfaces.py index ed24984..c6ce725 100644 --- a/src/yt2doc/youtube/interfaces.py +++ b/src/yt2doc/media/interfaces.py @@ -5,16 +5,16 @@ from pydantic import BaseModel -class YtChapter(BaseModel): +class MediaChapter(BaseModel): title: str start_time: float end_time: float -class YtVideoInfo(BaseModel): +class MediaInfo(BaseModel): video_id: str title: str - chapters: typing.Sequence[YtChapter] + chapters: typing.Sequence[MediaChapter] description: str @@ -24,6 +24,6 @@ class YtPlaylistInfo(BaseModel): class IYtVideoInfoExtractor(typing.Protocol): - def extract_video_info(self, video_url: str) -> YtVideoInfo: ... + def extract_video_info(self, video_url: str) -> MediaInfo: ... def extract_audio(self, video_url: str) -> Path: ... def extract_playlist_info(self, playlist_url: str) -> YtPlaylistInfo: ... 
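The new `yt2doc.i18n.punctuations` module centralizes the per-language punctuation tables that previously lived inside the transcriber, and adds question and exclamation marks so they survive the prompt cleaning. Unknown language codes fall back to the English table:

```python
from yt2doc.i18n.punctuations import get_punctuations

zh = get_punctuations("zh")
print(zh.full_stop, zh.comma, zh.question_mark)  # 。 , ?

# Languages without an entry (e.g. "ko") fall back to English punctuation.
ko = get_punctuations("ko")
assert ko.full_stop == "." and ko.white_space == " "
```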
diff --git a/src/yt2doc/youtube/yt_video_info_extractor.py b/src/yt2doc/media/media_info_extractor.py similarity index 82% rename from src/yt2doc/youtube/yt_video_info_extractor.py rename to src/yt2doc/media/media_info_extractor.py index b83d718..93cdf81 100644 --- a/src/yt2doc/youtube/yt_video_info_extractor.py +++ b/src/yt2doc/media/media_info_extractor.py @@ -5,33 +5,33 @@ from pathlib import Path -from yt2doc.youtube import interfaces +from yt2doc.media import interfaces logger = logging.getLogger(__file__) -def _length(chapter: interfaces.YtChapter) -> float: +def _length(chapter: interfaces.MediaChapter) -> float: return chapter.end_time - chapter.start_time def _merge_short_chapters( - chapters: typing.Sequence[interfaces.YtChapter], -) -> typing.Sequence[interfaces.YtChapter]: + chapters: typing.Sequence[interfaces.MediaChapter], +) -> typing.Sequence[interfaces.MediaChapter]: threshold_seconds = 60 - merged_chapters: typing.List[interfaces.YtChapter] = [] + merged_chapters: typing.List[interfaces.MediaChapter] = [] for idx, chapter in enumerate(chapters): if idx == 0: merged_chapters.append(chapter) continue - merged_target: interfaces.YtChapter + merged_target: interfaces.MediaChapter merged_target = merged_chapters[-1] if ( _length(chapter) < threshold_seconds and _length(merged_target) < threshold_seconds ): - merged_chapter = interfaces.YtChapter( + merged_chapter = interfaces.MediaChapter( title=merged_target.title + " & " + chapter.title, start_time=merged_target.start_time, end_time=chapter.end_time, @@ -43,11 +43,11 @@ def _merge_short_chapters( return merged_chapters -class YtVideoInfoExtractor: +class MediaInfoExtractor: def __init__(self, temp_dir: Path): self.temp_dir = temp_dir - def extract_video_info(self, video_url: str) -> interfaces.YtVideoInfo: + def extract_video_info(self, video_url: str) -> interfaces.MediaInfo: ydl_opts = { "quiet": True, } @@ -59,11 +59,11 @@ def extract_video_info(self, video_url: str) -> interfaces.YtVideoInfo: title = response["title"] chapter_objects = response.get("chapters") or [] chapters = _merge_short_chapters( - [interfaces.YtChapter(**chapter) for chapter in chapter_objects] + [interfaces.MediaChapter(**chapter) for chapter in chapter_objects] ) description = response["description"] - return interfaces.YtVideoInfo( + return interfaces.MediaInfo( video_id=video_id, title=title, chapters=chapters, diff --git a/src/yt2doc/transcription/interfaces.py b/src/yt2doc/transcription/interfaces.py index dcd961a..3427d15 100644 --- a/src/yt2doc/transcription/interfaces.py +++ b/src/yt2doc/transcription/interfaces.py @@ -4,7 +4,7 @@ from pydantic import BaseModel -from yt2doc.youtube import interfaces as youtube_interfaces +from yt2doc.media import interfaces as youtube_interfaces class Segment(BaseModel): @@ -18,6 +18,11 @@ class ChapterTranscription(BaseModel): segments: typing.Sequence[Segment] +class Transcription(BaseModel): + language: str + chapters: typing.Sequence[ChapterTranscription] + + class IWhisperAdapter(typing.Protocol): def detect_language(self, audio_path: Path) -> str: ... @@ -28,5 +33,5 @@ def transcribe( class ITranscriber(typing.Protocol): def transcribe( - self, audio_path: Path, video_info: youtube_interfaces.YtVideoInfo - ) -> typing.Sequence[ChapterTranscription]: ... + self, audio_path: Path, video_info: youtube_interfaces.MediaInfo + ) -> Transcription: ... 
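`ITranscriber.transcribe` now returns a single `Transcription` value that carries the detected language next to the chapters, so the extractor can cache the language without re-detecting it. A minimal illustration of the new return shape (segments left empty for brevity):

```python
from yt2doc.transcription.interfaces import ChapterTranscription, Transcription

transcription = Transcription(
    language="zh",
    chapters=[ChapterTranscription(title="Untitled chapter", segments=[])],
)

# Extractor.extract_by_chapter reads both fields off this one value: the
# chapters become extraction TranscriptChapter objects and the language
# flows into ChapteredTranscript(language=transcription.language, ...).
print(transcription.language, len(transcription.chapters))
```

The transcriber.py hunk below also tightens the description cleaning used for Whisper's initial prompt: URLs, MM:SS and HH:MM:SS timestamps, hashtags, emoji, and symbols outside the language's punctuation set are stripped, with the previously commented-out symbol filter re-enabled now that question and exclamation marks are part of `Punctuations`. A rough standalone sketch of that regex pipeline on a made-up description, with the keep-set hard-coded for English rather than built from `Punctuations`:

```python
import re
import unicodedata

description = "Great episode! 00:00 Intro 01:23:45 Outro https://example.com #podcast 🎧"

url_pattern = r"https?://\S+"
timestamp_hr_min_sec_pattern = r"\d\d:\d\d:\d\d"
timestamp_min_sec_pattern = r"\d\d:\d\d"
hashtag_pattern = r"#\w+"

text = unicodedata.normalize("NFKD", description)
# HH:MM:SS comes before MM:SS in the alternation so long timestamps are
# consumed whole instead of leaving a dangling ":45" behind.
text = re.sub(
    f"{url_pattern}|{timestamp_hr_min_sec_pattern}|{timestamp_min_sec_pattern}|{hashtag_pattern}",
    "",
    text,
    flags=re.MULTILINE,
)
# Drop anything that is not a word character, whitespace, or kept punctuation;
# the emoji falls out here too (the real code also runs emoji.replace_emoji).
text = re.sub(r"[^\w\s,.?!]", "", text, flags=re.UNICODE)
text = re.sub(r"\s+", " ", text).strip()
print(unicodedata.normalize("NFKC", text))  # Great episode! Intro Outro
```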
diff --git a/src/yt2doc/transcription/transcriber.py b/src/yt2doc/transcription/transcriber.py index e8244c9..a3cbfcd 100644 --- a/src/yt2doc/transcription/transcriber.py +++ b/src/yt2doc/transcription/transcriber.py @@ -10,28 +10,15 @@ from pathlib import Path from tqdm import tqdm -from pydantic import BaseModel -from yt2doc.youtube import interfaces as youtube_interfaces +from yt2doc.media import interfaces as youtube_interfaces from yt2doc.transcription import interfaces +from yt2doc.i18n import punctuations logger = logging.getLogger(__file__) -class Punctuations(BaseModel): - full_stop: str - comma: str - white_space: str - - -PUNCTUATIONS_IN_LANGUAGES: typing.Dict[str, Punctuations] = { - "en": Punctuations(full_stop=".", comma=", ", white_space=" "), - "zh": Punctuations(full_stop="。", comma=",", white_space=""), - "jp": Punctuations(full_stop="。", comma="、", white_space=""), -} - - class Transcriber: def __init__( self, @@ -42,20 +29,17 @@ def __init__( self.whisper_adapter = whisper_adapter @staticmethod - def _get_punctuations(language_code: str) -> Punctuations: - if language_code not in PUNCTUATIONS_IN_LANGUAGES.keys(): - return PUNCTUATIONS_IN_LANGUAGES["en"] - return PUNCTUATIONS_IN_LANGUAGES[language_code] - - @staticmethod - def _clean_video_description(description: str, punctuations: Punctuations) -> str: + def _clean_video_description( + description: str, punctuations: punctuations.Punctuations + ) -> str: url_pattern = r"https?://\S+" - timestamp_line_pattern = r"^\d+:\d+.*\n?" + timestamp_min_sec_pattern = r"\d\d:\d\d" + timestamp_hr_min_sec_pattern = r"\d\d:\d\d:\d\d" hashtag_pattern = r"#\w+" normalized_text = unicodedata.normalize("NFKD", description) text = re.sub( - f"{url_pattern}|{timestamp_line_pattern}|{hashtag_pattern}", + f"{url_pattern}|{timestamp_hr_min_sec_pattern}|{timestamp_min_sec_pattern}|{hashtag_pattern}", "", normalized_text, flags=re.MULTILINE, @@ -64,21 +48,19 @@ def _clean_video_description(description: str, punctuations: Punctuations) -> st text = re.sub(r"\n+| +", punctuations.white_space, text) text = text.replace(":", punctuations.comma) text = emoji.replace_emoji(text, "") - # non_char_symbol_pattern = ( - # f"[^\\w\\s{punctuations.comma}{punctuations.full_stop}]" - # ) - # text = re.sub(non_char_symbol_pattern, "", text, flags=re.UNICODE) + + non_char_symbol_pattern = f"[^\\w\\s{punctuations.comma}{punctuations.full_stop}{punctuations.question_mark}{punctuations.exclamation_mark}]" + text = re.sub(non_char_symbol_pattern, "", text, flags=re.UNICODE) + text = re.sub(r"\s+", punctuations.white_space, text).strip() text = unicodedata.normalize("NFKC", text) return text @staticmethod - def _clean_title(title: str, punctuations: Punctuations) -> str: + def _clean_title(title: str, punctuations: punctuations.Punctuations) -> str: normalized_text = unicodedata.normalize("NFKD", title) - non_char_symbol_pattern = ( - f"[^\\w\\s{punctuations.comma}{punctuations.full_stop}]" - ) + non_char_symbol_pattern = f"[^\\w\\s{punctuations.comma}{punctuations.full_stop}{punctuations.question_mark}{punctuations.exclamation_mark}]" text = re.sub(non_char_symbol_pattern, "", normalized_text, flags=re.UNICODE) text = re.sub(r"\s+", punctuations.white_space, text).strip() text = unicodedata.normalize("NFKC", text) @@ -87,24 +69,23 @@ def _clean_title(title: str, punctuations: Punctuations) -> str: def _get_initial_prompt( self, language_code: str, - video_info: youtube_interfaces.YtVideoInfo, + video_info: youtube_interfaces.MediaInfo, ) -> str: - punctuations = 
self._get_punctuations(language_code=language_code) + punctuations_ = punctuations.get_punctuations(language_code=language_code) cleaned_title = self._clean_title( title=video_info.title, - punctuations=punctuations, + punctuations=punctuations_, ) cleaned_video_description = self._clean_video_description( - video_info.description, punctuations=punctuations + video_info.description, punctuations=punctuations_ ) - punctuations = self._get_punctuations(language_code=language_code) - chapter_titles = f"{punctuations.comma}".join( + chapter_titles = f"{punctuations_.comma}".join( c.title for c in video_info.chapters ) - return f"{cleaned_title}{punctuations.full_stop} {cleaned_video_description} {chapter_titles}" + return f"{cleaned_title}{punctuations_.full_stop} {cleaned_video_description} {chapter_titles}" def _get_audio_chunk_for_chapter( - self, audio_path: Path, chapter: youtube_interfaces.YtChapter + self, audio_path: Path, chapter: youtube_interfaces.MediaChapter ) -> Path: duration = chapter.end_time - chapter.start_time ext = audio_path.suffix @@ -116,8 +97,8 @@ def _get_audio_chunk_for_chapter( def _fix_comma(self, segment_text: str, language_code: str) -> str: if language_code in ["zh"]: - punctuations = self._get_punctuations(language_code=language_code) - return segment_text.replace(",", punctuations.comma) + punctuations_ = punctuations.get_punctuations(language_code=language_code) + return segment_text.replace(",", punctuations_.comma) return segment_text @staticmethod @@ -133,8 +114,8 @@ def _convert_audio_to_wav(audio_path: Path) -> Path: return wav_audio_path def transcribe( - self, audio_path: Path, video_info: youtube_interfaces.YtVideoInfo - ) -> typing.Sequence[interfaces.ChapterTranscription]: + self, audio_path: Path, video_info: youtube_interfaces.MediaInfo + ) -> interfaces.Transcription: wav_audio_path = self._convert_audio_to_wav(audio_path=audio_path) language_code = self.whisper_adapter.detect_language(audio_path=wav_audio_path) @@ -156,7 +137,7 @@ def transcribe( chapters = video_info.chapters else: chapters = [ - youtube_interfaces.YtChapter( + youtube_interfaces.MediaChapter( title="Untitled chapter", start_time=0.0, end_time=full_audio_duration, @@ -202,4 +183,7 @@ def transcribe( ): # silence at the end of the audio progress_bar.update(full_audio_duration - current_timestamp) - return chaptered_transcriptions + return interfaces.Transcription( + language=language_code, + chapters=chaptered_transcriptions, + ) diff --git a/uv.lock b/uv.lock index 82f3bd4..7a613ef 100644 --- a/uv.lock +++ b/uv.lock @@ -2356,7 +2356,7 @@ wheels = [ [[package]] name = "yt-dlp" -version = "2024.9.27" +version = "2024.10.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "brotli", marker = "implementation_name == 'cpython'" }, @@ -2368,9 +2368,9 @@ dependencies = [ { name = "urllib3" }, { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/7d/3f/c63b11419d7adacde35b39e7e62aef1be6ce9ff3d2219fae024962ab4a53/yt_dlp-2024.9.27.tar.gz", hash = "sha256:86605542e17e2e23ad23145b637ec308133762a15a5dedac4ae50b7973237026", size = 2876534 } +sdist = { url = "https://files.pythonhosted.org/packages/2e/b1/08679efb4c1932dc6420deda8a89f03d7440d6462b7f61d339db2732a497/yt_dlp-2024.10.7.tar.gz", hash = "sha256:0baf1ab517c9748d7e337ced91c5543c36fc16246a9ebedac32ebf20c1998ceb", size = 2877443 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/c6/26/5dc3a802bd85c19d3d2fa746e6b36979801e2f32a433799c57f646335a7d/yt_dlp-2024.9.27-py3-none-any.whl", hash = "sha256:2717468dd697fcfcf9a89f493ba30a3830cdfb276c09750e5b561b08b9ef5f69", size = 3148509 }, + { url = "https://files.pythonhosted.org/packages/6e/91/ecb07d66110334cdb01e94b187577af3b041897090203c9957728825d46f/yt_dlp-2024.10.7-py3-none-any.whl", hash = "sha256:9e336ae663bfd7ad3ea1c02e722747388172719efc0fc39a807dace3073aa704", size = 3149082 }, ] [[package]] @@ -2413,7 +2413,7 @@ requires-dist = [ { name = "tqdm", specifier = ">=4.66.5" }, { name = "typer-slim", specifier = ">=0.12.5" }, { name = "wtpsplit", specifier = ">=2.0.8" }, - { name = "yt-dlp", specifier = ">=2024.9.27" }, + { name = "yt-dlp", specifier = ">=2024.10.7" }, ] [package.metadata.requires-dev]