Skip to content

Commit

Permalink
Add paragraph timestamping cli
Browse files Browse the repository at this point in the history
  • Loading branch information
shun-liang committed Nov 6, 2024
1 parent 7af3f09 commit 1f6b8e1
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 13 deletions.
6 changes: 6 additions & 0 deletions src/yt2doc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ def main(
"--llm-api-key",
help="API key for the LLM server; No need to set if using local Ollama server",
),
to_timestamp_paragraphs: bool = typer.Option(
False,
"--timestamp-paragraphs",
help="Prepend timestamp to paragraphs",
),
skip_cache: typing.Annotated[
bool,
typer.Option("--skip-cache", help="If should skip reading from cache"),
Expand Down Expand Up @@ -145,6 +150,7 @@ def main(
sat_model=sat_model,
segment_unchaptered=segment_unchaptered,
ignore_source_chapters=ignore_source_chapters,
to_timestamp_paragraphs=to_timestamp_paragraphs,
llm_model=llm_model,
llm_server=llm_server,
llm_api_key=llm_api_key,
Expand Down
2 changes: 1 addition & 1 deletion src/yt2doc/extraction/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def extract_by_chapter(

chaptered_transcript = interfaces.ChapteredTranscript(
url=video_url,
video_id=media_info.video_id,
title=media_info.title,
webpage_url=media_info.webpage_url,
webpage_url_domain=media_info.webpage_url_domain,
chapters=transcripts_by_chapter,
chaptered_at_source=len(media_info.chapters) > 0,
Expand Down
2 changes: 1 addition & 1 deletion src/yt2doc/extraction/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class TranscriptChapter(BaseModel):
class ChapteredTranscript(BaseModel):
url: str
title: str
webpage_url: str
video_id: str
webpage_url_domain: str
language: str
chapters: typing.Sequence[TranscriptChapter]
Expand Down
7 changes: 6 additions & 1 deletion src/yt2doc/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def get_yt2doc(
sat_model: str,
segment_unchaptered: bool,
ignore_source_chapters: bool,
to_timestamp_paragraphs: bool,
llm_model: typing.Optional[str],
llm_server: str,
llm_api_key: str,
Expand Down Expand Up @@ -61,10 +62,14 @@ def get_yt2doc(
llm_topic_segmenter = LLMTopicSegmenter(llm_adapter=llm_adapter)
formatter = MarkdownFormatter(
paragraphs_segmenter=paragraphs_segmenter,
to_timestamp_paragraphs=to_timestamp_paragraphs,
topic_segmenter=llm_topic_segmenter,
)
else:
formatter = MarkdownFormatter(paragraphs_segmenter=paragraphs_segmenter)
formatter = MarkdownFormatter(
paragraphs_segmenter=paragraphs_segmenter,
to_timestamp_paragraphs=to_timestamp_paragraphs,
)

media_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)
transcriber = Transcriber(
Expand Down
39 changes: 29 additions & 10 deletions src/yt2doc/formatting/formatter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import typing
import logging

from datetime import timedelta

from yt2doc.extraction import interfaces as extraction_interfaces
from yt2doc.formatting import interfaces

Expand All @@ -11,29 +13,35 @@ class MarkdownFormatter:
def __init__(
    self,
    paragraphs_segmenter: interfaces.IParagraphsSegmenter,
    to_timestamp_paragraphs: bool,
    topic_segmenter: typing.Optional[interfaces.ITopicSegmenter] = None,
) -> None:
    """Store the formatter's collaborators and rendering options.

    Args:
        paragraphs_segmenter: Object used to split transcripts into
            paragraphs.
        to_timestamp_paragraphs: Whether each rendered paragraph should be
            prefixed with its start timestamp.
        topic_segmenter: Optional topic segmenter; ``None`` when not
            provided.
    """
    # Collaborators.
    self.paragraphs_segmenter = paragraphs_segmenter
    self.topic_segmenter = topic_segmenter

    # Rendering options.
    self.to_timestamp_paragraphs = to_timestamp_paragraphs

    # Markdown heading templates; ``{name}`` is the placeholder for the title.
    self.video_title_template = "# {name}"
    self.chapter_title_template = "## {name}"

@staticmethod
def _paragraphs_to_text(
    paragraphs: typing.Sequence[typing.Sequence[interfaces.Sentence]],
    video_id: str,
    webpage_url_domain: str,
    to_timestamp_paragraphs: bool,
) -> str:
    """Join paragraphs into one string, separated by blank lines.

    Each paragraph is the concatenation of its sentences' ``text`` with
    surrounding whitespace stripped. When ``to_timestamp_paragraphs`` is
    true, every non-empty paragraph is prefixed with the start time of its
    first sentence (rounded to whole seconds and formatted via
    ``str(timedelta)``). For the ``youtube.com`` domain the prefix is a
    Markdown link to ``https://youtu.be/<video_id>?t=<seconds>``; for any
    other domain it is the plain ``(H:MM:SS)`` text.

    Args:
        paragraphs: Paragraphs, each a sequence of sentences exposing
            ``text`` and ``start_second``.
        video_id: Video identifier used to build the YouTube link.
        webpage_url_domain: Domain of the source page; only the exact value
            ``"youtube.com"`` produces linked timestamps.
        to_timestamp_paragraphs: Whether to prepend timestamps.

    Returns:
        The paragraph texts joined with ``"\\n\\n"``.
    """
    paragraph_texts = []
    for paragraph in paragraphs:
        paragraph_text = "".join(sentence.text for sentence in paragraph)
        paragraph_text = paragraph_text.strip()
        # Only look at the first sentence when a timestamp is requested and
        # the paragraph actually has one; an empty paragraph has no start
        # time and previously raised IndexError here.
        if to_timestamp_paragraphs and paragraph:
            paragraph_start_second = round(paragraph[0].start_second)
            paragraph_start_h_m_s = str(timedelta(seconds=paragraph_start_second))
            if webpage_url_domain == "youtube.com":
                timestamp_prefix = f"[({paragraph_start_h_m_s})](https://youtu.be/{video_id}?t={paragraph_start_second})"
            else:
                timestamp_prefix = f"({paragraph_start_h_m_s})"
            paragraph_text = f"{timestamp_prefix} {paragraph_text}"
        paragraph_texts.append(paragraph_text)
    return "\n\n".join(paragraph_texts)

Expand All @@ -55,7 +63,15 @@ def format_chaptered_transcript(
sentences_in_paragraphs=paragraphed_sentences
)
chapter_and_text_list = [
(chapter.title, self._paragraphs_to_text(chapter.paragraphs))
(
chapter.title,
self._paragraphs_to_text(
paragraphs=chapter.paragraphs,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
),
)
for chapter in chapters
]

Expand All @@ -65,7 +81,10 @@ def format_chaptered_transcript(
transcription_segments=chapter.segments
)
chapter_full_text = self._paragraphs_to_text(
paragraphs=paragraphed_sentences
paragraphs=paragraphed_sentences,
video_id=chaptered_transcript.video_id,
webpage_url_domain=chaptered_transcript.webpage_url_domain,
to_timestamp_paragraphs=self.to_timestamp_paragraphs,
)
chapter_and_text_list.append((chapter.title, chapter_full_text.strip()))

Expand Down

0 comments on commit 1f6b8e1

Please sign in to comment.