Skip to content

Commit

Permalink
Simplify requirement to only track the starting time of a sentence
Browse files Browse the repository at this point in the history
  • Loading branch information
shun-liang committed Nov 3, 2024
1 parent abac0eb commit 959f57b
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 83 deletions.
1 change: 0 additions & 1 deletion src/yt2doc/formatting/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

class Sentence(BaseModel):
start_second: float
end_second: float
text: str


Expand Down
91 changes: 49 additions & 42 deletions src/yt2doc/formatting/paragraphs_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,49 +17,56 @@ def segment(
full_text = "".join(s.text for s in transcription_segments)
paragraphed_texts = self.sat.split(full_text, do_paragraph_segmentation=True, verbose=True)

# Find which segment contains each sentence's start/end
result = []
text_pos = 0
# Align timestamps
segments_text = "".join(s.text for s in transcription_segments)
segments_pos = 0 # Position in segments text
curr_segment_idx = 0 # Current segment index
curr_segment_offset = 0 # Position within current segment

result_paragraphs = []

for paragraph in paragraphed_texts:
sentences = []
result_sentences = []

for sentence in paragraph:
if not sentence:
continue

# Find start segment
start_idx = 0
pos = text_pos
while start_idx < len(transcription_segments):
if pos < len(transcription_segments[start_idx].text):
break
pos -= len(transcription_segments[start_idx].text)
start_idx += 1

# If sentence starts after a period, use next segment's start time
start_time = transcription_segments[start_idx].start_second
if pos > 0 and transcription_segments[start_idx].text[:pos].strip().endswith('.'):
start_time = transcription_segments[min(start_idx + 1, len(transcription_segments) - 1)].start_second

# Find end segment
end_idx = start_idx
remaining = len(sentence)
while remaining > 0 and end_idx < len(transcription_segments):
segment_remaining = len(transcription_segments[end_idx].text) - pos
if remaining <= segment_remaining:
# Find matching position for this sentence
sentence_pos = 0 # Position in current sentence

# Find start position
start_segment_idx = curr_segment_idx

# Match characters exactly including spaces
while sentence_pos < len(sentence):
if segments_pos >= len(segments_text):
break
remaining -= segment_remaining
pos = 0
end_idx += 1

sentences.append(interfaces.Sentence(
text=sentence,
start_second=start_time,
end_second=transcription_segments[end_idx].end_second
))
text_pos += len(sentence)

if sentences:
result.append(sentences)

return result

# Match characters exactly
if sentence[sentence_pos] == segments_text[segments_pos]:
sentence_pos += 1
segments_pos += 1
curr_segment_offset += 1
# Update segment index if needed
while (curr_segment_idx < len(transcription_segments) - 1 and
curr_segment_offset >= len(transcription_segments[curr_segment_idx].text)):
curr_segment_offset = 0
curr_segment_idx += 1
else:
# If no match, move forward in segments
segments_pos += 1
curr_segment_offset += 1
while (curr_segment_idx < len(transcription_segments) - 1 and
curr_segment_offset >= len(transcription_segments[curr_segment_idx].text)):
curr_segment_offset = 0
curr_segment_idx += 1

# Create sentence with aligned timestamp
result_sentences.append(
interfaces.Sentence(
text=sentence,
start_second=transcription_segments[start_segment_idx].start_second
)
)

result_paragraphs.append(result_sentences)

return result_paragraphs
68 changes: 28 additions & 40 deletions tests/unit/formatting/test_paragraphs_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,24 @@ def test_segment_aligns_timestamps_correctly() -> None:

# Create test transcription segments that split text differently
segments = [
Segment(start_second=0.0, end_second=1.0, text="Hello"),
Segment(start_second=0.0, end_second=1.0, text=" Hello"),
Segment(start_second=1.0, end_second=2.0, text=" world! This"),
Segment(start_second=2.0, end_second=3.0, text=" is a"),
Segment(start_second=3.0, end_second=4.0, text=" test. Another "),
Segment(start_second=4.0, end_second=5.0, text="paragraph here. Only "),
Segment(start_second=5.0, end_second=6.0, text="0.1 percent people get it. And "),
Segment(start_second=6.0, end_second=7.0, text="even more. This is a "),
Segment(start_second=7.0, end_second=8.0, text="longer sentence that spans "),
Segment(start_second=8.0, end_second=9.0, text="multiple segments and tests "),
Segment(start_second=9.0, end_second=10.0, text="our handling of longer "),
Segment(start_second=10.0, end_second=11.0, text="text blocks. Short "),
Segment(start_second=11.0, end_second=12.0, text="text. Followed by "),
Segment(start_second=12.0, end_second=13.0, text="another. And one "),
Segment(start_second=13.0, end_second=14.0, text="more for good measure. Final "),
Segment(start_second=14.0, end_second=15.0, text="paragraph to conclude our "),
Segment(start_second=15.0, end_second=16.0, text="test. With some extra "),
Segment(start_second=16.0, end_second=17.0, text="content. And a final "),
Segment(start_second=17.0, end_second=18.0, text="closing statement. ")
Segment(start_second=3.0, end_second=4.0, text=" test. Another"),
Segment(start_second=4.0, end_second=5.0, text=" paragraph here. Only"),
Segment(start_second=5.0, end_second=6.0, text=" 0.1 percent people get it. And"),
Segment(start_second=6.0, end_second=7.0, text=" even more. This is a"),
Segment(start_second=7.0, end_second=8.0, text=" longer sentence that spans"),
Segment(start_second=8.0, end_second=9.0, text=" multiple segments and tests"),
Segment(start_second=9.0, end_second=10.0, text=" our handling of longer"),
Segment(start_second=10.0, end_second=11.0, text=" text blocks. Short"),
Segment(start_second=11.0, end_second=12.0, text=" text. Followed by"),
Segment(start_second=12.0, end_second=13.0, text=" another. And one"),
Segment(start_second=13.0, end_second=14.0, text=" more for good measure. Final"),
Segment(start_second=14.0, end_second=15.0, text=" paragraph to conclude our"),
Segment(start_second=15.0, end_second=16.0, text=" test. With some extra"),
Segment(start_second=16.0, end_second=17.0, text=" content. And a final"),
Segment(start_second=17.0, end_second=18.0, text=" closing statement.")
]

segmenter = ParagraphsSegmenter(mock_sat)
Expand All @@ -70,63 +70,51 @@ def test_segment_aligns_timestamps_correctly() -> None:
# Verify first paragraph
assert result[0][0].text == "Hello world! "
assert result[0][0].start_second == 0.0
assert result[0][0].end_second == 2.0

assert result[0][1].text == "This is a test. "
assert result[0][1].start_second == 1.0
assert result[0][1].end_second == 4.0

# Verify second paragraph
assert result[1][0].text == "Another paragraph here. "
assert result[1][0].start_second == 4.0
assert result[1][0].end_second == 5.0
assert result[1][0].start_second == 3.0

assert result[1][1].text == "Only 0.1 percent people get it. "
assert result[1][1].start_second == 5.0
assert result[1][1].end_second == 6.0
assert result[1][1].start_second == 4.0

assert result[1][2].text == "And even more. "
assert result[1][2].start_second == 6.0
assert result[1][2].end_second == 7.0
assert result[1][2].start_second == 5.0

# Verify third paragraph (long sentence spanning multiple segments)
assert result[2][0].text == "This is a longer sentence that spans multiple segments and tests our handling of longer text blocks. "
assert result[2][0].start_second == 7.0
assert result[2][0].end_second == 11.0
assert result[2][0].start_second == 6.0

# Verify fourth paragraph
assert result[3][0].text == "Short text. "
assert result[3][0].start_second == 11.0
assert result[3][0].end_second == 12.0
assert result[3][0].start_second == 10.0

assert result[3][1].text == "Followed by another. "
assert result[3][1].start_second == 12.0
assert result[3][1].end_second == 13.0
assert result[3][1].start_second == 11.0

assert result[3][2].text == "And one more for good measure. "
assert result[3][2].start_second == 13.0
assert result[3][2].end_second == 14.0
assert result[3][2].start_second == 12.0

# Verify fifth paragraph
assert result[4][0].text == "Final paragraph to conclude our test. "
assert result[4][0].start_second == 14.0
assert result[4][0].end_second == 16.0
assert result[4][0].start_second == 13.0

assert result[4][1].text == "With some extra content. "
assert result[4][1].start_second == 16.0
assert result[4][1].end_second == 17.0
assert result[4][1].start_second == 15.0

assert result[4][2].text == "And a final closing statement. "
assert result[4][2].start_second == 17.0
assert result[4][2].end_second == 18.0
assert result[4][2].start_second == 16.0

# Verify SaT was called correctly with complete text
mock_sat.split.assert_called_once_with(
"Hello world! This is a test. Another paragraph here. Only 0.1 percent people get it. "
" Hello world! This is a test. Another paragraph here. Only 0.1 percent people get it. "
"And even more. This is a longer sentence that spans multiple segments and tests "
"our handling of longer text blocks. Short text. Followed by another. And one "
"more for good measure. Final paragraph to conclude our test. With some extra "
"content. And a final closing statement. ",
"content. And a final closing statement.",
do_paragraph_segmentation=True,
verbose=True
)

0 comments on commit 959f57b

Please sign in to comment.