Simplify requirement to only track starting time of a sentence

shun-liang · Nov 3, 2024 · 959f57b · 959f57b
1 parent abac0eb
commit 959f57b
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 83 deletions.
diff --git a/src/yt2doc/formatting/interfaces.py b/src/yt2doc/formatting/interfaces.py
@@ -8,7 +8,6 @@
 
 class Sentence(BaseModel):
     start_second: float
-    end_second: float
     text: str
 
 

diff --git a/src/yt2doc/formatting/paragraphs_segmenter.py b/src/yt2doc/formatting/paragraphs_segmenter.py
@@ -17,49 +17,56 @@ def segment(
         full_text = "".join(s.text for s in transcription_segments)
         paragraphed_texts = self.sat.split(full_text, do_paragraph_segmentation=True, verbose=True)
 
-        # Find which segment contains each sentence's start/end
-        result = []
-        text_pos = 0
+        # Align timestamps
+        segments_text = "".join(s.text for s in transcription_segments)
+        segments_pos = 0  # Position in segments text
+        curr_segment_idx = 0  # Current segment index
+        curr_segment_offset = 0  # Position within current segment
+
+        result_paragraphs = []
 
         for paragraph in paragraphed_texts:
-            sentences = []
+            result_sentences = []
+
             for sentence in paragraph:
-                if not sentence:
-                    continue
-
-                # Find start segment
-                start_idx = 0
-                pos = text_pos
-                while start_idx < len(transcription_segments):
-                    if pos < len(transcription_segments[start_idx].text):
-                        break
-                    pos -= len(transcription_segments[start_idx].text)
-                    start_idx += 1
-
-                # If sentence starts after a period, use next segment's start time
-                start_time = transcription_segments[start_idx].start_second
-                if pos > 0 and transcription_segments[start_idx].text[:pos].strip().endswith('.'):
-                    start_time = transcription_segments[min(start_idx + 1, len(transcription_segments) - 1)].start_second
-
-                # Find end segment
-                end_idx = start_idx
-                remaining = len(sentence)
-                while remaining > 0 and end_idx < len(transcription_segments):
-                    segment_remaining = len(transcription_segments[end_idx].text) - pos
-                    if remaining <= segment_remaining:
+                # Find matching position for this sentence
+                sentence_pos = 0  # Position in current sentence
+
+                # Find start position
+                start_segment_idx = curr_segment_idx
+
+                # Match characters exactly including spaces
+                while sentence_pos < len(sentence):
+                    if segments_pos >= len(segments_text):
                         break
-                    remaining -= segment_remaining
-                    pos = 0
-                    end_idx += 1
-
-                sentences.append(interfaces.Sentence(
-                    text=sentence,
-                    start_second=start_time,
-                    end_second=transcription_segments[end_idx].end_second
-                ))
-                text_pos += len(sentence)
-
-            if sentences:
-                result.append(sentences)
-
-        return result
+
+                    # Match characters exactly
+                    if sentence[sentence_pos] == segments_text[segments_pos]:
+                        sentence_pos += 1
+                        segments_pos += 1
+                        curr_segment_offset += 1
+                        # Update segment index if needed
+                        while (curr_segment_idx < len(transcription_segments) - 1 and 
+                               curr_segment_offset >= len(transcription_segments[curr_segment_idx].text)):
+                            curr_segment_offset = 0
+                            curr_segment_idx += 1
+                    else:
+                        # If no match, move forward in segments
+                        segments_pos += 1
+                        curr_segment_offset += 1
+                        while (curr_segment_idx < len(transcription_segments) - 1 and 
+                               curr_segment_offset >= len(transcription_segments[curr_segment_idx].text)):
+                            curr_segment_offset = 0
+                            curr_segment_idx += 1
+
+                # Create sentence with aligned timestamp
+                result_sentences.append(
+                    interfaces.Sentence(
+                        text=sentence,
+                        start_second=transcription_segments[start_segment_idx].start_second
+                    )
+                )
+
+            result_paragraphs.append(result_sentences)
+
+        return result_paragraphs
diff --git a/tests/unit/formatting/test_paragraphs_segmenter.py b/tests/unit/formatting/test_paragraphs_segmenter.py
@@ -36,24 +36,24 @@ def test_segment_aligns_timestamps_correctly() -> None:
 
     # Create test transcription segments that split text differently
     segments = [
-        Segment(start_second=0.0, end_second=1.0, text="Hello"),
+        Segment(start_second=0.0, end_second=1.0, text=" Hello"),
         Segment(start_second=1.0, end_second=2.0, text=" world! This"),
         Segment(start_second=2.0, end_second=3.0, text=" is a"),
-        Segment(start_second=3.0, end_second=4.0, text=" test. Another "),
-        Segment(start_second=4.0, end_second=5.0, text="paragraph here. Only "),
-        Segment(start_second=5.0, end_second=6.0, text="0.1 percent people get it. And "),
-        Segment(start_second=6.0, end_second=7.0, text="even more. This is a "),
-        Segment(start_second=7.0, end_second=8.0, text="longer sentence that spans "),
-        Segment(start_second=8.0, end_second=9.0, text="multiple segments and tests "),
-        Segment(start_second=9.0, end_second=10.0, text="our handling of longer "),
-        Segment(start_second=10.0, end_second=11.0, text="text blocks. Short "),
-        Segment(start_second=11.0, end_second=12.0, text="text. Followed by "),
-        Segment(start_second=12.0, end_second=13.0, text="another. And one "),
-        Segment(start_second=13.0, end_second=14.0, text="more for good measure. Final "),
-        Segment(start_second=14.0, end_second=15.0, text="paragraph to conclude our "),
-        Segment(start_second=15.0, end_second=16.0, text="test. With some extra "),
-        Segment(start_second=16.0, end_second=17.0, text="content. And a final "),
-        Segment(start_second=17.0, end_second=18.0, text="closing statement. ")
+        Segment(start_second=3.0, end_second=4.0, text=" test. Another"),
+        Segment(start_second=4.0, end_second=5.0, text=" paragraph here. Only"),
+        Segment(start_second=5.0, end_second=6.0, text=" 0.1 percent people get it. And"),
+        Segment(start_second=6.0, end_second=7.0, text=" even more. This is a"),
+        Segment(start_second=7.0, end_second=8.0, text=" longer sentence that spans"),
+        Segment(start_second=8.0, end_second=9.0, text=" multiple segments and tests"),
+        Segment(start_second=9.0, end_second=10.0, text=" our handling of longer"),
+        Segment(start_second=10.0, end_second=11.0, text=" text blocks. Short"),
+        Segment(start_second=11.0, end_second=12.0, text=" text. Followed by"),
+        Segment(start_second=12.0, end_second=13.0, text=" another. And one"),
+        Segment(start_second=13.0, end_second=14.0, text=" more for good measure. Final"),
+        Segment(start_second=14.0, end_second=15.0, text=" paragraph to conclude our"),
+        Segment(start_second=15.0, end_second=16.0, text=" test. With some extra"),
+        Segment(start_second=16.0, end_second=17.0, text=" content. And a final"),
+        Segment(start_second=17.0, end_second=18.0, text=" closing statement.")
     ]
 
     segmenter = ParagraphsSegmenter(mock_sat)
@@ -70,63 +70,51 @@ def test_segment_aligns_timestamps_correctly() -> None:
     # Verify first paragraph
     assert result[0][0].text == "Hello world! "
     assert result[0][0].start_second == 0.0
-    assert result[0][0].end_second == 2.0
 
     assert result[0][1].text == "This is a test. "
     assert result[0][1].start_second == 1.0
-    assert result[0][1].end_second == 4.0
 
     # Verify second paragraph
     assert result[1][0].text == "Another paragraph here. "
-    assert result[1][0].start_second == 4.0
-    assert result[1][0].end_second == 5.0
+    assert result[1][0].start_second == 3.0
 
     assert result[1][1].text == "Only 0.1 percent people get it. "
-    assert result[1][1].start_second == 5.0
-    assert result[1][1].end_second == 6.0
+    assert result[1][1].start_second == 4.0
 
     assert result[1][2].text == "And even more. "
-    assert result[1][2].start_second == 6.0
-    assert result[1][2].end_second == 7.0
+    assert result[1][2].start_second == 5.0
 
     # Verify third paragraph (long sentence spanning multiple segments)
     assert result[2][0].text == "This is a longer sentence that spans multiple segments and tests our handling of longer text blocks. "
-    assert result[2][0].start_second == 7.0
-    assert result[2][0].end_second == 11.0
+    assert result[2][0].start_second == 6.0
 
     # Verify fourth paragraph
     assert result[3][0].text == "Short text. "
-    assert result[3][0].start_second == 11.0
-    assert result[3][0].end_second == 12.0
+    assert result[3][0].start_second == 10.0
 
     assert result[3][1].text == "Followed by another. "
-    assert result[3][1].start_second == 12.0
-    assert result[3][1].end_second == 13.0
+    assert result[3][1].start_second == 11.0
 
     assert result[3][2].text == "And one more for good measure. "
-    assert result[3][2].start_second == 13.0
-    assert result[3][2].end_second == 14.0
+    assert result[3][2].start_second == 12.0
 
     # Verify fifth paragraph
     assert result[4][0].text == "Final paragraph to conclude our test. "
-    assert result[4][0].start_second == 14.0
-    assert result[4][0].end_second == 16.0
+    assert result[4][0].start_second == 13.0
 
     assert result[4][1].text == "With some extra content. "
-    assert result[4][1].start_second == 16.0
-    assert result[4][1].end_second == 17.0
+    assert result[4][1].start_second == 15.0
 
     assert result[4][2].text == "And a final closing statement. "
-    assert result[4][2].start_second == 17.0
-    assert result[4][2].end_second == 18.0
+    assert result[4][2].start_second == 16.0
 
     # Verify SaT was called correctly with complete text
     mock_sat.split.assert_called_once_with(
-        "Hello world! This is a test. Another paragraph here. Only 0.1 percent people get it. "
+        " Hello world! This is a test. Another paragraph here. Only 0.1 percent people get it. "
         "And even more. This is a longer sentence that spans multiple segments and tests "
         "our handling of longer text blocks. Short text. Followed by another. And one "
         "more for good measure. Final paragraph to conclude our test. With some extra "
-        "content. And a final closing statement. ",
+        "content. And a final closing statement.",
         do_paragraph_segmentation=True,
         verbose=True
     )