Skip to content

Commit

Permalink
✨ Don't recombine paragraphs which only contain non-word text
Browse files Browse the repository at this point in the history
e.g. [Music] or *Applause*
  • Loading branch information
pajowu committed Sep 20, 2023
1 parent 7f71663 commit 6fd5d37
Showing 1 changed file with 12 additions and 0 deletions.
12 changes: 12 additions & 0 deletions worker/transcribee_worker/whisper_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@
r".*\d\.\s?$"
), # Don't split on numerals followed by a dot, e.g. "during the 20. century"
]
# Regexes that protect a paragraph from being recombined.
#
# A paragraph is protected when its whole text is a single bracketed or
# asterisk-wrapped marker with no whitespace inside it. Whitespace around the
# marker is tolerated so that a stray leading/trailing space in the paragraph
# text does not defeat the anchors and let the marker be merged into the
# neighbouring sentences.
DONT_COMBINE_RES = [
    re.compile(r"^\s*\[[^\s]*\]\s*$"),  # [MUSIC]
    re.compile(r"^\s*\*[^\s]*\*\s*$"),  # *Applause*
]


def get_model_file(model_name: str):
Expand Down Expand Up @@ -262,6 +267,13 @@ async def strict_sentence_paragraphs(
lang=paragraph.lang, speaker=paragraph.speaker, children=[]
)

elif any(regex.search(paragraph.text()) for regex in DONT_COMBINE_RES):
if acc_paragraph.children:
yield acc_paragraph
acc_paragraph = None
yield paragraph
continue

locale = Locale(paragraph.lang)
sentence_iter = BreakIterator.createSentenceInstance(locale)
sentence_iter.setText(acc_paragraph.text() + paragraph.text())
Expand Down

0 comments on commit 6fd5d37

Please sign in to comment.