From c36fceb1353dd5d009d94cf90173f08944e7b6a4 Mon Sep 17 00:00:00 2001
From: linearcombination <4829djaskdfj@gmail.com>
Date: Wed, 23 Oct 2024 10:30:45 -0700
Subject: [PATCH] Preserve text formatted as poetry in STET
---
backend/document/stet/stet.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/backend/document/stet/stet.py b/backend/document/stet/stet.py
index 9700a30f..5af4afac 100644
--- a/backend/document/stet/stet.py
+++ b/backend/document/stet/stet.py
@@ -71,9 +71,12 @@ def split_chapter_into_verses(chapter: USFMChapter) -> dict[str, str]:
verse_number = re.search(r'(\d+)', verse_span)
if verse_number:
verse_number_ = verse_number.group(1)
- # Remove all and
tags and their content from the verse text
+ # Remove all tags and their content from the verse text
verse_text = re.sub(r".*?", "", verse_span)
- verse_text = re.sub(r".*?
", "", verse_text)
+ # Remove
tags that do not have class matching "poetry-"
+ verse_text = re.sub(
+ r"
.*?
", "", verse_text
+ )
# Remove the remaining HTML tags and strip extra spaces
verse_text = re.sub(r"<.*?>", "", verse_text).strip()
logger.debug("verse_number: %s, verse_text: %s", verse_number_, verse_text)