Merge pull request #221 from WycliffeAssociates/stet

Stet
WycliffeAssociates · Sep 24, 2024 · 9282262 · 9282262
2 parents 6e3eb74 + 8bf8fff
commit 9282262
Show file tree

Hide file tree

Showing 20 changed files with 817 additions and 76 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -25,6 +25,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libdeflate0 \
     # For weasyprint
     pango1.0-tools \
+    # For stet
+    # pandoc \
     # For fc-cache
     fontconfig
 
@@ -75,7 +77,10 @@ RUN mkdir -p /app/assets_download
 RUN mkdir -p /app/working_temp
 # Make the output directory where generated HTML and PDFs are placed.
 RUN mkdir -p /app/document_output
+# Make the directory where stet source documents are stored
+RUN mkdir -p /app/stet
 
+COPY backend/document/stet/data/stet_en.docx stet/
 
 COPY pyproject.toml .
 COPY ./backend/requirements.txt .

diff --git a/backend/document/config.py b/backend/document/config.py
@@ -1,7 +1,7 @@
 """This module provides configuration values used by the application."""
 import logging
 from logging import config as lc
-from typing import Sequence, final
+from typing import Mapping, Sequence, final
 
 import yaml
 from pydantic import EmailStr, HttpUrl
@@ -63,6 +63,27 @@ class Settings(BaseSettings):
         "ulb",
         "usfm",
     ]
+
+    # Maps resource type codes to display names. Extend this with any
+    # additional types we want to make available to users; these are the
+    # only relevant ones found in the data API so far.
+    RESOURCE_TYPE_CODES_AND_NAMES: Mapping[str, str] = {
+        "ayt": "Bahasa Indonesian Bible",
+        "bc": "Bible Commentary",
+        "blv": "Portuguese Bíblia Livre",
+        "cuv": "新标点和合本",
+        "f10": "French Louis Segond 1910 Bible",
+        "nav": "New Arabic Version (Ketab El Hayat)",
+        "reg": "Bible",
+        "tn": "Translation Notes",
+        "tn-condensed": "Condensed Translation Notes",
+        "tq": "Translation Questions",
+        "tw": "Translation Words",
+        # "udb": "Unlocked Dynamic Bible",  # Content team doesn't want udb used
+        "ugnt": "unfoldingWord® Greek New Testament",
+        "uhb": "unfoldingWord® Hebrew Bible",
+        "ulb": "Unlocked Literal Bible",
+    }
     SHOW_TN_BOOK_INTRO: bool = False
     TN_RESOURCE_TYPE: str = "tn"
     EN_TN_CONDENSED_RESOURCE_TYPE: str = "tn-condensed"
@@ -178,6 +199,20 @@ class Settings(BaseSettings):
         "zlm",
     ]
 
+    TEMPLATE_PATHS_MAP: Mapping[str, str] = {
+        "stet": "backend/templates/mustache/template.mustache",
+        "stet_html": "backend/templates/html/stet.html",
+        "book_intro": "backend/templates/tn/book_intro_template.md",
+        "header_enclosing": "backend/templates/html/header_enclosing.html",
+        "header_enclosing_landscape": "backend/templates/html/header_enclosing_landscape.html",  # used by dft project
+        "header_no_css_enclosing": "backend/templates/html/header_no_css_enclosing.html",
+        "header_compact_enclosing": "backend/templates/html/header_compact_enclosing.html",
+        "footer_enclosing": "backend/templates/html/footer_enclosing.html",
+        "cover": "backend/templates/html/cover.html",
+        "email-html": "backend/templates/html/email.html",
+        "email": "backend/templates/text/email.txt",
+    }
+
     # fmt: off
     BC_ARTICLE_URL_FMT_STR: str = "https://content.bibletranslationtools.org/WycliffeAssociates/en_bc/src/branch/master/{}"
     # fmt: on
@@ -213,6 +248,9 @@ def api_test_url(self) -> str:
     # Location where generated PDFs are written.
     DOCUMENT_OUTPUT_DIR: str = "document_output"
 
+    # Location where stet source Docx document(s) are stored
+    STET_DIR: str = "stet"
+
     BACKEND_CORS_ORIGINS: list[str]
 
     DOCX_TEMPLATE_PATH: str = "template.docx"

diff --git a/backend/document/domain/document_generator.py b/backend/document/domain/document_generator.py
@@ -21,6 +21,7 @@
 from celery import current_task
 from document.config import settings
 from document.domain import parsing, resource_lookup, worker
+from document.stet import stet
 from document.domain.bible_books import BOOK_NAMES
 
 from document.domain.assembly_strategies.assembly_strategies_book_then_lang_by_chapter import (
@@ -30,10 +31,10 @@
     assemble_content_by_lang_then_book,
 )
 from document.domain.assembly_strategies_docx import (
-    assembly_strategies_book_then_lang_by_chapter as asd_book_then_lang,
+    assembly_strategies_book_then_lang_by_chapter as book_then_lang,
 )
 from document.domain.assembly_strategies_docx import (
-    assembly_strategies_lang_then_book_by_chapter as asd_lang_then_book,
+    assembly_strategies_lang_then_book_by_chapter as lang_then_book,
 )
 from document.domain.assembly_strategies_docx.assembly_strategy_utils import add_hr
 from document.domain.model import (
@@ -52,7 +53,12 @@
     TWNameContentPair,
     USFMBook,
 )
-from document.utils.file_utils import file_needs_update, write_file
+from document.utils.file_utils import (
+    file_needs_update,
+    template,
+    template_path,
+    write_file,
+)
 from docx import Document  # type: ignore
 from docx.enum.section import WD_SECTION  # type: ignore
 from docx.oxml import OxmlElement  # type: ignore
@@ -65,19 +71,6 @@
 logger = settings.logger(__name__)
 
 
-TEMPLATE_PATHS_MAP: Mapping[str, str] = {
-    "book_intro": "backend/templates/tn/book_intro_template.md",
-    "header_enclosing": "backend/templates/html/header_enclosing.html",
-    "header_enclosing_landscape": "backend/templates/html/header_enclosing_landscape.html",  # used by dft project
-    "header_no_css_enclosing": "backend/templates/html/header_no_css_enclosing.html",
-    "header_compact_enclosing": "backend/templates/html/header_compact_enclosing.html",
-    "footer_enclosing": "backend/templates/html/footer_enclosing.html",
-    "cover": "backend/templates/html/cover.html",
-    "email-html": "backend/templates/html/email.html",
-    "email": "backend/templates/text/email.txt",
-}
-
-
 def contains_tw(resource_request: ResourceRequest, tw_regex: str = "tw.*") -> bool:
     """Return True if the resource_request describes a TW resource."""
     value = bool(re.compile(tw_regex).match(resource_request.resource_type))
@@ -148,24 +141,6 @@ def document_request_key(
         return document_request_key
 
 
-def template_path(
-    key: str, template_paths_map: Mapping[str, str] = TEMPLATE_PATHS_MAP
-) -> str:
-    """
-    Return the path to the requested template give a lookup key.
-    Return a different path if the code is running inside the Docker
-    container.
-    """
-    return template_paths_map[key]
-
-
-def template(template_lookup_key: str) -> str:
-    """Return template as string."""
-    with open(template_path(template_lookup_key), "r") as filepath:
-        template = filepath.read()
-    return template
-
-
 def instantiated_email_template(document_request_key: str) -> str:
     """
     Instantiate Jinja2 template. Return instantiated template as string.
@@ -361,8 +336,6 @@ def fetch_usfm_book_content_units(
         parsing.usfm_book_content(
             resource_lookup_dto,
             resource_dir,
-            resource_requests,
-            False,
         )
         for resource_lookup_dto, resource_dir in zip(
             found_usfm_resource_lookup_dtos, resource_dirs
@@ -520,7 +493,7 @@ def assemble_docx_content(
         document_request.assembly_strategy_kind
         == AssemblyStrategyEnum.LANGUAGE_BOOK_ORDER
     ):
-        composer = asd_lang_then_book.assemble_content_by_lang_then_book(
+        composer = lang_then_book.assemble_content_by_lang_then_book(
             usfm_books,
             tn_books,
             tq_books,
@@ -533,7 +506,7 @@ def assemble_docx_content(
         document_request.assembly_strategy_kind
         == AssemblyStrategyEnum.BOOK_LANGUAGE_ORDER
     ):
-        composer = asd_book_then_lang.assemble_content_by_book_then_lang(
+        composer = book_then_lang.assemble_content_by_book_then_lang(
             usfm_books,
             tn_books,
             tq_books,
@@ -716,6 +689,36 @@ def convert_html_to_epub(
     logger.debug("Time for converting HTML to ePub: %s", t1 - t0)
 
 
+# def convert_markdown_to_docx(
+#     markdown_filepath: str,
+#     docx_filepath: str,
+# ) -> None:
+#     """Generate Docx and copy it to output directory."""
+#     t0 = time.time()
+#     # command = [
+#     #     "pandoc",
+#     #     markdown_filepath,
+#     #     "--from markdown",
+#     #     "--to docx",
+#     #     "--output",
+#     #     docx_filepath,
+#     # ]
+#     command = [
+#         "pandoc",
+#         markdown_filepath,
+#         "-o",
+#         docx_filepath,
+#     ]
+#     logger.debug("Generate Docx command: %s", " ".join(command))
+#     subprocess.run(
+#         command,
+#         check=True,
+#         text=True,
+#     )
+#     t1 = time.time()
+#     logger.debug("Time for converting Markdown to Docx: %s", t1 - t0)
+
+
 def convert_html_to_docx(
     html_filepath: str,
     docx_filepath: str,
@@ -831,10 +834,12 @@ def select_assembly_layout_kind(
     usfm_resource_types: Sequence[str] = settings.USFM_RESOURCE_TYPES,
     language_book_order: AssemblyStrategyEnum = AssemblyStrategyEnum.LANGUAGE_BOOK_ORDER,
     book_language_order: AssemblyStrategyEnum = AssemblyStrategyEnum.BOOK_LANGUAGE_ORDER,
+    stet_strategy: AssemblyStrategyEnum = AssemblyStrategyEnum.STET_STRATEGY,
     one_column_compact: AssemblyLayoutEnum = AssemblyLayoutEnum.ONE_COLUMN_COMPACT,
     sl_sr: AssemblyLayoutEnum = AssemblyLayoutEnum.TWO_COLUMN_SCRIPTURE_LEFT_SCRIPTURE_RIGHT,
     sl_sr_compact: AssemblyLayoutEnum = AssemblyLayoutEnum.TWO_COLUMN_SCRIPTURE_LEFT_SCRIPTURE_RIGHT_COMPACT,
     one_column: AssemblyLayoutEnum = AssemblyLayoutEnum.ONE_COLUMN,
+    stet_layout: AssemblyLayoutEnum = AssemblyLayoutEnum.STET_LAYOUT,
 ) -> AssemblyLayoutEnum:
     """
     Make an intelligent choice of what layout to use given the
@@ -855,6 +860,8 @@ def select_assembly_layout_kind(
         and document_request.assembly_layout_kind
     ):
         return document_request.assembly_layout_kind
+    if document_request.assembly_strategy_kind == stet_strategy:
+        return stet_layout
     if (
         document_request.layout_for_print
         and document_request.assembly_strategy_kind == language_book_order
@@ -1068,11 +1075,52 @@ def generate_document(
     return document_request_key_
 
 
+@worker.app.task
+def generate_stet_docx_document(
+    lang0_code: str,
+    lang1_code: str,
+    email_address: str,
+) -> Json[str]:
+    logger.debug(
+        "passed args: lang0_code: %s, lang1_code: %s, email_adress: %s",  # NOTE(review): "email_adress" is misspelled in this log text
+        lang0_code,
+        lang1_code,
+        email_address,
+    )
+    document_request_key_ = f"{lang0_code}_{lang1_code}_stet"  # also this task's return value
+    docx_filepath_ = docx_filepath(document_request_key_)
+    if file_needs_update(docx_filepath_):  # otherwise handled as a cache hit in the else branch below
+        current_task.update_state(state="Converting to Docx")
+        stet.generate_docx_document(
+            lang0_code, lang1_code, document_request_key_, docx_filepath_
+        )
+        # convert_markdown_to_docx(markdown_filepath, docx_filepath_)
+        if should_send_email(email_address):  # NOTE(review): nested in the regenerate branch, so no email goes out on a cache hit - confirm intended
+            attachments = [
+                Attachment(
+                    filepath=docx_filepath_,
+                    mime_type=(
+                        "application",
+                        "vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    ),
+                )
+            ]
+            current_task.update_state(state="Sending email")
+            send_email_with_attachment(
+                email_address,
+                attachments,
+                document_request_key_,
+            )
+    else:
+        logger.debug("Cache hit for %s", docx_filepath_)
+    return document_request_key_
+
+
 @worker.app.task
 def generate_docx_document(
     document_request_json: Json[Any],
     output_dir: str = settings.DOCUMENT_OUTPUT_DIR,
-) -> Json[Any]:
+) -> Json[str]:
     """
     This is the alternative entry point for Docx document creation only.
     """
@@ -1152,7 +1200,7 @@ def generate_docx_document(
         # underlying HTML content to see if it contains verses and display a
         # message in the document to the end user if it does not (so that they
         # get some indication of why the scripture is missing).
-
+        #
         # Construct sensical phrases to display for title1 and title2 on first
         # page of Word document.
         title1, title2 = get_languages_title_page_strings(found_resource_lookup_dtos)

diff --git a/backend/document/domain/model.py b/backend/document/domain/model.py
@@ -35,6 +35,7 @@ class AssemblyStrategyEnum(str, Enum):
 
     LANGUAGE_BOOK_ORDER = "lbo"
     BOOK_LANGUAGE_ORDER = "blo"
+    STET_STRATEGY = "stet"
 
 
 @final
@@ -76,6 +77,7 @@ class AssemblyLayoutEnum(str, Enum):
     TWO_COLUMN_SCRIPTURE_LEFT_SCRIPTURE_RIGHT = "2c_sl_sr"
     TWO_COLUMN_SCRIPTURE_LEFT_SCRIPTURE_RIGHT_COMPACT = "2c_sl_sr_c"
     # fmt: on
+    STET_LAYOUT = "stet"
 
 
 @final
@@ -122,6 +124,13 @@ class DocumentRequestSourceEnum(str, Enum):
     BIEL_UI = "biel_ui"
 
 
+@final
+class StetDocumentRequest(BaseModel):  # request model for a stet document: two language codes plus an email address
+    lang0_code: str
+    lang1_code: str
+    email_address: str  # plain str: this annotation adds no email-format validation
+
+
 @final
 class DocumentRequest(BaseModel):
     """
@@ -424,7 +433,7 @@ class BCBook(NamedTuple):
 
 
 @final
-class USFMChapter(NamedTuple):
+class USFMChapter(BaseModel):
     """
     A class to hold the USFM converted to HTML content for a chapter
     in total (including things like 'chunk breaks' and other verse
@@ -437,6 +446,7 @@ class USFMChapter(NamedTuple):
     """
 
     content: str
+    verses: Optional[dict[VerseRef, str]]
 
 
 @final

diff --git a/backend/document/domain/parsing.py b/backend/document/domain/parsing.py
@@ -259,8 +259,6 @@ def get_chapter_num(chapter_usfm_text: str) -> int:
 def usfm_book_content(
     resource_lookup_dto: ResourceLookupDto,
     resource_dir: str,
-    resource_requests: Sequence[ResourceRequest],
-    layout_for_print: bool,
 ) -> USFMBook:
     """
     First produce HTML content from USFM content and then break the
@@ -280,6 +278,7 @@ def usfm_book_content(
             )
             usfm_chapters[chapter_num] = USFMChapter(
                 content=chapter_html_content if chapter_html_content else "",
+                verses=None,
             )
     return USFMBook(
         lang_code=resource_lookup_dto.lang_code,
@@ -698,7 +697,8 @@ def books(
     for resource_lookup_dto, resource_dir in zip(resource_lookup_dtos, resource_dirs):
         if resource_lookup_dto.resource_type in usfm_resource_types:
             usfm_book = usfm_book_content(
-                resource_lookup_dto, resource_dir, resource_requests, layout_for_print
+                resource_lookup_dto,
+                resource_dir,
             )
             usfm_books.append(usfm_book)
         elif (