Skip to content

Commit

Permalink
Merge pull request #221 from WycliffeAssociates/stet
Browse files Browse the repository at this point in the history
Stet
  • Loading branch information
linearcombination authored Sep 24, 2024
2 parents 6e3eb74 + 8bf8fff commit 9282262
Show file tree
Hide file tree
Showing 20 changed files with 817 additions and 76 deletions.
5 changes: 5 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libdeflate0 \
# For weasyprint
pango1.0-tools \
# For stet
# pandoc \
# For fc-cache
fontconfig

Expand Down Expand Up @@ -75,7 +77,10 @@ RUN mkdir -p /app/assets_download
RUN mkdir -p /app/working_temp
# Make the output directory where generated HTML and PDFs are placed.
RUN mkdir -p /app/document_output
# Make the directory where stet source documents are stored
RUN mkdir -p /app/stet

COPY backend/document/stet/data/stet_en.docx stet/

COPY pyproject.toml .
COPY ./backend/requirements.txt .
Expand Down
40 changes: 39 additions & 1 deletion backend/document/config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""This module provides configuration values used by the application."""
import logging
from logging import config as lc
from typing import Sequence, final
from typing import Mapping, Sequence, final

import yaml
from pydantic import EmailStr, HttpUrl
Expand Down Expand Up @@ -63,6 +63,27 @@ class Settings(BaseSettings):
"ulb",
"usfm",
]

# This can be expanded to include any additional types (if
# there are any) that we want to be available to users. These are all
# that I found of relevance in the data API.
RESOURCE_TYPE_CODES_AND_NAMES: Mapping[str, str] = {
"ayt": "Bahasa Indonesian Bible",
"bc": "Bible Commentary",
"blv": "Portuguese Bíblia Livre",
"cuv": "新标点和合本",
"f10": "French Louis Segond 1910 Bible",
"nav": "New Arabic Version (Ketab El Hayat)",
"reg": "Bible",
"tn": "Translation Notes",
"tn-condensed": "Condensed Translation Notes",
"tq": "Translation Questions",
"tw": "Translation Words",
# "udb": "Unlocked Dynamic Bible", # Content team doesn't want udb used
"ugnt": "unfoldingWord® Greek New Testament",
"uhb": "unfoldingWord® Hebrew Bible",
"ulb": "Unlocked Literal Bible",
}
SHOW_TN_BOOK_INTRO: bool = False
TN_RESOURCE_TYPE: str = "tn"
EN_TN_CONDENSED_RESOURCE_TYPE: str = "tn-condensed"
Expand Down Expand Up @@ -178,6 +199,20 @@ class Settings(BaseSettings):
"zlm",
]

TEMPLATE_PATHS_MAP: Mapping[str, str] = {
"stet": "backend/templates/mustache/template.mustache",
"stet_html": "backend/templates/html/stet.html",
"book_intro": "backend/templates/tn/book_intro_template.md",
"header_enclosing": "backend/templates/html/header_enclosing.html",
"header_enclosing_landscape": "backend/templates/html/header_enclosing_landscape.html", # used by dft project
"header_no_css_enclosing": "backend/templates/html/header_no_css_enclosing.html",
"header_compact_enclosing": "backend/templates/html/header_compact_enclosing.html",
"footer_enclosing": "backend/templates/html/footer_enclosing.html",
"cover": "backend/templates/html/cover.html",
"email-html": "backend/templates/html/email.html",
"email": "backend/templates/text/email.txt",
}

# fmt: off
BC_ARTICLE_URL_FMT_STR: str = "https://content.bibletranslationtools.org/WycliffeAssociates/en_bc/src/branch/master/{}"
# fmt: on
Expand Down Expand Up @@ -213,6 +248,9 @@ def api_test_url(self) -> str:
# Location where generated PDFs are written.
DOCUMENT_OUTPUT_DIR: str = "document_output"

# Location where stet source Docx document(s) are stored
STET_DIR: str = "stet"

BACKEND_CORS_ORIGINS: list[str]

DOCX_TEMPLATE_PATH: str = "template.docx"
Expand Down
128 changes: 88 additions & 40 deletions backend/document/domain/document_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from celery import current_task
from document.config import settings
from document.domain import parsing, resource_lookup, worker
from document.stet import stet
from document.domain.bible_books import BOOK_NAMES

from document.domain.assembly_strategies.assembly_strategies_book_then_lang_by_chapter import (
Expand All @@ -30,10 +31,10 @@
assemble_content_by_lang_then_book,
)
from document.domain.assembly_strategies_docx import (
assembly_strategies_book_then_lang_by_chapter as asd_book_then_lang,
assembly_strategies_book_then_lang_by_chapter as book_then_lang,
)
from document.domain.assembly_strategies_docx import (
assembly_strategies_lang_then_book_by_chapter as asd_lang_then_book,
assembly_strategies_lang_then_book_by_chapter as lang_then_book,
)
from document.domain.assembly_strategies_docx.assembly_strategy_utils import add_hr
from document.domain.model import (
Expand All @@ -52,7 +53,12 @@
TWNameContentPair,
USFMBook,
)
from document.utils.file_utils import file_needs_update, write_file
from document.utils.file_utils import (
file_needs_update,
template,
template_path,
write_file,
)
from docx import Document # type: ignore
from docx.enum.section import WD_SECTION # type: ignore
from docx.oxml import OxmlElement # type: ignore
Expand All @@ -65,19 +71,6 @@
logger = settings.logger(__name__)


TEMPLATE_PATHS_MAP: Mapping[str, str] = {
"book_intro": "backend/templates/tn/book_intro_template.md",
"header_enclosing": "backend/templates/html/header_enclosing.html",
"header_enclosing_landscape": "backend/templates/html/header_enclosing_landscape.html", # used by dft project
"header_no_css_enclosing": "backend/templates/html/header_no_css_enclosing.html",
"header_compact_enclosing": "backend/templates/html/header_compact_enclosing.html",
"footer_enclosing": "backend/templates/html/footer_enclosing.html",
"cover": "backend/templates/html/cover.html",
"email-html": "backend/templates/html/email.html",
"email": "backend/templates/text/email.txt",
}


def contains_tw(resource_request: ResourceRequest, tw_regex: str = "tw.*") -> bool:
"""Return True if the resource_request describes a TW resource."""
value = bool(re.compile(tw_regex).match(resource_request.resource_type))
Expand Down Expand Up @@ -148,24 +141,6 @@ def document_request_key(
return document_request_key


def template_path(
    key: str, template_paths_map: Mapping[str, str] = TEMPLATE_PATHS_MAP
) -> str:
    """
    Return the path to the requested template given a lookup key.

    The path is read directly from template_paths_map; no adjustment is
    made for the runtime environment. A KeyError propagates if key is
    not present in the mapping.
    """
    return template_paths_map[key]


def template(template_lookup_key: str) -> str:
    """
    Return the text of the template registered under template_lookup_key.

    The lookup key is resolved to a file path via template_path and the
    whole file is read in text mode.
    """
    path = template_path(template_lookup_key)
    with open(path, "r") as template_file:
        return template_file.read()


def instantiated_email_template(document_request_key: str) -> str:
"""
Instantiate Jinja2 template. Return instantiated template as string.
Expand Down Expand Up @@ -361,8 +336,6 @@ def fetch_usfm_book_content_units(
parsing.usfm_book_content(
resource_lookup_dto,
resource_dir,
resource_requests,
False,
)
for resource_lookup_dto, resource_dir in zip(
found_usfm_resource_lookup_dtos, resource_dirs
Expand Down Expand Up @@ -520,7 +493,7 @@ def assemble_docx_content(
document_request.assembly_strategy_kind
== AssemblyStrategyEnum.LANGUAGE_BOOK_ORDER
):
composer = asd_lang_then_book.assemble_content_by_lang_then_book(
composer = lang_then_book.assemble_content_by_lang_then_book(
usfm_books,
tn_books,
tq_books,
Expand All @@ -533,7 +506,7 @@ def assemble_docx_content(
document_request.assembly_strategy_kind
== AssemblyStrategyEnum.BOOK_LANGUAGE_ORDER
):
composer = asd_book_then_lang.assemble_content_by_book_then_lang(
composer = book_then_lang.assemble_content_by_book_then_lang(
usfm_books,
tn_books,
tq_books,
Expand Down Expand Up @@ -716,6 +689,36 @@ def convert_html_to_epub(
logger.debug("Time for converting HTML to ePub: %s", t1 - t0)


# def convert_markdown_to_docx(
# markdown_filepath: str,
# docx_filepath: str,
# ) -> None:
# """Generate Docx and copy it to output directory."""
# t0 = time.time()
# # command = [
# # "pandoc",
# # markdown_filepath,
# # "--from markdown",
# # "--to docx",
# # "--output",
# # docx_filepath,
# # ]
# command = [
# "pandoc",
# markdown_filepath,
# "-o",
# docx_filepath,
# ]
# logger.debug("Generate Docx command: %s", " ".join(command))
# subprocess.run(
# command,
# check=True,
# text=True,
# )
# t1 = time.time()
# logger.debug("Time for converting Markdown to Docx: %s", t1 - t0)


def convert_html_to_docx(
html_filepath: str,
docx_filepath: str,
Expand Down Expand Up @@ -831,10 +834,12 @@ def select_assembly_layout_kind(
usfm_resource_types: Sequence[str] = settings.USFM_RESOURCE_TYPES,
language_book_order: AssemblyStrategyEnum = AssemblyStrategyEnum.LANGUAGE_BOOK_ORDER,
book_language_order: AssemblyStrategyEnum = AssemblyStrategyEnum.BOOK_LANGUAGE_ORDER,
stet_strategy: AssemblyStrategyEnum = AssemblyStrategyEnum.STET_STRATEGY,
one_column_compact: AssemblyLayoutEnum = AssemblyLayoutEnum.ONE_COLUMN_COMPACT,
sl_sr: AssemblyLayoutEnum = AssemblyLayoutEnum.TWO_COLUMN_SCRIPTURE_LEFT_SCRIPTURE_RIGHT,
sl_sr_compact: AssemblyLayoutEnum = AssemblyLayoutEnum.TWO_COLUMN_SCRIPTURE_LEFT_SCRIPTURE_RIGHT_COMPACT,
one_column: AssemblyLayoutEnum = AssemblyLayoutEnum.ONE_COLUMN,
stet_layout: AssemblyLayoutEnum = AssemblyLayoutEnum.STET_LAYOUT,
) -> AssemblyLayoutEnum:
"""
Make an intelligent choice of what layout to use given the
Expand All @@ -855,6 +860,8 @@ def select_assembly_layout_kind(
and document_request.assembly_layout_kind
):
return document_request.assembly_layout_kind
if document_request.assembly_strategy_kind == stet_strategy:
return stet_layout
if (
document_request.layout_for_print
and document_request.assembly_strategy_kind == language_book_order
Expand Down Expand Up @@ -1068,11 +1075,52 @@ def generate_document(
return document_request_key_


@worker.app.task
def generate_stet_docx_document(
    lang0_code: str,
    lang1_code: str,
    email_address: str,
) -> Json[str]:
    """
    Task entry point that produces the stet Docx document for a pair of
    language codes.

    The document key is "<lang0_code>_<lang1_code>_stet". If the Docx
    file for that key does not need updating, the cached file is reused
    and no email is sent. Otherwise the document is generated and, when
    should_send_email approves the address, mailed as an attachment.

    Returns the document key in every case.
    """
    logger.debug(
        "passed args: lang0_code: %s, lang1_code: %s, email_adress: %s",
        lang0_code,
        lang1_code,
        email_address,
    )
    stet_key = f"{lang0_code}_{lang1_code}_stet"
    output_path = docx_filepath(stet_key)

    # Guard clause: an up-to-date file on disk means there is nothing to do.
    if not file_needs_update(output_path):
        logger.debug("Cache hit for %s", output_path)
        return stet_key

    current_task.update_state(state="Converting to Docx")
    stet.generate_docx_document(lang0_code, lang1_code, stet_key, output_path)

    if not should_send_email(email_address):
        return stet_key

    docx_attachment = Attachment(
        filepath=output_path,
        mime_type=(
            "application",
            "vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
    )
    current_task.update_state(state="Sending email")
    send_email_with_attachment(
        email_address,
        [docx_attachment],
        stet_key,
    )
    return stet_key


@worker.app.task
def generate_docx_document(
document_request_json: Json[Any],
output_dir: str = settings.DOCUMENT_OUTPUT_DIR,
) -> Json[Any]:
) -> Json[str]:
"""
This is the alternative entry point for Docx document creation only.
"""
Expand Down Expand Up @@ -1152,7 +1200,7 @@ def generate_docx_document(
# underlying HTML content to see if it contains verses and display a
# message in the document to the end user if it does not (so that they
# get some indication of why the scripture is missing).

#
# Construct sensical phrases to display for title1 and title2 on first
# page of Word document.
title1, title2 = get_languages_title_page_strings(found_resource_lookup_dtos)
Expand Down
12 changes: 11 additions & 1 deletion backend/document/domain/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class AssemblyStrategyEnum(str, Enum):

LANGUAGE_BOOK_ORDER = "lbo"
BOOK_LANGUAGE_ORDER = "blo"
STET_STRATEGY = "stet"


@final
Expand Down Expand Up @@ -76,6 +77,7 @@ class AssemblyLayoutEnum(str, Enum):
TWO_COLUMN_SCRIPTURE_LEFT_SCRIPTURE_RIGHT = "2c_sl_sr"
TWO_COLUMN_SCRIPTURE_LEFT_SCRIPTURE_RIGHT_COMPACT = "2c_sl_sr_c"
# fmt: on
STET_LAYOUT = "stet"


@final
Expand Down Expand Up @@ -122,6 +124,13 @@ class DocumentRequestSourceEnum(str, Enum):
BIEL_UI = "biel_ui"


@final
class StetDocumentRequest(BaseModel):
    """
    Request model for a stet document: two language codes and an email
    address.
    """

    # NOTE(review): presumably lang0 is the source language and lang1 the
    # target language of the stet document - confirm against the stet module.
    lang0_code: str
    lang1_code: str
    # Email address submitted with the request.
    # NOTE(review): typed as plain str rather than EmailStr, so no format
    # validation happens here - confirm whether an empty value means "no email".
    email_address: str


@final
class DocumentRequest(BaseModel):
"""
Expand Down Expand Up @@ -424,7 +433,7 @@ class BCBook(NamedTuple):


@final
class USFMChapter(NamedTuple):
class USFMChapter(BaseModel):
"""
A class to hold the USFM converted to HTML content for a chapter
in total (including things like 'chunk breaks' and other verse
Expand All @@ -437,6 +446,7 @@ class USFMChapter(NamedTuple):
"""

content: str
verses: Optional[dict[VerseRef, str]]


@final
Expand Down
6 changes: 3 additions & 3 deletions backend/document/domain/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,6 @@ def get_chapter_num(chapter_usfm_text: str) -> int:
def usfm_book_content(
resource_lookup_dto: ResourceLookupDto,
resource_dir: str,
resource_requests: Sequence[ResourceRequest],
layout_for_print: bool,
) -> USFMBook:
"""
First produce HTML content from USFM content and then break the
Expand All @@ -280,6 +278,7 @@ def usfm_book_content(
)
usfm_chapters[chapter_num] = USFMChapter(
content=chapter_html_content if chapter_html_content else "",
verses=None,
)
return USFMBook(
lang_code=resource_lookup_dto.lang_code,
Expand Down Expand Up @@ -698,7 +697,8 @@ def books(
for resource_lookup_dto, resource_dir in zip(resource_lookup_dtos, resource_dirs):
if resource_lookup_dto.resource_type in usfm_resource_types:
usfm_book = usfm_book_content(
resource_lookup_dto, resource_dir, resource_requests, layout_for_print
resource_lookup_dto,
resource_dir,
)
usfm_books.append(usfm_book)
elif (
Expand Down
Loading

0 comments on commit 9282262

Please sign in to comment.