Skip to content

Commit

Permalink
Merge pull request #231 from WycliffeAssociates/usfm-error-detection-…
Browse files Browse the repository at this point in the history
…and-fix-programmatically

Usfm error detection and fix programmatically
  • Loading branch information
linearcombination authored Dec 5, 2024
2 parents 642cd7d + 16e74c1 commit b1b233b
Show file tree
Hide file tree
Showing 27 changed files with 2,322 additions and 396 deletions.
13 changes: 12 additions & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ FROM_EMAIL_ADDRESS = "[email protected]"
SMTP_HOST = "https://example.com"
SMTP_PORT = 111
SMTP_PASSWORD = "fakepass"
SEND_EMAIL = False
SEND_EMAIL = false

# The port to pass to gunicorn via ./backend/gunicorn.conf.py
PORT=5005
Expand All @@ -45,6 +45,17 @@ USE_GIT_CLI=true
# future, if desired, we could have them shown by selection
SHOW_TN_BOOK_INTRO=false

# If true, then check USFM content for certain USFM structural defects
# and attempt a fix so that the USFM can then be parsed.
CHECK_USFM=true

# When detecting and potentially correcting defective USFM source
# text: true means check every book of any language that has at least
# one book with a known USFM defect; false means check only the
# specific books declared as defective for that language in
# parsing.RESOURCES_WITH_USFM_DEFECTS.
CHECK_ALL_BOOKS_FOR_LANGUAGE=true

# * http://localhost:3000 covers requests originating from the case
# where 'npm run dev' is invoked to run vite (to run svelte js frontend)
# outside Docker. This results in vite's development mode which runs on
Expand Down
197 changes: 2 additions & 195 deletions backend/document/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,35 +20,6 @@ class Settings(BaseSettings):
# GITHUB_API_TOKEN: str = "FOO" # This might be used in a later version
DATA_API_URL: HttpUrl

LTR_DIRECTION_HTML: str = "<div style='direction: ltr;'>"
RTL_DIRECTION_HTML: str = "<div style='direction: rtl;'>"

END_OF_CHAPTER_HTML: str = '<div class="end-of-chapter"></div>'
RESOURCE_TYPE_NAME_FMT_STR: str = "<h2>{}</h2>"
TN_VERSE_NOTES_ENCLOSING_DIV_FMT_STR: str = "<div style='column-count: 2;'>{}</div>"
TQ_HEADING_AND_QUESTIONS_FMT_STR: str = (
"<h3>{}</h3>\n<div style='column-count: 2;'>{}</div>"
)
HTML_ROW_BEGIN: str = "<div class='row'>"
HTML_ROW_END: str = "</div>"
HTML_COLUMN_BEGIN: str = "<div class='column'>"
HTML_COLUMN_END: str = "</div>"
HTML_COLUMN_LEFT_BEGIN: str = "<div class='column-left'>"
HTML_COLUMN_RIGHT_BEGIN: str = "<div class='column-right'>"
BOOK_NAME_FMT_STR: str = "<h2 style='text-align: center;'>{}</h2>"
CHAPTER_HEADER_FMT_STR: str = '<h2 class="chapter">Chapter {}</h2>'
UNORDERED_LIST_BEGIN_STR: str = "<ul>"
UNORDERED_LIST_END_STR: str = "</ul>"
OPENING_H3_FMT_STR: str = "<h3>{}"
OPENING_H3_WITH_ID_FMT_STR: str = '<h3 id="{}-{}">{}'
TRANSLATION_WORD_ANCHOR_LINK_FMT_STR: str = "[{}](#{}-{})"
TRANSLATION_WORD_PREFIX_ANCHOR_LINK_FMT_STR: str = "({}: [{}](#{}-{}))"
TRANSLATION_WORD_PREFIX_FMT_STR: str = "({}: {})"
# TODO This needs to be changed to the .NET USFM renderer's marker
# pattern. This is the USFM-Tools singlePageRenderer's expected output,
# i.e., the output from the previous renderer.
TRANSLATION_NOTE_ANCHOR_LINK_FMT_STR: str = "[{}](#{}-{}-ch-{}-v-{})"

USFM_RESOURCE_TYPES: Sequence[str] = [
"avd",
"ayt",
Expand All @@ -64,158 +35,10 @@ class Settings(BaseSettings):
"usfm",
]

# This can be expanded to include any additional types (if
# there are any) that we want to be available to users. These are all
# that I found of relevance in the data API.
RESOURCE_TYPE_CODES_AND_NAMES: Mapping[str, str] = {
"ayt": "Bahasa Indonesian Bible",
"bc": "Bible Commentary",
"blv": "Portuguese Bíblia Livre",
"cuv": "新标点和合本",
"f10": "French Louis Segond 1910 Bible",
"nav": "New Arabic Version (Ketab El Hayat)",
"reg": "Bible",
"tn": "Translation Notes",
"tn-condensed": "Condensed Translation Notes",
"tq": "Translation Questions",
"tw": "Translation Words",
# "udb": "Unlocked Dynamic Bible", # Content team doesn't want udb used
"ugnt": "unfoldingWord® Greek New Testament",
"uhb": "unfoldingWord® Hebrew Bible",
"ulb": "Unlocked Literal Bible",
}
SHOW_TN_BOOK_INTRO: bool = False
TN_RESOURCE_TYPE: str = "tn"
EN_TN_CONDENSED_RESOURCE_TYPE: str = "tn-condensed"
TQ_RESOURCE_TYPE: str = "tq"
TW_RESOURCE_TYPE: str = "tw"
BC_RESOURCE_TYPE: str = "bc"
NON_USFM_RESOURCE_TYPES: Sequence[str] = [
TN_RESOURCE_TYPE,
EN_TN_CONDENSED_RESOURCE_TYPE,
TQ_RESOURCE_TYPE,
TW_RESOURCE_TYPE,
BC_RESOURCE_TYPE,
]
# NOTE This is only used to check whether a lang_code is in the collection;
# if it is not, the language is a heart language. Eventually the GraphQL
# data API may provide a gateway/heart boolean value.
GATEWAY_LANGUAGES: Sequence[str] = [
"abs",
"aju",
"am",
"apd",
"ar",
"ar-x-dcv",
"ary",
"arz",
"as",
"ase",
"bem",
"bg",
"bgw",
"bi",
"bn",
"ceb",
"cmn",
"cmn-x-omc",
"csl",
"dz",
"en",
"es",
"es-419",
"fa",
"fil",
"fr",
"grt",
"gu",
"gug",
"ha",
"hbs",
"hca",
"he",
"hi",
"hne",
"hu",
"id",
"id-x-dcv",
"idb",
"ilo",
"ins",
"ja",
"jv",
"kas",
"km",
"kn",
"lbj",
"ln",
"lo",
"mai",
"mg",
"ml",
"mn",
"mni",
"mnk",
"mr",
"ms",
"my",
"ne",
"nl",
"npi",
"or",
"pa",
"pbt",
"pes",
"pis",
"plt",
"pmy",
"pnb",
"prs",
"ps",
"psr",
"pt",
"pt-br",
"raj",
"rsl",
"ru",
"rwr",
"sn",
"sw",
"swc",
"swh",
"ta",
"te",
"th",
"ti",
"tl",
"tn",
"tpi",
"tr",
"tsg",
"ug",
"ur",
"vi",
"zh",
"zlm",
]

TEMPLATE_PATHS_MAP: Mapping[str, str] = {
"stet": "backend/templates/mustache/template.mustache",
"stet_html": "backend/templates/html/stet.html",
"book_intro": "backend/templates/tn/book_intro_template.md",
"header_enclosing": "backend/templates/html/header_enclosing.html",
"header_enclosing_landscape": "backend/templates/html/header_enclosing_landscape.html", # used by dft project
"header_no_css_enclosing": "backend/templates/html/header_no_css_enclosing.html",
"header_compact_enclosing": "backend/templates/html/header_compact_enclosing.html",
"footer_enclosing": "backend/templates/html/footer_enclosing.html",
"cover": "backend/templates/html/cover.html",
"email-html": "backend/templates/html/email.html",
"email": "backend/templates/text/email.txt",
}

# fmt: off
BC_ARTICLE_URL_FMT_STR: str = "https://content.bibletranslationtools.org/WycliffeAssociates/en_bc/src/branch/master/{}"
# fmt: on
CHECK_USFM: bool
CHECK_ALL_BOOKS_FOR_LANGUAGE: bool

def logger(self, name: str) -> logging.Logger:
"""
Expand Down Expand Up @@ -264,15 +87,6 @@ def api_test_url(self) -> str:
# case of the final PDF). In hours.
ASSET_CACHING_PERIOD: int

# The Markdown section titles that our
# Python-Markdown remove_section_processor extension should remove.
MARKDOWN_SECTIONS_TO_REMOVE: list[str] = [
"Examples from the Bible stories",
"Links",
"Picture of",
"Pictures",
]

EMAIL_SEND_SUBJECT: str
TO_EMAIL_ADDRESS: EmailStr

Expand All @@ -288,13 +102,6 @@ def api_test_url(self) -> str:
# Used by docker
IMAGE_TAG: str

# User agent value required by domain host to allow serving
# files. Other values could possibly also work.
USER_AGENT: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"

# Used in assembly_strategy_utils module when zero-filling various strings
NUM_ZEROS: int = 3

model_config = SettingsConfigDict(env_file=".env", case_sensitive=True)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@

logger = settings.logger(__name__)

# HTML fragments used as default argument values by the assemble_*
# functions in this module. This commit defines them at module level
# in place of the matching attributes previously read from `settings`.
HTML_ROW_BEGIN: str = "<div class='row'>"
HTML_ROW_END: str = "</div>"
HTML_COLUMN_BEGIN: str = "<div class='column'>"
HTML_COLUMN_END: str = "</div>"
HTML_COLUMN_LEFT_BEGIN: str = "<div class='column-left'>"
HTML_COLUMN_RIGHT_BEGIN: str = "<div class='column-right'>"
# Empty marker <div> carrying the "end-of-chapter" CSS class.
END_OF_CHAPTER_HTML: str = '<div class="end-of-chapter"></div>'
# Format string with one `{}` placeholder rendered inside a centered <h2>.
BOOK_NAME_FMT_STR: str = "<h2 style='text-align: center;'>{}</h2>"


def assemble_content_by_book_then_lang(
usfm_books: Sequence[USFMBook],
Expand Down Expand Up @@ -165,12 +174,12 @@ def assemble_usfm_by_chapter(
tq_books: Sequence[TQBook],
tw_books: Sequence[TWBook],
bc_books: Sequence[BCBook],
end_of_chapter_html: str = settings.END_OF_CHAPTER_HTML,
end_of_chapter_html: str = END_OF_CHAPTER_HTML,
close_direction_html: str = "</div>",
hr: str = "<hr/>",
book_chapters: Mapping[str, int] = BOOK_CHAPTERS,
show_tn_book_intro: bool = settings.SHOW_TN_BOOK_INTRO,
fmt_str: str = settings.BOOK_NAME_FMT_STR,
fmt_str: str = BOOK_NAME_FMT_STR,
) -> str:
"""
Construct the HTML wherein at least one USFM resource exists, one column
Expand Down Expand Up @@ -270,7 +279,7 @@ def assemble_tn_by_chapter(
tq_books: Sequence[TQBook],
tw_books: Sequence[TWBook],
bc_books: Sequence[BCBook],
end_of_chapter_html: str = settings.END_OF_CHAPTER_HTML,
end_of_chapter_html: str = END_OF_CHAPTER_HTML,
close_direction_html: str = "</div>",
book_chapters: Mapping[str, int] = BOOK_CHAPTERS,
show_tn_book_intro: bool = settings.SHOW_TN_BOOK_INTRO,
Expand Down Expand Up @@ -344,7 +353,7 @@ def assemble_tq_by_chapter(
tq_books: Sequence[TQBook],
tw_books: Sequence[TWBook],
bc_books: Sequence[BCBook],
end_of_chapter_html: str = settings.END_OF_CHAPTER_HTML,
end_of_chapter_html: str = END_OF_CHAPTER_HTML,
close_direction_html: str = "</div>",
book_chapters: Mapping[str, int] = BOOK_CHAPTERS,
) -> str:
Expand Down Expand Up @@ -394,7 +403,7 @@ def assemble_tw_by_chapter(
tq_books: Sequence[TQBook],
tw_books: Sequence[TWBook],
bc_books: Sequence[BCBook],
end_of_chapter_html: str = settings.END_OF_CHAPTER_HTML,
end_of_chapter_html: str = END_OF_CHAPTER_HTML,
) -> str:
content = []

Expand All @@ -416,15 +425,15 @@ def assemble_usfm_by_chapter_2c_sl_sr(
tq_books: Sequence[TQBook],
tw_books: Sequence[TWBook],
bc_books: Sequence[BCBook],
html_row_begin: str = settings.HTML_ROW_BEGIN,
html_column_begin: str = settings.HTML_COLUMN_BEGIN,
html_column_left_begin: str = settings.HTML_COLUMN_LEFT_BEGIN,
html_column_right_begin: str = settings.HTML_COLUMN_RIGHT_BEGIN,
html_column_end: str = settings.HTML_COLUMN_END,
html_row_end: str = settings.HTML_ROW_END,
html_row_begin: str = HTML_ROW_BEGIN,
html_column_begin: str = HTML_COLUMN_BEGIN,
html_column_left_begin: str = HTML_COLUMN_LEFT_BEGIN,
html_column_right_begin: str = HTML_COLUMN_RIGHT_BEGIN,
html_column_end: str = HTML_COLUMN_END,
html_row_end: str = HTML_ROW_END,
close_direction_html: str = "</div>",
book_chapters: Mapping[str, int] = BOOK_CHAPTERS,
fmt_str: str = settings.BOOK_NAME_FMT_STR,
fmt_str: str = BOOK_NAME_FMT_STR,
) -> str:
"""
Construct the HTML for the two column scripture left scripture
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@

logger = settings.logger(__name__)

# HTML fragments used as default argument values by the assemble_*_by_book
# functions in this module.
# Empty marker <div> carrying the "end-of-chapter" CSS class.
END_OF_CHAPTER_HTML: str = '<div class="end-of-chapter"></div>'
# Format string with one `{}` placeholder rendered inside a centered <h2>.
BOOK_NAME_FMT_STR: str = "<h2 style='text-align: center;'>{}</h2>"


def assemble_content_by_lang_then_book(
usfm_books: Sequence[USFMBook],
Expand Down Expand Up @@ -163,10 +166,10 @@ def assemble_usfm_by_book(
tw_book: Optional[TWBook],
usfm_book2: Optional[USFMBook],
bc_book: Optional[BCBook],
end_of_chapter_html: str = settings.END_OF_CHAPTER_HTML,
end_of_chapter_html: str = END_OF_CHAPTER_HTML,
hr: str = "<hr/>",
close_direction_html: str = "</div>",
fmt_str: str = settings.BOOK_NAME_FMT_STR,
fmt_str: str = BOOK_NAME_FMT_STR,
) -> str:
content = []
content.append(usfm_language_direction_html(usfm_book))
Expand Down Expand Up @@ -207,7 +210,7 @@ def assemble_tn_by_book(
tw_book: Optional[TWBook],
usfm_book2: Optional[USFMBook],
bc_book: Optional[BCBook],
end_of_chapter_html: str = settings.END_OF_CHAPTER_HTML,
end_of_chapter_html: str = END_OF_CHAPTER_HTML,
close_direction_html: str = "</div>",
) -> str:
content = []
Expand All @@ -232,7 +235,7 @@ def assemble_tq_by_book(
tw_book: Optional[TWBook],
usfm_book2: Optional[USFMBook],
bc_book: Optional[BCBook],
end_of_chapter_html: str = settings.END_OF_CHAPTER_HTML,
end_of_chapter_html: str = END_OF_CHAPTER_HTML,
close_direction_html: str = "</div>",
) -> str:
content = []
Expand All @@ -256,7 +259,7 @@ def assemble_tw_by_book(
tw_book: Optional[TWBook],
usfm_book2: Optional[USFMBook],
bc_book: Optional[BCBook],
end_of_chapter_html: str = settings.END_OF_CHAPTER_HTML,
end_of_chapter_html: str = END_OF_CHAPTER_HTML,
close_direction_html: str = "</div>",
) -> str:
content = []
Expand Down
Loading

0 comments on commit b1b233b

Please sign in to comment.