Skip to content

Commit

Permalink
Handle NULL bytes and control characters in source USFM
Browse files Browse the repository at this point in the history
Our test team noted that building a doc for English->Lingala (ln) failed with an error.

This happened because the USFM for Lingala had control characters in
the text, in particular ^B (STX in ASCII - start of text). Since there
could be other source languages that have control characters, I have
augmented STET to strip them out of the rendered HTML before it is converted to docx.
  • Loading branch information
linearcombination committed Oct 16, 2024
1 parent 673cfe7 commit 7617cd3
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
5 changes: 4 additions & 1 deletion backend/document/stet/stet.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,11 +354,14 @@ def generate_docx_document(
# template = filepath.read()
env = jinja2.Environment(autoescape=True).from_string(template)
full_html = env.render(data=word_entries)
# Remove any NULL bytes and all control characters
cleaned_full_html = re.sub(r"[\x00-\x1F]+", "", full_html)
# logger.debug("full_html: %s", full_html)
# filepath_ = f"{working_dir}/{lang0_code}_{lang1_code}_stet.html"
filepath_ = f"{working_dir}/{document_request_key_}.html"
with open(filepath_, "w", encoding="utf-8") as outfile2:
outfile2.write(full_html)
outfile2.write(cleaned_full_html)

html_to_docx = HtmlToDocx()
# docx_filepath = f"{Path(filepath_).stem}.docx"
html_to_docx.parse_html_file(filepath_, f"{output_dir}/{Path(docx_filepath_).stem}")
Expand Down
15 changes: 15 additions & 0 deletions tests/e2e/test_api_stet_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,18 @@ def test_en_abu_stet_docx() -> None:
},
)
check_result(response, suffix="docx")


@pytest.mark.stet
@pytest.mark.docx
def test_en_ln_stet_docx() -> None:
    """Post an en/ln STET docx request and hand the response to check_result."""
    with TestClient(app=app, base_url=settings.api_test_url()) as client:
        # Language pair under test: English (lang0) and Lingala (lang1).
        request_payload = {
            "lang0_code": "en",
            "lang1_code": "ln",
            "email_address": settings.TO_EMAIL_ADDRESS,
        }
        response = client.post("/stet/documents_stet_docx", json=request_payload)
        check_result(response, suffix="docx")

0 comments on commit 7617cd3

Please sign in to comment.