Skip to content

Commit

Permalink
Merge pull request #221 from transifex/TX-13385_docx_ampersand_fix
Browse files Browse the repository at this point in the history
Tx 13385 docx ampersand fix
  • Loading branch information
kbairak authored Jul 20, 2021
2 parents 39ab6f1 + 751ebf8 commit 232c4c4
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 28 deletions.
64 changes: 39 additions & 25 deletions openformats/formats/docx.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,20 @@
import io
import itertools
import os
import shutil
import tempfile
import uuid
import six
import io
import re
import shutil
from zipfile import ZIP_DEFLATED, ZipFile

import six
from bs4 import BeautifulSoup
from zipfile import ZipFile, ZIP_DEFLATED

from openformats.strings import OpenString
from openformats.handlers import Handler
from openformats.strings import OpenString


class DocxFile(object):
"""
A class used to wrap and expose the internals of a .docx file
A class used to wrap and expose the internals of a .docx file
A docx file is a zipped file that when unzipped,
generates a similar file/folder structure:
Expand Down Expand Up @@ -77,7 +75,8 @@ class DocxFile(object):
```
<Relationships>
...
<Relationship Id="rId6" Target="https://www.transifex.com/" TargetMode="External"/>
<Relationship Id="rId6" Target="https://www.transifex.com/"
TargetMode="External"/>
...
</Relationships>
```
Expand All @@ -103,11 +102,12 @@ def __init__(self, content):
with io.open(base_rels_path, 'r') as f:
base_rels = f.read()

document_relative_path = next(
relationship for relationship in BeautifulSoup(base_rels, 'xml').find_all(
attrs={'Target': True}
) if relationship.attrs.get('Type').endswith('/officeDocument')
).attrs['Target']
document_relative_path = next((
relationship
for relationship in (BeautifulSoup(base_rels, 'xml').
find_all(attrs={'Target': True}))
if relationship.attrs.get('Type').endswith('/officeDocument')
)).attrs['Target']

self.__document_path = '{}/{}'.format(
self.__tmp_folder, document_relative_path
Expand Down Expand Up @@ -197,7 +197,7 @@ def parse(self, content, **kwargs):
"""
We will segment the text by paragraph `<w:p>` as this
is defined in the docx structure.
For all the text `<w:t>` inside a paragraph,
we use tag separators `<tx>`, in order to denote
text style changes (normal->bold, bold->italic, 10px->14px etc)
Expand All @@ -216,7 +216,7 @@ def parse(self, content, **kwargs):
order = itertools.count()
for paragraph in soup.find_all('w:p'):
paragraph_text = []
text_elements = paragraph.find_all('w:t')
text_elements = paragraph.find_all('w:t')
if not text_elements:
continue

Expand Down Expand Up @@ -308,7 +308,7 @@ def compile(self, template, stringset, **kwargs):
rels_soup = BeautifulSoup(docx.get_document_rels(), 'xml')

for paragraph in soup.find_all('w:p'):
text_elements = paragraph.find_all('w:t')
text_elements = paragraph.find_all('w:t')
if not text_elements:
continue

Expand All @@ -321,17 +321,18 @@ def compile(self, template, stringset, **kwargs):
continue

translation = stringset[txid].string
translation = self._escape_xml(translation)

translation_soup = BeautifulSoup(
u'<wrapper>{}</wrapper>'.format(translation), 'xml'
u'<wrapper>{}</wrapper>'.format(translation), 'xml',
).find_all(text=True)

leading_spaces = 0

for index, text_element in enumerate(text_elements):
text = six.text_type(text_element.text)
# detect text elements that contain no text
# and remove leading whitespace from the next string
# and remove leading whitespace from the next string
if not text.strip():
leading_spaces = len(text) - len(text.strip())
continue
Expand All @@ -354,17 +355,16 @@ def compile(self, template, stringset, **kwargs):
translation = translation[leading_spaces:]
leading_spaces = 0


# the text parts of the translation are more than the
# text parts of the document, so we will compress the
# text parts of the document, so we will compress the
# remaining translation parts into one string
if index == len(text_elements) - 1 and len(translation_soup) > 0:
if (index == len(text_elements) - 1 and
len(translation_soup) > 0):
translation = "".join(
[translation] +
[six.text_type(t) for t in translation_soup]
)


if hyperlink_url:
# attempt to find a parent containing `href` attribute
# in order to extract the potential modified url.
Expand All @@ -376,10 +376,24 @@ def compile(self, template, stringset, **kwargs):
)
text_element.clear()
text_element.insert(0, translation)

docx.set_document(six.text_type(soup))
docx.set_document_rels(six.text_type(rels_soup))

result = docx.compress()
docx.delete()
return result
return result

@staticmethod
def _escape_xml(translation):
""" Do escaping: BeautifulSoup doesn't like unescaped '&' or '<' in its
input. We do expect some '<tx>' and `</tx>` so we first replace
these tags to placeholders, do the escaping and restore them.
"""
return translation.\
replace(u"<tx", u"__TX__OPENING__TAG__").\
replace(u"</tx>", u"__TX__CLOSING__TAG__").\
replace(u"&", "&amp;").\
replace(u"<", "&lt;").\
replace(u"__TX__OPENING__TAG__", u"<tx").\
replace(u"__TX__CLOSING__TAG__", u"</tx>")
Binary file not shown.
Binary file added openformats/tests/formats/docx/files/with_lt.docx
Binary file not shown.
92 changes: 89 additions & 3 deletions openformats/tests/formats/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def test_broken_file(self):
with open(path, 'rb') as f:
content = f.read()

docx = DocxFile(content)
DocxFile(content) # Make sure no errors happen during init

handler = DocxHandler()
template, stringset = handler.parse(content)
Expand Down Expand Up @@ -47,7 +47,6 @@ def test_broken_file(self):
u'Φου βαρ βαζ'
)


def test_docx_file(self):
path = '{}/hello_world.docx'.format(self.TESTFILE_BASE)
with open(path, 'rb') as f:
Expand Down Expand Up @@ -515,7 +514,7 @@ def test_two_text_elements_file(self):
docx = DocxFile(content)

expected_strings = [

u'<tx>Hello</tx><tx> world</tx>',
u'<tx>Goodbye </tx><tx>world</tx>',
u'<tx>This is a </tx><tx href="https://google.com/">link</tx>',
Expand Down Expand Up @@ -587,3 +586,90 @@ def test_two_text_elements_file(self):

for url in [u'https://transifex.com/']:
self.assertTrue(url in docx.get_document_rels())

def test_ampersand(self):
    """Round-trip a document whose only string contains a literal '&'."""
    handler = DocxHandler()
    with open('{}/with_ampersand.docx'.format(self.TESTFILE_BASE),
              'rb') as source_file:
        content = source_file.read()

    # First pass: extract from the original file and verify the string
    template, extracted = handler.parse(content)
    self.assertEqual(len(extracted), 1)
    source_string = extracted[0]
    self.assertEqual(source_string.order, 0)
    self.assertEqual(source_string.string, u'This is an & ampersand')
    self.assertEqual(source_string.string, source_string.key)

    # Compile the template with an upper-cased translation
    translation = u'THIS IS AN & AMPERSAND'
    translated = [OpenString(source_string.key, translation, order=0)]
    content = handler.compile(template, translated)

    # The compiled document must carry the translation, not the source
    docx = DocxFile(content)
    for source_fragment in ("This is an", "ampersand"):
        self.assertFalse(source_fragment in docx.get_document())
    for translated_fragment in ("THIS IS AN", "AMPERSAND"):
        self.assertTrue(translated_fragment in docx.get_document())

    # Second pass: re-parse the compiled file and verify the translation
    template, extracted = handler.parse(content)
    self.assertEqual(len(extracted), 1)
    reparsed = extracted[0]
    self.assertEqual(reparsed.order, 0)
    self.assertEqual(reparsed.string, translation)
    self.assertEqual(reparsed.string, reparsed.key)

def test_escape_xml(self):
    """Check `_escape_xml` against a table of (input, expected) pairs."""
    cases = [
        ("ab", "ab"),
        ("a<b", "a&lt;b"),
        ("a<tx>b", "a<tx>b"),
        ("a<tx>b<c</tx>", "a<tx>b&lt;c</tx>"),
    ]
    for raw, expected in cases:
        actual = DocxHandler._escape_xml(raw)
        self.assertEqual(actual, expected)

def test_lt(self):
    """Round-trip a document whose only string contains a literal '<'."""
    handler = DocxHandler()
    with open('{}/with_lt.docx'.format(self.TESTFILE_BASE),
              'rb') as source_file:
        content = source_file.read()

    # First pass: extract from the original file and verify the string
    template, extracted = handler.parse(content)
    self.assertEqual(len(extracted), 1)
    source_string = extracted[0]
    self.assertEqual(source_string.order, 0)
    self.assertEqual(source_string.string, u'This is a < lessthan')
    self.assertEqual(source_string.string, source_string.key)

    # Compile the template with an upper-cased translation
    translation = u'THIS IS A < LESSTHAN'
    translated = [OpenString(source_string.key, translation, order=0)]
    content = handler.compile(template, translated)

    # The compiled document must carry the translation, not the source
    docx = DocxFile(content)
    for source_fragment in ("This is a", "lessthan"):
        self.assertFalse(source_fragment in docx.get_document())
    for translated_fragment in ("THIS IS A", "LESSTHAN"):
        self.assertTrue(translated_fragment in docx.get_document())

    # Second pass: re-parse the compiled file and verify the translation
    template, extracted = handler.parse(content)
    self.assertEqual(len(extracted), 1)
    reparsed = extracted[0]
    self.assertEqual(reparsed.order, 0)
    self.assertEqual(reparsed.string, translation)
    self.assertEqual(reparsed.string, reparsed.key)

0 comments on commit 232c4c4

Please sign in to comment.