Merge pull request #221 from transifex/TX-13385_docx_ampersand_fix

Tx 13385 docx ampersand fix
transifex · Jul 20, 2021 · 232c4c4 · 232c4c4
2 parents 39ab6f1 + 751ebf8
commit 232c4c4
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 28 deletions.
diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py
@@ -1,22 +1,20 @@
+import io
 import itertools
 import os
+import shutil
 import tempfile
 import uuid
-import six
-import io
-import re
-import shutil
+from zipfile import ZIP_DEFLATED, ZipFile
 
+import six
 from bs4 import BeautifulSoup
-from zipfile import ZipFile, ZIP_DEFLATED
-
-from openformats.strings import OpenString
 from openformats.handlers import Handler
+from openformats.strings import OpenString
 
 
 class DocxFile(object):
     """
-    A class used to wrap and expose the internals of a .docx file 
+    A class used to wrap and expose the internals of a .docx file
 
     A docx file is a zipped file that when unzipped,
     generates a similar file/folder structure:
@@ -77,7 +75,8 @@ class DocxFile(object):
     ```
     <Relationships>
         ...
-        <Relationship Id="rId6" Target="https://www.transifex.com/" TargetMode="External"/>
+        <Relationship Id="rId6" Target="https://www.transifex.com/"
+                      TargetMode="External"/>
         ...
     </Relationships>
     ```
@@ -103,11 +102,12 @@ def __init__(self, content):
         with io.open(base_rels_path, 'r') as f:
             base_rels = f.read()
 
-        document_relative_path = next(
-            relationship for relationship in BeautifulSoup(base_rels, 'xml').find_all(
-                attrs={'Target': True}
-            ) if relationship.attrs.get('Type').endswith('/officeDocument')
-        ).attrs['Target']
+        document_relative_path = next((
+            relationship
+            for relationship in (BeautifulSoup(base_rels, 'xml').
+                                 find_all(attrs={'Target': True}))
+            if relationship.attrs.get('Type').endswith('/officeDocument')
+        )).attrs['Target']
 
         self.__document_path = '{}/{}'.format(
             self.__tmp_folder, document_relative_path
@@ -197,7 +197,7 @@ def parse(self, content, **kwargs):
         """
         We will segment the text by paragraph `<w:p>` as this
         is defined in the docx structure.
-        
+
         For all the text `<w:t>` inside a paragraph,
         we use tag separators `<tx>`, in order to denote
         text style changes (normal->bold, bold->italic, 10px->14px etc)
@@ -216,7 +216,7 @@ def parse(self, content, **kwargs):
         order = itertools.count()
         for paragraph in soup.find_all('w:p'):
             paragraph_text = []
-            text_elements =  paragraph.find_all('w:t')
+            text_elements = paragraph.find_all('w:t')
             if not text_elements:
                 continue
 
@@ -308,7 +308,7 @@ def compile(self, template, stringset, **kwargs):
         rels_soup = BeautifulSoup(docx.get_document_rels(), 'xml')
 
         for paragraph in soup.find_all('w:p'):
-            text_elements =  paragraph.find_all('w:t')
+            text_elements = paragraph.find_all('w:t')
             if not text_elements:
                 continue
 
@@ -321,17 +321,18 @@ def compile(self, template, stringset, **kwargs):
                 continue
 
             translation = stringset[txid].string
+            translation = self._escape_xml(translation)
 
             translation_soup = BeautifulSoup(
-                u'<wrapper>{}</wrapper>'.format(translation), 'xml'
+                u'<wrapper>{}</wrapper>'.format(translation), 'xml',
             ).find_all(text=True)
 
             leading_spaces = 0
 
             for index, text_element in enumerate(text_elements):
                 text = six.text_type(text_element.text)
                 # detect text elements that contain no text
-                # and remove leading whitespace from the next string 
+                # and remove leading whitespace from the next string
                 if not text.strip():
                     leading_spaces = len(text) - len(text.strip())
                     continue
@@ -354,17 +355,16 @@ def compile(self, template, stringset, **kwargs):
                         translation = translation[leading_spaces:]
                     leading_spaces = 0
 
-
                 # the text parts of the translation are more that the
-                # text parts of the document, so we will compress the 
+                # text parts of the document, so we will compress the
                 # remaining translation parts into one string
-                if index == len(text_elements) - 1 and len(translation_soup) > 0:
+                if (index == len(text_elements) - 1 and
+                        len(translation_soup) > 0):
                     translation = "".join(
                         [translation] +
                         [six.text_type(t) for t in translation_soup]
                     )
 
-
                 if hyperlink_url:
                     # attempt to find a parent containing `href` attribute
                     # in order to extract the potential modified url.
@@ -376,10 +376,24 @@ def compile(self, template, stringset, **kwargs):
                     )
                 text_element.clear()
                 text_element.insert(0, translation)
-        
+
         docx.set_document(six.text_type(soup))
         docx.set_document_rels(six.text_type(rels_soup))
 
         result = docx.compress()
         docx.delete()
-        return result
+        return result
+
+    @staticmethod
+    def _escape_xml(translation):
+        """ Escape '&' and '<' in `translation` so that it can be parsed as
+            XML, keeping the `<tx>`, `<tx ...>` and `</tx>` separators. Only
+            real `tx` tags are kept (e.g. `<txt>` is escaped) and no
+            placeholder tokens are used, so no input text can collide.
+        """
+        import re  # local import: `re` is not imported at module level
+        # '&' goes first so that the '&' of the '&lt;' entities produced
+        # below is not escaped a second time; then every '<' that does not
+        # start a `tx` opening tag or the exact `</tx>` closing tag.
+        escaped = translation.replace(u"&", u"&amp;")
+        return re.sub(u"<(?!tx[\\s/>]|/tx>)", u"&lt;", escaped)
diff --git a/openformats/tests/formats/docx/files/with_ampersand.docx b/openformats/tests/formats/docx/files/with_ampersand.docx
diff --git a/openformats/tests/formats/docx/files/with_lt.docx b/openformats/tests/formats/docx/files/with_lt.docx
diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py
@@ -13,7 +13,7 @@ def test_broken_file(self):
         with open(path, 'rb') as f:
             content = f.read()
 
-        docx = DocxFile(content)
+        DocxFile(content)  # Make sure no errors happen during init
 
         handler = DocxHandler()
         template, stringset = handler.parse(content)
@@ -47,7 +47,6 @@ def test_broken_file(self):
             u'Φου βαρ βαζ'
         )
 
-
     def test_docx_file(self):
         path = '{}/hello_world.docx'.format(self.TESTFILE_BASE)
         with open(path, 'rb') as f:
@@ -515,7 +514,7 @@ def test_two_text_elements_file(self):
         docx = DocxFile(content)
 
         expected_strings = [
-            
+
             u'<tx>Hello</tx><tx> world</tx>',
             u'<tx>Goodbye </tx><tx>world</tx>',
             u'<tx>This is a </tx><tx href="https://google.com/">link</tx>',
@@ -587,3 +586,90 @@ def test_two_text_elements_file(self):
 
         for url in [u'https://transifex.com/']:
             self.assertTrue(url in docx.get_document_rels())
+
+    def test_ampersand(self):
+        # A translation that contains a raw '&' must survive being
+        # compiled into the document and parsed back out of it.
+        path = '{}/with_ampersand.docx'.format(self.TESTFILE_BASE)
+        with open(path, 'rb') as source_file:
+            source_content = source_file.read()
+        handler = DocxHandler()
+
+        # 1. The source file yields a single string containing the '&'
+        template, extracted = handler.parse(source_content)
+        self.assertEqual(len(extracted), 1)
+        source_string = extracted[0]
+        self.assertEqual(source_string.order, 0)
+        self.assertEqual(source_string.string, u'This is an & ampersand')
+        self.assertEqual(source_string.string, source_string.key)
+
+        # 2. Compile the template against an upper-cased translation
+        translation = u'THIS IS AN & AMPERSAND'
+        compiled_content = handler.compile(
+            template,
+            [OpenString(source_string.key, translation, order=0)],
+        )
+
+        # 3. The compiled document holds the translation and none of
+        #    the source text
+        docx = DocxFile(compiled_content)
+        for absent in ("This is an", "ampersand"):
+            self.assertFalse(absent in docx.get_document())
+        for present in ("THIS IS AN", "AMPERSAND"):
+            self.assertTrue(present in docx.get_document())
+
+        # 4. Re-parsing the compiled file gives the translation back,
+        #    both as the string and as its key
+        _, reparsed = handler.parse(compiled_content)
+        self.assertEqual(len(reparsed), 1)
+        reparsed_string = reparsed[0]
+        self.assertEqual(reparsed_string.order, 0)
+        self.assertEqual(reparsed_string.string, translation)
+        self.assertEqual(reparsed_string.string, reparsed_string.key)
+
+    def test_escape_xml(self):
+        # (raw translation, expected escaped form); `tx` tags stay as-is
+        cases = [("ab", "ab"), ("a<b", "a&lt;b"), ("a<tx>b", "a<tx>b"),
+                 ("a<tx>b<c</tx>", "a<tx>b&lt;c</tx>")]
+        for raw, expected in cases:
+            self.assertEqual(DocxHandler._escape_xml(raw), expected)
+
+    def test_lt(self):
+        # A translation that contains a raw '<' must survive being
+        # compiled into the document and parsed back out of it.
+        path = '{}/with_lt.docx'.format(self.TESTFILE_BASE)
+        with open(path, 'rb') as source_file:
+            source_content = source_file.read()
+        handler = DocxHandler()
+
+        # 1. The source file yields a single string containing the '<'
+        template, extracted = handler.parse(source_content)
+        self.assertEqual(len(extracted), 1)
+        source_string = extracted[0]
+        self.assertEqual(source_string.order, 0)
+        self.assertEqual(source_string.string, u'This is a < lessthan')
+        self.assertEqual(source_string.string, source_string.key)
+
+        # 2. Compile the template against an upper-cased translation
+        translation = u'THIS IS A < LESSTHAN'
+        compiled_content = handler.compile(
+            template,
+            [OpenString(source_string.key, translation, order=0)],
+        )
+
+        # 3. The compiled document holds the translation and none of
+        #    the source text
+        docx = DocxFile(compiled_content)
+        for absent in ("This is a", "lessthan"):
+            self.assertFalse(absent in docx.get_document())
+        for present in ("THIS IS A", "LESSTHAN"):
+            self.assertTrue(present in docx.get_document())
+
+        # 4. Re-parsing the compiled file gives the translation back,
+        #    both as the string and as its key
+        _, reparsed = handler.parse(compiled_content)
+        self.assertEqual(len(reparsed), 1)
+        reparsed_string = reparsed[0]
+        self.assertEqual(reparsed_string.order, 0)
+        self.assertEqual(reparsed_string.string, translation)
+        self.assertEqual(reparsed_string.string, reparsed_string.key)