Merge pull request #209 from transifex/docx_hyperlinks_after_text_check

Check hyperlinks after detecting text content
transifex · Mar 12, 2021 · fcfe304 · fcfe304
2 parents 8d5b032 + 3222a22
commit fcfe304
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 13 deletions.
diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py
@@ -329,31 +329,31 @@ def compile(self, template, stringset, **kwargs):
             leading_spaces = 0
 
             for index, text_element in enumerate(text_elements):
-                hyperlink_url = self.get_hyperlink_url(
-                    text_element, rels_soup
-                )
-                # the text parts of the translation are less that the
-                # text parts of the document, so we will just remove
-                # any excessing part from the document
-                if len(translation_soup) == 0:
-                    if hyperlink_url:
-                        text_element.find_parent('w:hyperlink').decompose()
-                    else:
-                        text_element.find_parent('w:r').decompose()
-                    continue
-
                 text = six.text_type(text_element.text)
                 # detect text elements that contain no text
                 # and remove leading whitespace from the next string 
                 if not text.strip():
                     leading_spaces = len(text) - len(text.strip())
                     continue
                 else:
+                    hyperlink_url = self.get_hyperlink_url(
+                        text_element, rels_soup
+                    )
+                    # the text parts of the translation are fewer than the
+                    # text parts of the document, so we will just remove
+                    # any excess part from the document
+                    if len(translation_soup) == 0:
+                        if hyperlink_url:
+                            text_element.find_parent('w:hyperlink').decompose()
+                        else:
+                            text_element.decompose()
+                        continue
                     translation_part = translation_soup.pop(0)
                     translation = six.text_type(translation_part)
                     if not translation[:leading_spaces].strip():
                         translation = translation[leading_spaces:]
                     leading_spaces = 0
+
 
                 # the text parts of the translation are more than the
                 # text parts of the document, so we will compress the 

diff --git a/openformats/tests/formats/docx/files/missing_wr_parent.docx b/openformats/tests/formats/docx/files/missing_wr_parent.docx
diff --git a/openformats/tests/formats/docx/files/special_cases_out.docx b/openformats/tests/formats/docx/files/special_cases_out.docx
diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py
@@ -8,6 +8,46 @@
 class DocxTestCase(unittest.TestCase):
     TESTFILE_BASE = 'openformats/tests/formats/docx/files'
 
+    def test_broken_file(self):  # round trip on the missing_wr_parent.docx fixture: parse, translate, compile, re-parse
+        path = '{}/missing_wr_parent.docx'.format(self.TESTFILE_BASE)
+        with open(path, 'rb') as f:  # binary mode: the raw bytes are handed to the handler below
+            content = f.read()
+
+        docx = DocxFile(content)  # NOTE(review): 'docx' is not read again in this test; presumably a load check - confirm
+
+        handler = DocxHandler()
+        template, stringset = handler.parse(content)  # first pass: template plus source strings
+
+        self.assertEqual(len(stringset), 1)  # the fixture is expected to yield exactly one string
+
+        openstring = stringset[0]
+        self.assertEqual(openstring.order, 0)
+        self.assertEqual(
+            openstring.string,
+            u'Foo bar baz'
+        )
+        self.assertEqual(openstring.string, openstring.key)  # the key is expected to equal the source text
+
+        translation = u'Φου βαρ βαζ'
+        stringset = [
+            OpenString(openstring.key, translation, order=1)  # NOTE(review): order=1 here, parsed string had order 0 - confirm intent
+        ]
+
+        content = handler.compile(template, stringset)  # build the translated document from the template
+
+        handler = DocxHandler()  # fresh handler for re-parsing the compiled output
+        template, stringset = handler.parse(content)
+
+        self.assertEqual(len(stringset), 1)  # still exactly one string after compiling
+
+        openstring = stringset[0]
+        self.assertEqual(openstring.order, 0)
+        self.assertEqual(
+            openstring.string,
+            u'Φου βαρ βαζ'  # the translation must come back unchanged from the compiled file
+        )
+
+
     def test_docx_file(self):
         path = '{}/hello_world.docx'.format(self.TESTFILE_BASE)
         with open(path, 'rb') as f: