From 58f4c500b1a2bfda0b7536c8a70ae4ed4d7c858f Mon Sep 17 00:00:00 2001
From: Konstantinos Bairaktaris <kbairak@transifex.com>
Date: Wed, 14 Jul 2021 09:40:59 +0300
Subject: [PATCH 1/4] Apply flake8 suggestions

Doing this first so that the diffs of the follow-up fix commits are simpler.
---
 openformats/formats/docx.py                 | 36 ++++++++++-----------
 openformats/tests/formats/docx/test_docx.py |  5 ++-
 2 files changed, 20 insertions(+), 21 deletions(-)
diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py
index 758b25ce..5f4c70f0 100644
--- a/openformats/formats/docx.py
+++ b/openformats/formats/docx.py
@@ -4,7 +4,6 @@
 import uuid
 import six
 import io
-import re
 import shutil
 
 from bs4 import BeautifulSoup
@@ -16,7 +15,7 @@
 
 class DocxFile(object):
     """
-    A class used to wrap and expose the internals of a .docx file 
+    A class used to wrap and expose the internals of a .docx file
 
     A docx file is a zipped file that when unzipped,
     generates a similar file/folder structure:
@@ -77,7 +76,8 @@ class DocxFile(object):
     ```
     <Relationships>
         ...
-        <Relationship Id="rId6" Target="https://www.transifex.com/" TargetMode="External"/>
+        <Relationship Id="rId6" Target="https://www.transifex.com/"
+                      TargetMode="External"/>
         ...
     </Relationships>
     ```
@@ -103,11 +103,12 @@ def __init__(self, content):
         with io.open(base_rels_path, 'r') as f:
             base_rels = f.read()
 
-        document_relative_path = next(
-            relationship for relationship in BeautifulSoup(base_rels, 'xml').find_all(
-                attrs={'Target': True}
-            ) if relationship.attrs.get('Type').endswith('/officeDocument')
-        ).attrs['Target']
+        document_relative_path = next((
+            relationship
+            for relationship in (BeautifulSoup(base_rels, 'xml').
+                                 find_all(attrs={'Target': True}))
+            if relationship.attrs.get('Type').endswith('/officeDocument')
+        )).attrs['Target']
 
         self.__document_path = '{}/{}'.format(
             self.__tmp_folder, document_relative_path
@@ -197,7 +198,7 @@ def parse(self, content, **kwargs):
         """
         We will segment the text by paragraph `<w:p>` as this
         is defined in the docx structure.
-        
+
         For all the text `<w:t>` inside a paragraph,
         we use tag separators `<tx>`, in order to denote
         text style changes (normal->bold, bold->italic, 10px->14px etc)
@@ -216,7 +217,7 @@ def parse(self, content, **kwargs):
         order = itertools.count()
         for paragraph in soup.find_all('w:p'):
             paragraph_text = []
-            text_elements =  paragraph.find_all('w:t')
+            text_elements = paragraph.find_all('w:t')
             if not text_elements:
                 continue
 
@@ -308,7 +309,7 @@ def compile(self, template, stringset, **kwargs):
         rels_soup = BeautifulSoup(docx.get_document_rels(), 'xml')
 
         for paragraph in soup.find_all('w:p'):
-            text_elements =  paragraph.find_all('w:t')
+            text_elements = paragraph.find_all('w:t')
             if not text_elements:
                 continue
 
@@ -331,7 +332,7 @@ def compile(self, template, stringset, **kwargs):
             for index, text_element in enumerate(text_elements):
                 text = six.text_type(text_element.text)
                 # detect text elements that contain no text
-                # and remove leading whitespace from the next string 
+                # and remove leading whitespace from the next string
                 if not text.strip():
                     leading_spaces = len(text) - len(text.strip())
                     continue
@@ -354,17 +355,16 @@ def compile(self, template, stringset, **kwargs):
                         translation = translation[leading_spaces:]
                     leading_spaces = 0
 
-                
                 # the text parts of the translation are more that the
-                # text parts of the document, so we will compress the 
+                # text parts of the document, so we will compress the
                 # remaining translation parts into one string
-                if index == len(text_elements) - 1 and len(translation_soup) > 0:
+                if (index == len(text_elements) - 1 and
+                        len(translation_soup) > 0):
                     translation = "".join(
                         [translation] +
                         [six.text_type(t) for t in translation_soup]
                     )
 
-               
                 if hyperlink_url:
                     # attempt to find a parent containing `href` attribute
                     # in order to extract the potential modified url.
@@ -376,10 +376,10 @@ def compile(self, template, stringset, **kwargs):
                     )
                 text_element.clear()
                 text_element.insert(0, translation)
-        
+
         docx.set_document(six.text_type(soup))
         docx.set_document_rels(six.text_type(rels_soup))
 
         result = docx.compress()
         docx.delete()
-        return result
\ No newline at end of file
+        return result
diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py
index f52a656a..2bd6a829 100644
--- a/openformats/tests/formats/docx/test_docx.py
+++ b/openformats/tests/formats/docx/test_docx.py
@@ -13,7 +13,7 @@ def test_broken_file(self):
         with open(path, 'rb') as f:
             content = f.read()
 
-        docx = DocxFile(content)
+        DocxFile(content)  # Make sure no errors happen during init
 
         handler = DocxHandler()
         template, stringset = handler.parse(content)
@@ -47,7 +47,6 @@ def test_broken_file(self):
             u'Φου βαρ βαζ'
         )
 
-
     def test_docx_file(self):
         path = '{}/hello_world.docx'.format(self.TESTFILE_BASE)
         with open(path, 'rb') as f:
@@ -515,7 +514,7 @@ def test_two_text_elements_file(self):
         docx = DocxFile(content)
 
         expected_strings = [
-            
+
             u'<tx>Hello</tx><tx> world</tx>',
             u'<tx>Goodbye </tx><tx>world</tx>',
             u'<tx>This is a </tx><tx href="https://google.com/">link</tx>',

From 7de2b749725415ff4f5373ee2b163e7a7e6a3352 Mon Sep 17 00:00:00 2001
From: Konstantinos Bairaktaris <kbairak@transifex.com>
Date: Wed, 14 Jul 2021 10:10:22 +0300
Subject: [PATCH 2/4] Replace user's '&' with '&amp;' in compiled docx

The issue is that BeautifulSoup does a "best effort" when converting its
input to XML data. So, when a plain '&' is in the string, the
information will be lost:

    >>> BeautifulSoup('<w>a & b</w>', 'xml').w.string
    <<< u'a  b'

This commit applies `.replace('&', '&amp;')` to the translation before
it is passed to the constructor of the soup object used for
translations, which fixes this issue.
---
 openformats/formats/docx.py                   |   3 +-
 .../formats/docx/files/with_ampersand.docx    | Bin 0 -> 4257 bytes
 openformats/tests/formats/docx/test_docx.py   |  40 ++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 openformats/tests/formats/docx/files/with_ampersand.docx

diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py
index 5f4c70f0..ae114011 100644
--- a/openformats/formats/docx.py
+++ b/openformats/formats/docx.py
@@ -324,7 +324,8 @@ def compile(self, template, stringset, **kwargs):
             translation = stringset[txid].string
 
             translation_soup = BeautifulSoup(
-                u'<wrapper>{}</wrapper>'.format(translation), 'xml'
+                u'<wrapper>{}</wrapper>'.
+                format(translation.replace("&", "&amp;")), 'xml'
             ).find_all(text=True)
 
             leading_spaces = 0
diff --git a/openformats/tests/formats/docx/files/with_ampersand.docx b/openformats/tests/formats/docx/files/with_ampersand.docx
new file mode 100644
index 0000000000000000000000000000000000000000..fdbee23d331284a51e341a624edff227a5c8607b
GIT binary patch
literal 4257
zcmaJ^2{@GP8lJIa?E7TPu2IA&6@J^;GPXfv-(oOgW-J+NHL?^cM1{s0vhPIVM<O!W
znaU*k3r#3{wlmZD&q<y0_sn(8HFI6>^L_7oKlgLL_hW8M&%g%&fj|JYv!5*i`+}AF
z+a&<$8>py2`z}v0HKvEL#V+pf>n=?C%3bRru1V@(Bv+ZD(<uuBM5Ys{@;x$)rP(|_
zr@JL|c}Zl*kGPv-c0VBj)>(pr`9lYrp*|t6H5A2MhZUP`G?PY*(jAYMkfZb5c;L6L
zb(v6#m6iM-W=eP>EW6duOV$=8`%gBk-@AM~CtGQB%%{iUF<t$Z;r8iMVc;6+=)<1Q
zg^rcADhm8&p&o;-`Ju)Bn?A~EY#~nGJK?!1Y{aFr%4c4}Dnh^5sn|9&rpXpK7~u9$
z%9x10mTuoc=jnw0yiStvO18N%6KHk-C`F>4SAz}!F#ktHxTz~VFz)677;K=TJ0<|B
z5aQ>XX6c0JfbvDI+sH@0ZAQcTj~nF0Lf|G>ydz3KBdx|OP^?!kV;ZaRy$P80H-TXb
zHp?W~xpvV+{`w3_o-SEHmyd##s{W@@^_DaEi&p;jqb!B9AaH+t=KZ<E(F4vV<?SlK
zQ%eTDo+gOFF;#+_F9|GNpKK$GK>J7)FAt>Jo?X*7{u3_p_B`{=fO%OoyRL>>-fbDL
z`~q@%$%!d8T}GbN+IxBo+4UTWbv9x_FJdfNa3ff2wWR!Su3fUe8Has_%;vWs8-8C}
zC<g2?J~mwd{>|d#M8O=6kSX?>);E8)6y8FVsvb>Ik#LJRm=dHvB%kg9-_DwGS#;>s
ztS}v{@tShyxFeWLXmdmC8iqJpSTAgMu$sRMBf+0ph4;qWH!hud3!36Qv*<5#I!GGP
zw3nz1S1=G_YwkU;mbVkmI2V7j>hr4wO-+2$r3c&_6U;WSY|##_z3PnvwDM%yo430H
z0f5)PEf3fJ^0;BKwCWI&`_UaxuIR<c_?~H_`+C~k0g2!8vy$RR_`(t#z2J^A4M;^0
zxiz*9MO;OBd99Q`@%$7xZUDhL3leTCn;A0LXBf??K^D9Eg5Z+3bZqiY3IzlJ5o6mo
z{Rq`l=J;E5u&w*b*Mjwn@3IGVbhnSSs|t)Wvki@oh;SHn)FyXxC=bcQ!F)tebCiz{
zlla{AQr_Kz_JW~LvfuI_6b`x1KR`Msjy2fp?a1+F9R(tE!nr8zug<~oZc-8Xnul7*
z1_N6vH@y5$kz^y)?~3*5H{H|E*A~oLV&p+(n26pgDK8Yvcb`aW`DD{S9C|-Gkac@)
z;@rqoXs@?R)f4};kAVhTQ5q*Zn6FC^hh_4mRBP5)GEd8S06a)4%EvpTAwCe_<r8Rg
z7@Jso8<2)J!V6&rrJ^nRTeL-}XyY&e9*Y0g2x?zk^h2V9XrTXFFOp4JsL02xXGy*w
z7g}>A7ozpBa;pF(LPPs0=I63u;q>wLveO-ZZAUIDDZ039-E<mY^1E?G9*)noXc#Kw
z#>zGh%akR+Tf5CH9hu96@m<2a40U?wu2A!1YsaO>a<AWh_08E*KC+Ww1tE!N7tBR%
zArA1-AAW63w_VYDCGB)vTa};Fg&5&dqK1%A@)RC@DAq8w6~?pl@OBSSQ$cI1D8i3D
z?d!z5*X^aN{ZF?mwY2l@{q^x+7NIBxPGouq#?j!|RD}N7MLjI<Z{eV6!*@9L15sdQ
zig^uXd$MRFphmef@w_H?X?}&rcfw8oiiehVGSVSank7}_^ww(rbY%r&tScj1-?WsN
z7|LRu6JY@Hy6%iWHsO<At!ED`sJYmFNu*7F!FHOyQD)o~*{(2D8~6F~mVzRz&frxN
z*VR>3zB2vH@Ik}Qoeo+2R4xlE@Nis-r49$Huw<31DM$e^EaA;;X5VpCyOZ9p+H5On
zGD*(Ki&Vr{@BzQ_%|KPFhOn2m=esgHKABQaFMlX1iF{Tnaba1gf}&U3i<VXSYQnv6
z=Y`DI1-st17)lt{bEQ%iB50hV6tvi@6H!WGe4G2ujOe@idZJab4()Mb`i>8Kw@Z1#
zp}>_h2%}PL6Ie6B-j7^3k9@g%OEF7K>MN$6@jb-!s<s~7s=Gm%6(XzXe{p<Vw6aga
zvCgD<`^Cj}QTNe|0vn8V$zaxMVfJ}tZ@U<BY%+u}jz8$)<|NwCZLk?FioU)6=Fv;$
zC*CC}Y`()RAv{B1ssN-r+hr<Q9>Egs+^5-A%Gt)8i?=Hu7j714ZpmG|h0F8}+8`#E
zSsVwJa%Rq6EkS4b2|qW!UTyx00eiv{7|pB-1G#!up*sN_#i9;@l!9}yv&Kh8Mg~G2
ztbyFh@~2;M>N6Zg6(#i+u)5DHi6XeoJ{mljL705Z^xI;h5hu*5!u%Vpj~IV<sc`;>
zIDtW-zQ{mY|MXkEHk*NRMZT@Pw(V@mb>2c>+t)&0X2j{(67a0oknO6UUz}y@CBM@O
zPXg`+yQFNGsA~oiLx>8Zj4wq8YzsYL9GuP4x^U~~?;nV-<E|2qJTIfSqqE^*J$}Ym
zf!Idq4;$-Oef47TZo_Fg!_W5@{%R^X$x_8+l)qAKBnTub_Dg3UIqQ=zYyWAOHB0Q`
zV@MO=Fx)6Gq$SM|>XT}n_D7D<`;roYu6w;0Ey^>+_Oo=bw>o-P(oZ$dETgc#FH{1T
z4P!l`)?#^!*%F&7$Cf2NwvJg2%9Z83BgPhNB3~gN$5-N0h@Q2gw!OA6^n}0-M5RY?
zLag=ExOrpLdNfzr<2p8$03@GWtwYi&VZcZCoZ%(<`U|;o5ddahXyW^K8%KoZ_QC+p
zN%@Q(*?0c**ZCIAJ0lKZ)+IHyIe9Hx;lAYqZrIHgkC&YgiMbK#5Yl=ij_epSo|xt{
zwoWJ5b<jkp{01hv?^*GurV0y%>#D=9;ZbOWZGSq3#Qn$MVsPMRjbT^U^V;M1AU5{C
zsKlrp^^7AhSGCd@sRFI56Ox<8byq<s3@+{c-p~One<Y9Y()8}~!ZmQqXyJuuXjnsp
z(MAVDmF;foWA)UHS)NND+b%y0XdRzCk|$d3Yk1@1rgx4!SkpjD1v)#(u9q9&y?tLj
ztZkEAu0d{0erex!es+#S7oj!NXW3`!PIoV6wsfq{NW-`<{pnj%f!b<>!_yNxmvqjh
zx(x4x++7emHXm!(Xl>4b8?7Om$=ycX-BYw*wP=kT$aQOwg>0W>3U<UR$6{K(<{NBe
z)rBi)dvD?#uh@j89m|T``e3nQF&JC(wENg|@a$dYHRO$?R?4Oejl}?{`|ayg>C0gL
z9gBJX!(vYiI>^fHyzh5!RBjG4>x6Q(EVK0C=IfZE^9uQSrlZ4h{sKC94cpd?Cyz_7
ztUof7ljzCKAyWvXbdp_#7J*;mz^SaGs@eRO;wTqkHLI_Y>j8VRjAwF$${=O$x$P=!
zaDBPH>iS?5;gQHu(zA&)qiXaZ@MT0?YSl9)CZ)(u7`x_((~?m7?dDn^3qH<XyhP|7
z=JB45LM5T>C%affe?De!i9*&#ClB?FsuBc@=<ez)ChCHPMV@Bjw9|7OK#34D&W03y
z32QRF7v;-w_Vg?~QE$@ug9ZbAcml-mhG&~e<C(_w;Gqlt7b=VN(i<|=6PP-;5v4Y@
z-AR352e!iJ6|OUfa}N@-BTI`tmL2XcX$W#57=^=|HHq2rYFZ(yj5KiDyGfu!REe;r
z{~fshy;}m2K|v_A*LRUXCTXBMp={LqwPS#6$ULAW72J?#T=am+)k6&|emzRB`y#@n
z&|oT8xN9-_7GkRtw=~Kh=+7ryKPh1CRyWcVf*Kj0+xcQ`9U~Dub>w`$r~LeYCZ(uZ
zy!N4yjPUv5A_Mjp#beDAK>Ad6nZ%B;UZ8E(W*(&1sn?^LPmtYtF)?%u=x9}5wk^Rj
zY6W=#mg-Y|CL2kAXbl>s*}mD6su#fc_Tj<h23LV}%@4}140l9u#L%buu1BP&>eUn)
z6QDXtR>tE50#5z1Wt`E#y~#F{c9Ka<7`f+KVJi;2XIcDsZf(x9*aq?BlJyoV4KG48
zQ{opY)i6}d{@JJVQdc<WQi(|QBQ92<SY)6RO;(?z_R-j=zrGXS<B<BON*g3I7uC(v
z47%V!f%{w&tC<|y)tFH(G3U>qi$SUfE~Yn`rMw&~F+DfcqTDS0k(4T()5L9gnlZFw
zVOZ|C93-^9dWx4a+A5yFMLAiJrKqixEyyY)SMMLs7!kx?pV(G?V74nL4Bj*d3^LSR
z5l3EPJ?YRO(d01m<xWCC${XJ;$rAAwug(R4t4!R}IOav_cCTOyy2wE&ExXv>if+XR
zt6na=W#~14Wmo_7X4Uj)N7VzloDbYH!Is*`y%f};G3+a<Xq{T##S?lwu`dbLqLp2+
zV3iH}mWO7?gz&69H>L7c*YSlFGE6JBM^inxu7bsHdb9~czF~3TiCr-mZuaGX%jZLr
zSR@B|rs%pb<!C%yLoJRo6TCfFH1X}`^aZ_hl!Bdh*3+ljiDByWAmGW%$vTT~Zb6@;
z*4yN@s6}7`v5s<gTTu(*{QaH-((wU)43zr^E80N$bL<;0&5eIf+;915{QnU=)K~q_
z6!vTSeh*8V3w}g3Rm=XH{`ZvdYxaK2NAtNqLX6?}uJ_j`?e_pbA47qP*8dpBU(@%C
zFU=_Y2rs6;(|_28Uz7KX3r(MY#4(n?lYdg{U-S1j+s}KCr<U^j-uDaAuj%_~LF>pL
a(aiRL`_tT*@c?ZRGxeiRtrCKR_U~Vs3EMRQ

literal 0
HcmV?d00001

diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py
index 2bd6a829..aa195fdc 100644
--- a/openformats/tests/formats/docx/test_docx.py
+++ b/openformats/tests/formats/docx/test_docx.py
@@ -586,3 +586,43 @@ def test_two_text_elements_file(self):
 
         for url in [u'https://transifex.com/']:
             self.assertTrue(url in docx.get_document_rels())
+
+    def test_ampersand(self):
+        # Parse original file
+        path = '{}/with_ampersand.docx'.format(self.TESTFILE_BASE)
+        with open(path, 'rb') as f:
+            content = f.read()
+        handler = DocxHandler()
+        template, stringset = handler.parse(content)
+
+        # Make sure extracted data is OK
+        self.assertEqual(len(stringset), 1)
+        openstring = stringset[0]
+        self.assertEqual(openstring.order, 0)
+        self.assertEqual(openstring.string,
+                         u'This is an & ampersand')
+        self.assertEqual(openstring.string, openstring.key)
+
+        # Compile with altered translation
+        translation = U'THIS IS AN & AMPERSAND'
+        stringset = [
+            OpenString(openstring.key, translation, order=0)
+        ]
+        content = handler.compile(template, stringset)
+
+        # Make sure compiled file has altered data
+        docx = DocxFile(content)
+        self.assertFalse("This is an" in docx.get_document())
+        self.assertFalse("ampersand" in docx.get_document())
+        self.assertTrue("THIS IS AN" in docx.get_document())
+        self.assertTrue("AMPERSAND" in docx.get_document())
+
+        # Parse compiled file
+        template, stringset = handler.parse(content)
+
+        # Make sure compiled file has the correct translation
+        self.assertEqual(len(stringset), 1)
+        openstring = stringset[0]
+        self.assertEqual(openstring.order, 0)
+        self.assertEqual(openstring.string, translation)
+        self.assertEqual(openstring.string, openstring.key)

From c5f2d4c8fbc3ec1c521144d84468c684ed8e9553 Mon Sep 17 00:00:00 2001
From: Konstantinos Bairaktaris <kbairak@transifex.com>
Date: Wed, 14 Jul 2021 14:11:22 +0300
Subject: [PATCH 3/4] Also escape '<' to '&lt;'

---
 openformats/formats/docx.py                   |  15 +++++-
 .../tests/formats/docx/files/with_lt.docx     | Bin 0 -> 4756 bytes
 openformats/tests/formats/docx/test_docx.py   |  49 ++++++++++++++++++
 3 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 openformats/tests/formats/docx/files/with_lt.docx

diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py
index ae114011..eb1bf9b3 100644
--- a/openformats/formats/docx.py
+++ b/openformats/formats/docx.py
@@ -1,4 +1,5 @@
 import itertools
+import re
 import os
 import tempfile
 import uuid
@@ -170,6 +171,11 @@ class DocxHandler(Handler):
     EXTRACTS_RAW = False
     name = "DOCX"
 
+    LT_PATTERN = r'\<(?!tx|/tx)'
+    LT_PATTERN = (LT_PATTERN.decode("utf8")
+                  if isinstance(LT_PATTERN, six.binary_type)
+                  else LT_PATTERN)
+
     @classmethod
     def get_hyperlink_url(cls, element, document_rels):
         run_parent = element.find_parent('w:r').parent
@@ -323,9 +329,14 @@ def compile(self, template, stringset, **kwargs):
 
             translation = stringset[txid].string
 
+            # Do escaping: BeautifulSoup doesn't like unescaped '&' or '<' in
+            # its input. We do expect some '<tx>' and `</tx>` so we only escape
+            # '<' if it's not part of '<tx>'
+            translation = translation.replace(u"&", u"&amp;")
+            translation = re.sub(self.LT_PATTERN, u"&lt;", translation)
+
             translation_soup = BeautifulSoup(
-                u'<wrapper>{}</wrapper>'.
-                format(translation.replace("&", "&amp;")), 'xml'
+                u'<wrapper>{}</wrapper>'.format(translation), 'xml',
             ).find_all(text=True)
 
             leading_spaces = 0
diff --git a/openformats/tests/formats/docx/files/with_lt.docx b/openformats/tests/formats/docx/files/with_lt.docx
new file mode 100644
index 0000000000000000000000000000000000000000..4fcfb994bc71918a1e8fb1f874cbb5c29cb52375
GIT binary patch
literal 4756
zcmaJ_2UHX55~d|IL8|m3h=72Apn#XE80kVNO6VprfP~Ny0i{S06zRQpgwTY5gd$CP
zktPu7Rip?Ah!kJqd-sXHcbz@EXU|Dycjx<N=Kto`)g&cjBBG$6AUe9U4kkKb)cAV~
zC%B!n_(j4MJ<zB@DoF>N_Ds8*;OUMrRE52&3+_)+^Y#hvUI!*=spUeZCUvQ5bd4Ih
zrS_MW__HTCkLn-hMRSn@3Tu2r{H#7TrBA!=Rn!J)S6N$Fm*l8!E#EEf)M=*zMSS&%
z?@=>fO5_i48g$-*bV|~&R(cGQ&eYhfOB}8G*bf*Z(S8+`BKdA<5y$VJr>dHWvC{}b
z-Wp({WIJ<A^L$l(;oGKf_oMSQrKaKi-!^#HJrT<hJ&pR|nisE90W{rGv!Th16Q=qw
zTI130TdH~8oF8`+vu$Q^+l?KW3NQK+Z-hprjd>I6YLZhdIPIz~<HPSKAtKWKM}X+@
z|A3*Ox=ttuXK^UX34ZYb((XCf4CN=m<ULiw>mB5G=`xLFxtL-P>}jHy=cjQIH~)bO
z!!J`MKIu5GQJlhFa_Ye_?2#z-2GMx1My?RmwRv9{Gs~r0@9x!oMzl-CYUqS&Tno{$
z?3c%C#`6^oe9jaBj8jRZN<{`r#AK#ahFIE-(rVa}<b{WQFsjKOQAvT`7-Q)ne$nLM
zr&$`UujvR~E^MSv5tHK(9X4k{6C+rugj68@S-edwXn<K_(rL(|hB1{V*g=0h%Kw{n
z3vbb<tNCr{p&g!?s)g;BT*EBo#>}MA0!tu@L1i(AKDO{;)4tj;p0p9_=;uv(^FjJb
zA@B9nn&MkFJ^dcnp-y3|2aKD|eM03}a|6oP1G)D3XB<@E_3qSt%HNU<McBYS!qm2_
z16<xBK1#{SZO=`o^gMcxoU`Mf=sPK4*$ZT>MefT6FK)o2Fv~<q#XZM!jPYwAcArpm
zDZu5U;^8*WD-?kuQ)XpS7%>qM?vE7FoutUh!GXX~N<@!+lLXz94eyZlS&d}XaK3rg
zotp}0nG1x!1ezk$O$9J;af-?MzzUn*9UBDV%i8$8Lg)7&K?id-vv6r0H8Mz|#>X1o
zjfE~36}}hOZf9O*r*a~`Wpo&ZOfH+&^?gC2a^NQ&?5182N8{4e);Q8A$NHY~)WGl%
zC#^=)tB5vQ=>bu7Cn^M)nabr#((o@W3zXlf(a&nvm#P+8f!qiI7Ys8+^1Fs|X`ykj
z>zkzv1&UouWMTH3&wO5JMXuUOQ2&AKbA;m+V-aup4W%>tifK2p-YA;RP@fx+PLSs|
zG|`p<wUvr^;FK=kT5@FQ75j{Xa_IRl31@jXy1g|#t}(MH%>&7vm2B}yl)4(pe?PC(
z<JC2wa_qC~{QO!@nQl#2l;(iUEuZNRIS62{KiYXDjt86jk6<&N1RLt=?1Do6z`M7M
zs!4(=U}hqZVhm?M9x+f6D*KngUic)2*csKX9qR4L@3$Tn#G|M)sBm<>h;EGD_O-Z(
zFp(Gyt)Lj{C`VSkv18hj`$_r&P!#L--jp8@+(8^F%6nffkI6P?w0D{&P7ZpJ{{{-2
z>Ijw&>upL{oh6RCYA~!f7QndA=OwqlxpMhPlNL6$@pTe)))Sfa2!UfE#5FTQ<P(Sg
z>P`Il&vnj;|J5Dk1QY*vH^i5`D-v$+LWu3(oiRciA0S=8UMgSrWUe7yM6SK6gUAk%
zWD2J73T21TLr>BWh|p})!lCzuq_~B}L6})TIr6!WsJd^49%dky-a)8pP@pJOy}nHc
zY)XlC^KIo|B&$%hZ}rd>+r4ydB;#q+_&0N~sCRRUf#4W#TFytmgIho*((^b&lEdOo
zpBR~-hEk;2{Q!=_UIljch-qJYu0XZudKHGx`BCk}iWij*^1P5VF<YZExW>Yro|eNB
zCFRWcg(a5Ml)M1-Uh<hU?gTjY#U<Xf;)7-UBh7@i_XCcTKuj2R*yAMV>h?s)xF0&7
z!`jp5NBHxC{D{z2%Dv>zgf;Fq-(Us1C6|Z`YdFnblt?roZe~JwY%e+rnVBhR(yy_y
z7aM|dSty^zXVy%=>};P~w0BbHgQ_s|R@`Nilek{Y!c)9!lVKy0P#$qz_xv!Mjb2XR
zh?ZZ1nvNca**N9!rD1r7@F$i5geA!Kev)+9K{oY-#DZ52h;js37IM9_yo9+ga$fV9
zmXE8Kb`OVC`r5J4W5HIsDy6&?bLhMD+-_FECrlZ|WW}I(8z)dRH=^6Rx4@Kl)gs1A
z6ULT&Q+1nqUTv=F-SY4!UCv2qxyG*h)}!?3hdkp7D;T!^a|(GTlwXIgn3ugxS&pT)
zKW#R56Fj(oa3{rf7xm!LqjTCk7joz^O=e`|Odf8qne`?k7V?HG6I1M1fhzLGOz#@f
zh=`aCNO*PVB7kRAUE(O+U!`;a$N}W69qeRZ0$!KvYnQ2qkM=x{mYNrEEdB;cP^xBS
z^@Pg8@XHH}9&M+pJ8#RG<*3u?i_h<OxTU9)%=-d{ugxjbLJIx&pJuQZy|TO{o@J$Y
z$G)dP+vg>aLqDqRP^=^Fp%ZrNym;ha!ipYu_FJmo^K<XCPT71(bH(0TyQ}Za_?kzR
zC*X*kQf34LT!7jIpL*%S#9HY+nZbs*RffO@h5$6QV?0;Z6=YL-7S&}G*Y~#<<k?8-
z{olX2mFcXd)Z=|>>*b?LG#k^Yqr;l#!#(fxZIL(oj7$m_-WgY7+;?+~&>zTN=D2(Y
zWO>yJ9&49@kU@$Xbh)Uen%G!QGwh?$phEv7qyrW;s8AzCd-cI61H?*!w2|4wS<|~u
zy)9zI$AdI_r(!lMY}27~Zf?(hlaY|n?i*N4BtDs?$^TT}r~gi9XBQ7UxHF-ydkoA_
z<3CpM;6oi!3Rh+b#MK~QjX7y#m)ta&V|HdVbuD_pLJ8L*x&Y1L8Q3M+e7)K8dWrL(
zs}<TBMLra$9uYCA9(PVCzdIowF@J3TL5}}v8L!hc*b-!9-QK#NC*GA&CK7hu@gRWh
zq7)cX7-(Z_{ggSLu|T&Pa@G|vBDN!Mrja5dCd@uJep`Oc9h+wZ&<Y>Tl^L}$kbIdF
z5ulJ!t^wDR2+4@SO@0c@3uyx~D+H<m9_kf_ead@N&NkOwOu`mv;3AtqU10Tgq9W|H
zocpWANOLRV%T;g1cibmLlMKIdXj*BPbNXdGgU<){=wf%l=c@9JRZI(NnHH5P8%H&p
zdHHWc%H>}JtRf{H6iI77i|cL|xdfc%wWLjq7<&B37gAlt?Ssf}2<#pNvZ}&h7)jc@
z+>%Tw2e>7>)XrY(l?=8nuANBN^ZE^vNZ_ri1zlX>=NX9bnpZ;+y4Jjap*1jQ`0^DR
z!@<%Cz6m`gvc5Hp$bu+1-Ktn6CKW2Az4c714k!OWFumz38c6H>ojpi3TP#*Hwrin%
zEMMT(i|o2H=QAfj2tYKz7_saZR6iZ1cV{Rm?$JPXDAWqaa44RT8L7#o?l(A{c2kVi
z%Z{R${!jrE<riEsLF&IRQzm|vga_HO@nQRRLJ93Xgl~Q>n&$CV8Asr8I#-Z+!-4H~
ziV1lw>cU24J$!kL`+DExTo<Sd36+okx)M6lav#nwh9G%7blgiThoo!!vMns<aLUu)
zyQaZ4pX1(YuQKrmnXYZw%Jn+$<L|>StG2j&w|vJkI5*=sN1e%Lksfo=X(KUR*>Vm<
zUh@`?MCR=mOumP8zq|d;<VI33Cy)C%lPsR|qd|bPiV53Ogsd)YMr|hGvwDg0Pg%|I
zcUIp+*}E86-L?B6Pb+m*bebjTYPYGn+*c~dpJe753bat}D0h)!)^p2XCHy7o%Nvdp
zs@K9vuPxl$=<e<Abvl|UT9zo?H0anb3MWxN$jZ<cnA+{zdjWPJyXv>|Bu1XUYiGrN
zEUq%Ioe#|z=Pi)LiUSLe&|IBPKt(VZ9Zd7)qGSsugP<I>pvCO<A_!BfzmugJ?WGx+
zOZCOi-wKvYae-??$SUXt0UzRH7i`r8zeT>u>GzxqZ$0C=xx%G;ei=wI7cl6HhUuxr
z?7d<0<jy-gDAk{spX4~HBH>W_z;|ePHR$8|D($4sLoxfcO7&=)!j4zfGTo0W9!ip*
zdTm$MY`L;Zb;X|X>WbHfiMBw}RLzr;4xa}XMl16khe*jk*r6mqo3p3U#DNz&hNORj
z_P;BjGu*|+#vbvbWx#d2befe<)owGtoBWuoR-2j292Y{G3{iYDA-kqAwqv^{9bM9<
zp07N6&-0!<cgdBs43XaWG5w+*C%r_HtgN?fP^9864kxHzs%)2@6<qn6H>u~_y~_sA
z_&R5=$*2|wjwh>v;bK@u;50zsSx99xX)tErn!HoHb3WWSC*zeneBYAvtMy3DR;Uou
z^%#l2o@FjY=aljHTT*)qddjXoi3Iod4=<K^rCKcqn2UdWLa*v<>e!*sOLp_D@1YTP
z-%S8Sk!YLx&3~iTbh1DtOE>MPL8}p~rG*wXxLbg%YZi`qop^EMrslzWeS=VwkS$=;
zQVR!wLR5U!YMr)TubF3O2m9%VI!D*U$Wh{P%*3DnEUXBE(d0TlJ@IPF!ob4;?rcVA
zzID-fBqiu>>>Y))n?zTaDpLro`L!|B2;GORs@n$hluryCE6kmHsmq)gdnR_XyKbph
z)=AN<@T#%z_Q0~A={;zq&mu_8jCme@96uTU5f<GibH>ZBZkz>1?^l;Iv<(nil!B^2
zbEI|3eA0?DV>eG55!bQ5C=r&q>O*T2%ngg2y514%s>VI?;DeK%P9zp1o=MeIgcd*$
zcS#%Bl=P~s?NQu@tY&LsVQd$YxEsPpEDp6wYsQJym<2<ZjzZk2k0Nt@5)r0VQ6!OU
z&NEMi?b1se84^=GQWYa5ycVDBJP@NTKMI{{4qX|_q_5_kk>G>yoR)?Q%cZx0GtT)}
zTNBUTxnl9na;W3$$fEMsJjYDz7@8RKy-iGJd80onH_Tr468Xla*&9j>b89wt_e^sx
z@bNJddV{xKbw0UnxZx`<h`|mW<W6)peNi0KFXv7Vl0$NT>iD61FR${}o8zg$0SNRE
zlQ0qe+yXc``x3SQezg-D0>3++oX7|j^3%@ZU-dtO^6%~^XAHvF_S0JMz3#v6{~6+b
z_dcoQ32F7yB*{*A|C~yGf51uYOA!1&jS>&ipXC4VPdX_&e#L<m_<xl1@9rnNJ3*QJ
zw2u`3asR1Pes@0EzX%J!Piw$)`6G&7mV@8@PuA_P-_B0+_sdW8yx-kV=GCt(+@<}$
WrLL<9z+*y0M1?=R@iY_vVE$j?Xy$nU

literal 0
HcmV?d00001

diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py
index aa195fdc..f680ad79 100644
--- a/openformats/tests/formats/docx/test_docx.py
+++ b/openformats/tests/formats/docx/test_docx.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import re
 import unittest
 
 from openformats.formats.docx import DocxFile, DocxHandler
@@ -626,3 +627,51 @@ def test_ampersand(self):
         self.assertEqual(openstring.order, 0)
         self.assertEqual(openstring.string, translation)
         self.assertEqual(openstring.string, openstring.key)
+
+    def test_lt_pattern(self):
+        for original, escaped in (("ab", "ab"),
+                                  ("a<b", "a&lt;b"),
+                                  ("a<tx>b", "a<tx>b"),
+                                  ("a<tx>b<c</tx>", "a<tx>b&lt;c</tx>")):
+            self.assertEqual(re.sub(DocxHandler.LT_PATTERN, '&lt;', original),
+                             escaped)
+
+    def test_lt(self):
+        # Parse original file
+        path = '{}/with_lt.docx'.format(self.TESTFILE_BASE)
+        with open(path, 'rb') as f:
+            content = f.read()
+        handler = DocxHandler()
+        template, stringset = handler.parse(content)
+
+        # Make sure extracted data is OK
+        self.assertEqual(len(stringset), 1)
+        openstring = stringset[0]
+        self.assertEqual(openstring.order, 0)
+        self.assertEqual(openstring.string,
+                         u'This is a < lessthan')
+        self.assertEqual(openstring.string, openstring.key)
+
+        # Compile with altered translation
+        translation = U'THIS IS AN < LESSTHAN'
+        stringset = [
+            OpenString(openstring.key, translation, order=0)
+        ]
+        content = handler.compile(template, stringset)
+
+        # Make sure compiled file has altered data
+        docx = DocxFile(content)
+        self.assertFalse("This is a" in docx.get_document())
+        self.assertFalse("lessthan" in docx.get_document())
+        self.assertTrue("THIS IS A" in docx.get_document())
+        self.assertTrue("LESSTHAN" in docx.get_document())
+
+        # Parse compiled file
+        template, stringset = handler.parse(content)
+
+        # Make sure compiled file has the correct translation
+        self.assertEqual(len(stringset), 1)
+        openstring = stringset[0]
+        self.assertEqual(openstring.order, 0)
+        self.assertEqual(openstring.string, translation)
+        self.assertEqual(openstring.string, openstring.key)

From 751ebf81be1ade937423d9c1d6c775c5a215cbea Mon Sep 17 00:00:00 2001
From: Konstantinos Bairaktaris <kbairak@transifex.com>
Date: Thu, 15 Jul 2021 09:48:23 +0300
Subject: [PATCH 4/4] Avoid using regex for escaping

---
 openformats/formats/docx.py                 | 38 +++++++++++----------
 openformats/tests/formats/docx/test_docx.py |  8 ++---
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py
index eb1bf9b3..5cf42905 100644
--- a/openformats/formats/docx.py
+++ b/openformats/formats/docx.py
@@ -1,17 +1,15 @@
+import io
 import itertools
-import re
 import os
+import shutil
 import tempfile
 import uuid
-import six
-import io
-import shutil
+from zipfile import ZIP_DEFLATED, ZipFile
 
+import six
 from bs4 import BeautifulSoup
-from zipfile import ZipFile, ZIP_DEFLATED
-
-from openformats.strings import OpenString
 from openformats.handlers import Handler
+from openformats.strings import OpenString
 
 
 class DocxFile(object):
@@ -171,11 +169,6 @@ class DocxHandler(Handler):
     EXTRACTS_RAW = False
     name = "DOCX"
 
-    LT_PATTERN = r'\<(?!tx|/tx)'
-    LT_PATTERN = (LT_PATTERN.decode("utf8")
-                  if isinstance(LT_PATTERN, six.binary_type)
-                  else LT_PATTERN)
-
     @classmethod
     def get_hyperlink_url(cls, element, document_rels):
         run_parent = element.find_parent('w:r').parent
@@ -328,12 +321,7 @@ def compile(self, template, stringset, **kwargs):
                 continue
 
             translation = stringset[txid].string
-
-            # Do escaping: BeautifulSoup doesn't like unescaped '&' or '<' in
-            # its input. We do expect some '<tx>' and `</tx>` so we only escape
-            # '<' if it's not part of '<tx>'
-            translation = translation.replace(u"&", u"&amp;")
-            translation = re.sub(self.LT_PATTERN, u"&lt;", translation)
+            translation = self._escape_xml(translation)
 
             translation_soup = BeautifulSoup(
                 u'<wrapper>{}</wrapper>'.format(translation), 'xml',
@@ -395,3 +383,17 @@ def compile(self, template, stringset, **kwargs):
         result = docx.compress()
         docx.delete()
         return result
+
+    @staticmethod
+    def _escape_xml(translation):
+        """ Do escaping: BeautifulSoup doesn't like unescaped '&' or '<' in its
+            input. We do expect some '<tx>' and '</tx>' so we first replace
+            these tags with placeholders, do the escaping and restore them.
+        """
+        return translation.\
+            replace(u"<tx", u"__TX__OPENING__TAG__").\
+            replace(u"</tx>", u"__TX__CLOSING__TAG__").\
+            replace(u"&", "&amp;").\
+            replace(u"<", "&lt;").\
+            replace(u"__TX__OPENING__TAG__", u"<tx").\
+            replace(u"__TX__CLOSING__TAG__", u"</tx>")
diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py
index f680ad79..b27bdd61 100644
--- a/openformats/tests/formats/docx/test_docx.py
+++ b/openformats/tests/formats/docx/test_docx.py
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-import re
 import unittest
 
 from openformats.formats.docx import DocxFile, DocxHandler
@@ -628,13 +627,12 @@ def test_ampersand(self):
         self.assertEqual(openstring.string, translation)
         self.assertEqual(openstring.string, openstring.key)
 
-    def test_lt_pattern(self):
+    def test_escape_xml(self):
         for original, escaped in (("ab", "ab"),
                                   ("a<b", "a&lt;b"),
                                   ("a<tx>b", "a<tx>b"),
                                   ("a<tx>b<c</tx>", "a<tx>b&lt;c</tx>")):
-            self.assertEqual(re.sub(DocxHandler.LT_PATTERN, '&lt;', original),
-                             escaped)
+            self.assertEqual(DocxHandler._escape_xml(original), escaped)
 
     def test_lt(self):
         # Parse original file
@@ -653,7 +651,7 @@ def test_lt(self):
         self.assertEqual(openstring.string, openstring.key)
 
         # Compile with altered translation
-        translation = U'THIS IS AN < LESSTHAN'
+        translation = U'THIS IS A < LESSTHAN'
         stringset = [
             OpenString(openstring.key, translation, order=0)
         ]