From 58f4c500b1a2bfda0b7536c8a70ae4ed4d7c858f Mon Sep 17 00:00:00 2001 From: Konstantinos Bairaktaris Date: Wed, 14 Jul 2021 09:40:59 +0300 Subject: [PATCH 1/4] Apply flake8 suggestions Doing this first so that the fix commit is simpler --- openformats/formats/docx.py | 36 ++++++++++----------- openformats/tests/formats/docx/test_docx.py | 5 ++- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py index 758b25ce..5f4c70f0 100644 --- a/openformats/formats/docx.py +++ b/openformats/formats/docx.py @@ -4,7 +4,6 @@ import uuid import six import io -import re import shutil from bs4 import BeautifulSoup @@ -16,7 +15,7 @@ class DocxFile(object): """ - A class used to wrap and expose the internals of a .docx file + A class used to wrap and expose the internals of a .docx file A docx file is a zipped file that when unzipped, generates a similar file/folder structure: @@ -77,7 +76,8 @@ class DocxFile(object): ``` ... - + ... ``` @@ -103,11 +103,12 @@ def __init__(self, content): with io.open(base_rels_path, 'r') as f: base_rels = f.read() - document_relative_path = next( - relationship for relationship in BeautifulSoup(base_rels, 'xml').find_all( - attrs={'Target': True} - ) if relationship.attrs.get('Type').endswith('/officeDocument') - ).attrs['Target'] + document_relative_path = next(( + relationship + for relationship in (BeautifulSoup(base_rels, 'xml'). + find_all(attrs={'Target': True})) + if relationship.attrs.get('Type').endswith('/officeDocument') + )).attrs['Target'] self.__document_path = '{}/{}'.format( self.__tmp_folder, document_relative_path @@ -197,7 +198,7 @@ def parse(self, content, **kwargs): """ We will segment the text by paragraph `` as this is defined in the docx structure. 
- + For all the text `` inside a paragraph, we use tag separators ``, in order to denote text style changes (normal->bold, bold->italic, 10px->14px etc) @@ -216,7 +217,7 @@ def parse(self, content, **kwargs): order = itertools.count() for paragraph in soup.find_all('w:p'): paragraph_text = [] - text_elements = paragraph.find_all('w:t') + text_elements = paragraph.find_all('w:t') if not text_elements: continue @@ -308,7 +309,7 @@ def compile(self, template, stringset, **kwargs): rels_soup = BeautifulSoup(docx.get_document_rels(), 'xml') for paragraph in soup.find_all('w:p'): - text_elements = paragraph.find_all('w:t') + text_elements = paragraph.find_all('w:t') if not text_elements: continue @@ -331,7 +332,7 @@ def compile(self, template, stringset, **kwargs): for index, text_element in enumerate(text_elements): text = six.text_type(text_element.text) # detect text elements that contain no text - # and remove leading whitespace from the next string + # and remove leading whitespace from the next string if not text.strip(): leading_spaces = len(text) - len(text.strip()) continue @@ -354,17 +355,16 @@ def compile(self, template, stringset, **kwargs): translation = translation[leading_spaces:] leading_spaces = 0 - # the text parts of the translation are more that the - # text parts of the document, so we will compress the + # text parts of the document, so we will compress the # remaining translation parts into one string - if index == len(text_elements) - 1 and len(translation_soup) > 0: + if (index == len(text_elements) - 1 and + len(translation_soup) > 0): translation = "".join( [translation] + [six.text_type(t) for t in translation_soup] ) - if hyperlink_url: # attempt to find a parent containing `href` attribute # in order to extract the potential modified url. 
@@ -376,10 +376,10 @@ def compile(self, template, stringset, **kwargs): ) text_element.clear() text_element.insert(0, translation) - + docx.set_document(six.text_type(soup)) docx.set_document_rels(six.text_type(rels_soup)) result = docx.compress() docx.delete() - return result \ No newline at end of file + return result diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py index f52a656a..2bd6a829 100644 --- a/openformats/tests/formats/docx/test_docx.py +++ b/openformats/tests/formats/docx/test_docx.py @@ -13,7 +13,7 @@ def test_broken_file(self): with open(path, 'rb') as f: content = f.read() - docx = DocxFile(content) + DocxFile(content) # Make sure no errors happen during init handler = DocxHandler() template, stringset = handler.parse(content) @@ -47,7 +47,6 @@ def test_broken_file(self): u'Φου βαρ βαζ' ) - def test_docx_file(self): path = '{}/hello_world.docx'.format(self.TESTFILE_BASE) with open(path, 'rb') as f: @@ -515,7 +514,7 @@ def test_two_text_elements_file(self): docx = DocxFile(content) expected_strings = [ - + u'Hello world', u'Goodbye world', u'This is a link', From 7de2b749725415ff4f5373ee2b163e7a7e6a3352 Mon Sep 17 00:00:00 2001 From: Konstantinos Bairaktaris Date: Wed, 14 Jul 2021 10:10:22 +0300 Subject: [PATCH 2/4] Replace user's '&' with '&amp;' in compiled docx The issue is that BeautifulSoup does a "best effort" when converting its input to XML data. 
So, when a plain '&' is in the string, the information will be lost: >>> BeautifulSoup('a & b', 'xml').w.string <<< u'a b' This commit adds a `.replace('&', '&')` to the constructor of the soup object used for translations to fix this issue --- openformats/formats/docx.py | 3 +- .../formats/docx/files/with_ampersand.docx | Bin 0 -> 4257 bytes openformats/tests/formats/docx/test_docx.py | 40 ++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 openformats/tests/formats/docx/files/with_ampersand.docx diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py index 5f4c70f0..ae114011 100644 --- a/openformats/formats/docx.py +++ b/openformats/formats/docx.py @@ -324,7 +324,8 @@ def compile(self, template, stringset, **kwargs): translation = stringset[txid].string translation_soup = BeautifulSoup( - u'{}'.format(translation), 'xml' + u'{}'. + format(translation.replace("&", "&")), 'xml' ).find_all(text=True) leading_spaces = 0 diff --git a/openformats/tests/formats/docx/files/with_ampersand.docx b/openformats/tests/formats/docx/files/with_ampersand.docx new file mode 100644 index 0000000000000000000000000000000000000000..fdbee23d331284a51e341a624edff227a5c8607b GIT binary patch literal 4257 zcmaJ^2{@GP8lJIa?E7TPu2IA&6@J^;GPXfv-(oOgW-J+NHL?^cM1{s0vhPIVM^L_7oKlgLL_hW8M&%g%&fj|JYv!5*i`+}AF z+a&<$8>py2`z}v0HKvEL#V+pf>n=?C%3bRru1V@(Bv+ZD((pr`9lYrp*|t6H5A2MhZUP`G?PY*(jAYMkfZb5c;L6L zb(v6#m6iM-W=eP>EW6duOV$=8`%gBk-@AM~CtGQB%%{iUF4SAz}!F#ktHxTz~VFz)677;K=TJ0<|B z5aQ>XX6c0JfbvDI+sH@0ZAQcTj~nF0Lf|G>ydz3KBdx|OP^?!kV;ZaRy$P80H-TXb zHp?W~xpvV+{`w3_o-SEHmyd##s{W@@^_DaEi&p;jqb!B9AaH+t=KZJ7)FAt>Jo?X*7{u3_p_B`{=fO%OoyRL>>-fbDL z`~q@%$%!d8T}GbN+IxBo+4UTWbv9x_FJdfNa3ff2wWR!Su3fUe8Has_%;vWs8-8C} zC|d#M8O=6kSX?>);E8)6y8FVsvb>Ik#LJRm=dHvB%kg9-_DwGS#;>s ztS}v{@tShyxFeWLXmdmC8iqJpSTAgMu$sRMBf+0ph4;qWH!hud3!36Qv*<5#I!GGP zw3nz1S1=G_YwkU;mbVkmI2V7j>hr4wO-+2$r3c&_6U;WSY|##_z3PnvwDM%yo430H 
z0f5)PEf3fJ^0;BKwCWI&`_UaxuIRA7ozpBa;pF(LPPs0=I63u;q>wLveO-ZZAUIDDZ039-EzGh%akR+Tf5CH9hu96@m<2a40U?wu2A!1YsaO>a3zB2vH@Ik}Qoeo+2R4xlE@Nis-r49$Huw<31DM$e^EaA;;X5VpCyOZ9p+H5On zGD*(Ki&Vr{@BzQ_%|KPFhOn2m=esgHKABQaFMlX1iF{Tnaba1gf}&U3i8Kw@Z1# zp}>_h2%}PL6Ie6B-j7^3k9@g%OEF7K>MN$6@jb-!sEgs+^5-A%Gt)8i?=Hu7j714ZpmG|h0F8}+8`#E zSsVwJa%Rq6EkS4b2|qW!UTyx00eiv{7|pB-1G#!up*sN_#i9;@l!9}yv&Kh8Mg~G2 ztbyFh@~2;M>N6Zg6(#i+u)5DHi6XeoJ{mljL705Z^xI;h5hu*5!u%Vpj~IV zIDtW-zQ{mY|MXkEHk*NRMZT@Pw(V@mb>2c>+t)&0X2j{(67a0oknO6UUz}y@CBM@O zPXg`+yQFNGsA~oiLx>8Zj4wq8YzsYL9GuP4x^U~~?;nV-XT}n_D7D<`;roYu6w;0Ey^>+_Oo=bw>o-P(oZ$dETgc#FH{1T z4P!l`)?#^!*%F&7$Cf2NwvJg2%9Z83BgPhNB3~gN$5-N0h@Q2gw!OA6^n}0-M5RY? zLag=ExOrpLdNfzr<2p8$03@GWtwYi&VZcZCoZ%(<`U|;o5ddahXyW^K8%KoZ_QC+p zN%@Q(*?0c**ZCIAJ0lKZ)+IHyIe9Hx;lAYqZrIHgkC&YgiMbK#5Yl=ij_epSo|xt{ zwoWJ5b#D=9;ZbOWZGSq3#Qn$MVsPMRjbT^U^V;M1AU5{C zsKlrp^^7AhSGCd@sRFI56Ox<8byq!_yNxmvqjh zx(x4x++7emHXm!(Xl>4b8?7Om$=ycX-BYw*wP=kT$aQOwg>0W>3UC0gL z9gBJX!(vYiI>^fHyzh5!RBjG4>x6Q(EVK0C=IfZE^9uQSrlZ4h{sKC94cpd?Cyz_7 ztUof7ljzCKAyWvXbdp_#7J*;mz^SaGs@eRO;wTqkHLI_Y>j8VRjAwF$${=O$x$P=! 
zaDBPH>iS?5;gQHu(zA&)qiXaZ@MT0?YSl9)CZ)(u7`x_((~?m7?dDn^3qHC%affe?De!i9*&#ClB?FsuBc@=w`$r~LeYCZ(uZ zy!N4yjPUv5A_Mjp#beDAK>Ad6nZ%B;UZ8E(W*(&1sn?^LPmtYtF)?%u=x9}5wk^Rj zY6W=#mg-Y|CL2kAXbl>s*}mD6su#fc_TjeUn) z6QDXtR>tE50#5z1Wt`E#y~#F{c9Ka<7`f+KVJi;2XIcDsZf(x9*aq?BlJyoV4KG48 zQ{opY)i6}d{@JJVQdcqi$SUfE~Yn`rMw&~F+DfcqTDS0k(4T()5L9gnlZFw zVOZ|C93-^9dWx4a+A5yFMLAiJrKqixEyyY)SMMLs7!kx?pV(G?V74nL4Bj*d3^LSR z5l3EPJ?YRO(d01mif+XR zt6na=W#~14Wmo_7X4Uj)N7VzloDbYH!Is*`y%f};G3+aL7c*YSlFGE6JBM^inxu7bsHdb9~czF~3TiCr-mZuaGX%jZLr zSR@B|rs%pbpL a(aiRL`_tT*@c?ZRGxeiRtrCKR_U~Vs3EMRQ literal 0 HcmV?d00001 diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py index 2bd6a829..aa195fdc 100644 --- a/openformats/tests/formats/docx/test_docx.py +++ b/openformats/tests/formats/docx/test_docx.py @@ -586,3 +586,43 @@ def test_two_text_elements_file(self): for url in [u'https://transifex.com/']: self.assertTrue(url in docx.get_document_rels()) + + def test_ampersand(self): + # Parse original file + path = '{}/with_ampersand.docx'.format(self.TESTFILE_BASE) + with open(path, 'rb') as f: + content = f.read() + handler = DocxHandler() + template, stringset = handler.parse(content) + + # Make sure extracted data is OK + self.assertEqual(len(stringset), 1) + openstring = stringset[0] + self.assertEqual(openstring.order, 0) + self.assertEqual(openstring.string, + u'This is an & ampersand') + self.assertEqual(openstring.string, openstring.key) + + # Compile with altered translation + translation = U'THIS IS AN & AMPERSAND' + stringset = [ + OpenString(openstring.key, translation, order=0) + ] + content = handler.compile(template, stringset) + + # Make sure compiled file has altered data + docx = DocxFile(content) + self.assertFalse("This is an" in docx.get_document()) + self.assertFalse("ampersand" in docx.get_document()) + self.assertTrue("THIS IS AN" in docx.get_document()) + self.assertTrue("AMPERSAND" in docx.get_document()) + + # Parse compiled file + 
template, stringset = handler.parse(content) + + # Make sure compiled file has the correct translation + self.assertEqual(len(stringset), 1) + openstring = stringset[0] + self.assertEqual(openstring.order, 0) + self.assertEqual(openstring.string, translation) + self.assertEqual(openstring.string, openstring.key) From c5f2d4c8fbc3ec1c521144d84468c684ed8e9553 Mon Sep 17 00:00:00 2001 From: Konstantinos Bairaktaris Date: Wed, 14 Jul 2021 14:11:22 +0300 Subject: [PATCH 3/4] Also escape '<' to '&lt;' --- openformats/formats/docx.py | 15 +++++- .../tests/formats/docx/files/with_lt.docx | Bin 0 -> 4756 bytes openformats/tests/formats/docx/test_docx.py | 49 ++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 openformats/tests/formats/docx/files/with_lt.docx diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py index ae114011..eb1bf9b3 100644 --- a/openformats/formats/docx.py +++ b/openformats/formats/docx.py @@ -1,4 +1,5 @@ import itertools +import re import os import tempfile import uuid @@ -170,6 +171,11 @@ class DocxHandler(Handler): EXTRACTS_RAW = False name = "DOCX" + LT_PATTERN = r'\<(?!tx|/tx)' + LT_PATTERN = (LT_PATTERN.decode("utf8") + if isinstance(LT_PATTERN, six.binary_type) + else LT_PATTERN) + @classmethod def get_hyperlink_url(cls, element, document_rels): run_parent = element.find_parent('w:r').parent @@ -323,9 +329,14 @@ def compile(self, template, stringset, **kwargs): translation = stringset[txid].string + # Do escaping: BeautifulSoup doesn't like unescaped '&' or '<' in + # its input. We do expect some '' and `` so we only escape + # '<' if it's not part of '' + translation = translation.replace(u"&", u"&") + translation = re.sub(self.LT_PATTERN, u"<", translation) + translation_soup = BeautifulSoup( - u'{}'. 
- format(translation.replace("&", "&")), 'xml' + u'{}'.format(translation), 'xml', ).find_all(text=True) leading_spaces = 0 diff --git a/openformats/tests/formats/docx/files/with_lt.docx b/openformats/tests/formats/docx/files/with_lt.docx new file mode 100644 index 0000000000000000000000000000000000000000..4fcfb994bc71918a1e8fb1f874cbb5c29cb52375 GIT binary patch literal 4756 zcmaJ_2UHX55~d|IL8|m3h=72Apn#XE80kVNO6VprfP~Ny0i{S06zRQpgwTY5gd$CP zktPu7Rip?Ah!kJqd-sXHcbz@EXU|DycjxN_Ds8*;OUMrRE52&3+_)+^Y#hvUI!*=spUeZCUvQ5bd4Ih zrS_MW__HTCkLn-hMRSn@3Tu2r{H#7TrBA!=Rn!J)S6N$Fm*l8!E#EEf)M=*zMSS&% z?@=>fO5_i48g$-*bV|~&R(cGQ&eYhfOB}8G*bf*Z(S8+`BKdA<5y$VJr>dHWvC{}b z-Wp({WIJI6YLZhdIPIz~mB5G=`xLFxtL-P>}jHy=cjQIH~)bO z!!J`MKIu5GQJlhFa_Ye_?2#z-2GMx1My?RmwRv9{Gs~r0@9x!oMzl-CYUqS&Tno{$ z?3c%C#`6^oe9jaBj8jRZN<{`r#AK#ahFIE-(rVa}}MA0!tu@L1i(AKDO{;)4tj;p0p9_=;uv(^FjJb zA@B9nn&MkFJ^dcnp-y3|2aKD|eM03}a|6oP1G)D3XB<@E_3qSt%HNUMefT6FK)o2Fv~qM?vE7FoutUh!GXX~N<@!+lLXz94eyZlS&d}XaK3rg zotp}0nG1x!1ezk$O$9J;af-?MzzUn*9UBDV%i8$8Lg)7&K?id-vv6r0H8Mz|#>X1o zjfE~36}}hOZf9O*r*a~`Wpo&ZOfH+&^?gC2a^NQ&?5182N8{4e);Q8A$NHY~)WGl% zC#^=)tB5vQ=>bu7Cn^M)nabr#((o@W3zXlf(a&nvm#P+8f!qiI7Ys8+^1Fs|X`ykj z>zkzv1&UouWMTH3&wO5JMXuUOQ2&AKbA;m+V-aup4W%>tifK2p-YA;RP@fx+PLSs| zG|`p&7vm2B}yl)4(pe?PC( zh;EGD_O-Z( zFp(Gyt)Lj{C`VSkv18hj`$_r&P!#L--jp8@+(8^F%6nffkI6P?w0D{&P7ZpJ{{{-2 z>Ijw&>upL{oh6RCYA~!f7QndA=OwqlxpMhPlNL6$@pTe)))Sfa2!UfE#5FTQP`Il&vnj;|J5Dk1QY*vH^i5`D-v$+LWu3(oiRciA0S=8UMgSrWUe7yM6SK6gUAk% zWD2J73T21TLr>BWh|p})!lCzuq_~B}L6})TIr6!WsJd^49%dky-a)8pP@pJOy}nHc zY)XlC^KIo|B&$%hZ}rd>+r4ydB;#q+_&0N~sCRRUf#4W#TFytmgIho*((^b&lEdOo zpBR~-hEk;2{Q!=_UIljch-qJYu0XZudKHGx`BCk}iWij*^1P5VFYro|eNB zCFRWcg(a5Ml)M1-Uhh?s)xF0&7 z!`jp5NBHxC{D{z2%Dv>zgf;Fq-(Us1C6|Z`YdFnblt?roZe~JwY%e+rnVBhR(yy_y z7aM|dSty^zXVy%=>};P~w0BbHgQ_s|R@`Nilek{Y!c)9!lVKy0P#$qz_xv!Mjb2XR zh?ZZ1nvNca**N9!rD1r7@F$i5geA!Kev)+9K{oY-#DZ52h;js37IM9_yo9+ga$fV9 zmXE8Kb`OVC`r5J4W5HIsDy6&?bLhMD+-_FECrlZ|WW}I(8z)dRH=^6Rx4@Kl)gs1A 
z6ULT&Q+1nqUTv=F-SY4!UCv2qxyG*h)}!?3hdkp7D;T!^a|(GTlwXIgn3ugxS&pT) zKW#R56Fj(oa3{rf7xm!LqjTCk7joz^O=e`|Odf8qne`?k7V?HG6I1M1fhzLGOz#@f zh=`aCNO*PVB7kRAUE(O+U!`;a$N}W69qeRZ0$!KvYnQ2qkM=x{mYNrEEdB;cP^xBS z^@Pg8@XHH}9&M+pJ8#RG<*3u?i_haLqDqRP^=^Fp%ZrNym;ha!ipYu_FJmo^K>*b?LG#k^Yqr;l#!#(fxZIL(oj7$m_-WgY7+;?+~&>zTN=D2(Y zWO>yJ9&49@kU@$Xbh)Uen%G!QGwh?$phEv7qyrW;s8AzCd-cI61H?*!w2|4wS<|~u zy)9zI$AdI_r(!lMY}27~Zf?(hlaY|n?i*N4BtDs?$^TT}r~gi9XBQ7UxHF-ydkoA_ z<3CpM;6oi!3Rh+b#MK~QjX7y#m)ta&V|HdVbuD_pLJ8L*x&Y1L8Q3M+e7)K8dWrL( zs}Tl^L}$kbIdF z5ulJ!t^wDR2+4@SO@0c@3uyx~D+H}JtRf{H6iI77i|cL|xdfc%wWLjq7<&B37gAlt?Ssf}2<#pNvZ}&h7)jc@ z+>%Tw2e>7>)XrY(l?=8nuANBN^ZE^vNZ_ri1zlX>=NX9bnpZ;+y4Jjap*1jQ`0^DR z!@<%Cz6m`gvc5Hp$bu+1-Ktn6CKW2Az4c714k!OWFumz38c6H>ojpi3TP#*Hwrin% zEMMT(i|o2H=QAfj2tYKz7_saZR6iZ1cV{Rm?$JPXDAWqaa44RT8L7#o?l(A{c2kVi z%Z{R${!jrEn&$CV8Asr8I#-Z+!-4H~ ziV1lw>cU24J$!kL`+DExToStG2j&w|vJkI5*=sN1e%Lksfo=X(KUR*>Vm< zUh@`?MCR=mOumP8zq|d;|Bu1XUYiGrN zEUq%Ioe#|z=Pi)LiUSLe&|IBPKt(VZ9Zd7)qGSsugPpvCOu>GzxqZ$0C=xx%G;ei=wI7cl6HhUuxr z?7d<0W_z;|ePHR$8|D($4sLoxfcO7&=)!j4zfGTo0W9!ip* zdTm$MY`L;Zb;X|X>WbHfiMBw}RLzr;4xa}XMl16khe*jk*r6mqo3p3U#DNz&hNORj z_P;BjGu*|+#vbvbWx#d2befe<)owGtoBWuoR-2j292Y{G3{iYDA-kqAwqv^{9bM9< zp07N6&-0!u~_y~_sA z_&R5=$*2|wjwh>v;bK@u;50zsSx99xX)tErn!HoHb3WWSC*zeneBYAvtMy3DR;Uou z^%#l2o@FjY=aljHTT*)qddjXoi3Iod4=e!*sOLp_D@1YTP z-%S8Sk!YLx&3~iTbh1DtOE>MPL8}p~rG*wXxLbg%YZi`qop^EMrslzWeS=VwkS$=; zQVR!wLR5U!YMr)TubF3O2m9%VI!D*U$Wh{P%*3DnEUXBE(d0TlJ@IPF!ob4;?rcVA zzID-fBqiu>>>Y))n?zTaDpLro`L!|B2;GORs@n$hluryCE6kmHsmq)gdnR_XyKbph z)=AN<@T#%z_Q0~A={;zq&mu_8jCme@96uTU5fZBZkz>1?^l;Iv<(nil!B^2 zbEI|3eA0?DV>eG55!bQ5C=r&q>O*T2%ngg2y514%s>VI?;DeK%P9zp1o=MeIgcd*$ zcS#%Bl=P~s?NQu@tY&LsVQd$YxEsPpEDp6wYsQJym<2G>yoR)?Q%cZx0GtT)} zTNBUTxnl9na;W3$$fEMsJjYDz7@8RKy-iGJd80onH_Tr468Xla*&9j>b89wt_e^sx z@bNJddV{xKbw0UnxZx`iD61FR${}o8zg$0SNRE zlQ0qe+yXc``x3SQezg-D0>3++oX7|j^3%@ZU-dtO^6%~^XAHvF_S0JMz3#v6{~6+b z_dcoQ32F7yB*{*A|C~yGf51uYOA!1&jS>&ipXC4VPdX_&e#Lb", "ab"), + ("ab", "ab<c")): + 
self.assertEqual(re.sub(DocxHandler.LT_PATTERN, '<', original), + escaped) + + def test_lt(self): + # Parse original file + path = '{}/with_lt.docx'.format(self.TESTFILE_BASE) + with open(path, 'rb') as f: + content = f.read() + handler = DocxHandler() + template, stringset = handler.parse(content) + + # Make sure extracted data is OK + self.assertEqual(len(stringset), 1) + openstring = stringset[0] + self.assertEqual(openstring.order, 0) + self.assertEqual(openstring.string, + u'This is a < lessthan') + self.assertEqual(openstring.string, openstring.key) + + # Compile with altered translation + translation = U'THIS IS AN < LESSTHAN' + stringset = [ + OpenString(openstring.key, translation, order=0) + ] + content = handler.compile(template, stringset) + + # Make sure compiled file has altered data + docx = DocxFile(content) + self.assertFalse("This is a" in docx.get_document()) + self.assertFalse("lessthan" in docx.get_document()) + self.assertTrue("THIS IS A" in docx.get_document()) + self.assertTrue("LESSTHAN" in docx.get_document()) + + # Parse compiled file + template, stringset = handler.parse(content) + + # Make sure compiled file has the correct translation + self.assertEqual(len(stringset), 1) + openstring = stringset[0] + self.assertEqual(openstring.order, 0) + self.assertEqual(openstring.string, translation) + self.assertEqual(openstring.string, openstring.key) From 751ebf81be1ade937423d9c1d6c775c5a215cbea Mon Sep 17 00:00:00 2001 From: Konstantinos Bairaktaris Date: Thu, 15 Jul 2021 09:48:23 +0300 Subject: [PATCH 4/4] Avoid using regex for escaping --- openformats/formats/docx.py | 38 +++++++++++---------- openformats/tests/formats/docx/test_docx.py | 8 ++--- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/openformats/formats/docx.py b/openformats/formats/docx.py index eb1bf9b3..5cf42905 100644 --- a/openformats/formats/docx.py +++ b/openformats/formats/docx.py @@ -1,17 +1,15 @@ +import io import itertools -import re import os +import 
shutil import tempfile import uuid -import six -import io -import shutil +from zipfile import ZIP_DEFLATED, ZipFile +import six from bs4 import BeautifulSoup -from zipfile import ZipFile, ZIP_DEFLATED - -from openformats.strings import OpenString from openformats.handlers import Handler +from openformats.strings import OpenString class DocxFile(object): @@ -171,11 +169,6 @@ class DocxHandler(Handler): EXTRACTS_RAW = False name = "DOCX" - LT_PATTERN = r'\<(?!tx|/tx)' - LT_PATTERN = (LT_PATTERN.decode("utf8") - if isinstance(LT_PATTERN, six.binary_type) - else LT_PATTERN) - @classmethod def get_hyperlink_url(cls, element, document_rels): run_parent = element.find_parent('w:r').parent @@ -328,12 +321,7 @@ def compile(self, template, stringset, **kwargs): continue translation = stringset[txid].string - - # Do escaping: BeautifulSoup doesn't like unescaped '&' or '<' in - # its input. We do expect some '' and `` so we only escape - # '<' if it's not part of '' - translation = translation.replace(u"&", u"&") - translation = re.sub(self.LT_PATTERN, u"<", translation) + translation = self._escape_xml(translation) translation_soup = BeautifulSoup( u'{}'.format(translation), 'xml', @@ -395,3 +383,17 @@ def compile(self, template, stringset, **kwargs): result = docx.compress() docx.delete() return result + + @staticmethod + def _escape_xml(translation): + """ Do escaping: BeautifulSoup doesn't like unescaped '&' or '<' in its + input. We do expect some '' and `` so we first replace + these tags to placeholders, do the escaping and restore them. 
+ """ + return translation.\ + replace(u"", u"__TX__CLOSING__TAG__").\ + replace(u"&", "&").\ + replace(u"<", "<").\ + replace(u"__TX__OPENING__TAG__", u"") diff --git a/openformats/tests/formats/docx/test_docx.py b/openformats/tests/formats/docx/test_docx.py index f680ad79..b27bdd61 100644 --- a/openformats/tests/formats/docx/test_docx.py +++ b/openformats/tests/formats/docx/test_docx.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -import re import unittest from openformats.formats.docx import DocxFile, DocxHandler @@ -628,13 +627,12 @@ def test_ampersand(self): self.assertEqual(openstring.string, translation) self.assertEqual(openstring.string, openstring.key) - def test_lt_pattern(self): + def test_escape_xml(self): for original, escaped in (("ab", "ab"), ("ab", "ab"), ("ab", "ab<c")): - self.assertEqual(re.sub(DocxHandler.LT_PATTERN, '<', original), - escaped) + self.assertEqual(DocxHandler._escape_xml(original), escaped) def test_lt(self): # Parse original file @@ -653,7 +651,7 @@ def test_lt(self): self.assertEqual(openstring.string, openstring.key) # Compile with altered translation - translation = U'THIS IS AN < LESSTHAN' + translation = U'THIS IS A < LESSTHAN' stringset = [ OpenString(openstring.key, translation, order=0) ]