From 4e8b36da631dd85cc8ead4d652b80448dd1b4acb Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 30 Jul 2024 13:59:18 -0400 Subject: [PATCH 1/3] Disable all img src munging done by TinyMCE. --- web/frontend/libs/tinymce_extensions.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/web/frontend/libs/tinymce_extensions.js b/web/frontend/libs/tinymce_extensions.js index 7b2b017ca..13f4228f1 100644 --- a/web/frontend/libs/tinymce_extensions.js +++ b/web/frontend/libs/tinymce_extensions.js @@ -436,6 +436,10 @@ export function getInitConfig(selector, enhanced, code) { paste_remove_styles: true, paste_remove_styles_if_webkit: true, paste_strip_class_attributes: "all", + // image URLs + relative_urls: false, + convert_urls: false, + remove_script_host : false, media_dimensions: false, extended_valid_elements: extend_valid_elements, setup: (editor) => { From f873dbcdaf36df5070e8f692362e8cccd6caf1f0 Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 30 Jul 2024 16:27:01 -0400 Subject: [PATCH 2/3] Test that disallowed image srcs are stripped during export. 
--- web/main/test/test_export.py | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/web/main/test/test_export.py b/web/main/test/test_export.py index fd02958e6..7e2e3c6be 100644 --- a/web/main/test/test_export.py +++ b/web/main/test/test_export.py @@ -1,4 +1,5 @@ from io import BytesIO +import itertools from pathlib import Path from zipfile import ZipFile @@ -366,3 +367,39 @@ def test_annotated_export_invalid_clamped(annotations_factory): resource = annotations_factory("LegalDocument", input)[1] resource.annotations.update(global_end_offset=1000) # move end offset past end of text assert annotated_content_for_export(resource) == expected + + +def test_disallowed_images_stripped(rf, text_block_factory, resource_factory): + request = rf.get('/spoof-export-request') + + disallowed_srcs = [ + '/etc/hosts', + '../../images/foo', + 'http://example.com' + ] + + allowed_srcs = [ + f"http://{request.get_host()}/foo", + f"https://{request.get_host()}/foo", + ] + + text = '' + for src in itertools.chain(disallowed_srcs, allowed_srcs): + text = text + f'<img src="{src}">' + + text_block = text_block_factory(content=text) + resource = resource_factory(resource=text_block, resource_type='TextBlock') + + # Establish that all images are present in the unaltered HTML + unaltered_html = resource.export(False, None, file_type='html') + for src in itertools.chain(disallowed_srcs, allowed_srcs): + assert src in unaltered_html + + # Provide a spoofed `request` object which is a required argument for proper export of rich text + # https://github.com/harvard-lil/h2o/blob/dd67276720fe3a7af7e110da958448399a92399f/web/main/utils.py#L282 + # Then, establish that only allowed image sources are present. 
+ html = resource.export(False, None, file_type='html', export_options={"request": request}) + for src in disallowed_srcs: + assert src not in html + for src in allowed_srcs: + assert src in html From d962c45b19594fd59fb191b64a548862f32ed72c Mon Sep 17 00:00:00 2001 From: Rebecca Cremona Date: Tue, 30 Jul 2024 17:14:01 -0400 Subject: [PATCH 3/3] Format the way `black` likes it. --- web/main/test/test_export.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/web/main/test/test_export.py b/web/main/test/test_export.py index 7e2e3c6be..cad8e94c8 100644 --- a/web/main/test/test_export.py +++ b/web/main/test/test_export.py @@ -370,35 +370,31 @@ def test_annotated_export_invalid_clamped(annotations_factory): def test_disallowed_images_stripped(rf, text_block_factory, resource_factory): - request = rf.get('/spoof-export-request') + request = rf.get("/spoof-export-request") - disallowed_srcs = [ - '/etc/hosts', - '../../images/foo', - 'http://example.com' - ] + disallowed_srcs = ["/etc/hosts", "../../images/foo", "http://example.com"] allowed_srcs = [ f"http://{request.get_host()}/foo", f"https://{request.get_host()}/foo", ] - text = '' + text = "" for src in itertools.chain(disallowed_srcs, allowed_srcs): text = text + f'<img src="{src}">' text_block = text_block_factory(content=text) - resource = resource_factory(resource=text_block, resource_type='TextBlock') + resource = resource_factory(resource=text_block, resource_type="TextBlock") # Establish that all images are present in the unaltered HTML - unaltered_html = resource.export(False, None, file_type='html') + unaltered_html = resource.export(False, None, file_type="html") for src in itertools.chain(disallowed_srcs, allowed_srcs): assert src in unaltered_html # Provide a spoofed `request` object which is a required argument for proper export of rich text # https://github.com/harvard-lil/h2o/blob/dd67276720fe3a7af7e110da958448399a92399f/web/main/utils.py#L282 # Then, establish that only allowed 
image sources are present. - html = resource.export(False, None, file_type='html', export_options={"request": request}) + html = resource.export(False, None, file_type="html", export_options={"request": request}) for src in disallowed_srcs: assert src not in html for src in allowed_srcs: