diff --git a/web/frontend/libs/tinymce_extensions.js b/web/frontend/libs/tinymce_extensions.js
index 7b2b017ca..13f4228f1 100644
--- a/web/frontend/libs/tinymce_extensions.js
+++ b/web/frontend/libs/tinymce_extensions.js
@@ -436,6 +436,10 @@ export function getInitConfig(selector, enhanced, code) {
     paste_remove_styles: true,
     paste_remove_styles_if_webkit: true,
     paste_strip_class_attributes: "all",
+    // image URLs
+    relative_urls: false,
+    convert_urls: false,
+    remove_script_host: false,
     media_dimensions: false,
     extended_valid_elements: extend_valid_elements,
     setup: (editor) => {
diff --git a/web/main/test/test_export.py b/web/main/test/test_export.py
index fd02958e6..cad8e94c8 100644
--- a/web/main/test/test_export.py
+++ b/web/main/test/test_export.py
@@ -1,4 +1,5 @@
 from io import BytesIO
+import itertools
 from pathlib import Path
 from zipfile import ZipFile
 
@@ -366,3 +367,36 @@ def test_annotated_export_invalid_clamped(annotations_factory):
     resource = annotations_factory("LegalDocument", input)[1]
     resource.annotations.update(global_end_offset=1000)  # move end offset past end of text
     assert annotated_content_for_export(resource) == expected
+
+
+def test_disallowed_images_stripped(rf, text_block_factory, resource_factory):
+    """With a request, HTML export keeps only images served from the request's own host."""
+    request = rf.get("/spoof-export-request")
+
+    disallowed_srcs = ["/etc/hosts", "../../images/foo", "http://example.com"]
+
+    allowed_srcs = [
+        f"http://{request.get_host()}/foo",
+        f"https://{request.get_host()}/foo",
+    ]
+
+    text = "".join(
+        f'<img src="{src}">' for src in itertools.chain(disallowed_srcs, allowed_srcs)
+    )
+
+    text_block = text_block_factory(content=text)
+    resource = resource_factory(resource=text_block, resource_type="TextBlock")
+
+    # Establish that all images are present in the unaltered HTML
+    unaltered_html = resource.export(False, None, file_type="html")
+    for src in itertools.chain(disallowed_srcs, allowed_srcs):
+        assert src in unaltered_html
+
+    # Provide a spoofed `request` object which is a required argument for proper export of rich text
+    # https://github.com/harvard-lil/h2o/blob/dd67276720fe3a7af7e110da958448399a92399f/web/main/utils.py#L282
+    # Then, establish that only allowed image sources are present.
+    html = resource.export(False, None, file_type="html", export_options={"request": request})
+    for src in disallowed_srcs:
+        assert src not in html
+    for src in allowed_srcs:
+        assert src in html