diff --git a/evernote_to_sqlite/utils.py b/evernote_to_sqlite/utils.py index 398cba7..1d8fa81 100644 --- a/evernote_to_sqlite/utils.py +++ b/evernote_to_sqlite/utils.py @@ -31,9 +31,15 @@ def save_note(db, note): updated = note.find("updated").text else: updated = created - # Some content has &nbsp; which breaks the XML parser - content_xml = resolve_entities(note.find("content").text.strip()) - content = ET.tostring(ET.fromstring(content_xml)).decode("utf-8") + + # At this point content pretends to be XML - it starts with a + # <?xml ...?> prolog and a <!DOCTYPE ...> + # BUT... it's simply not valid! Should treat as HTML instead. + # https://github.com/dogsheep/evernote-to-sqlite/issues/13 + # Strip the <?xml ...?> and <!DOCTYPE ...> wrapper + content = resolve_entities( + strip_prolog_and_doctype(note.find("content").text.strip()) + ) row = { "title": title, "content": content, @@ -106,6 +112,8 @@ def convert_datetime(s): _entities_re = re.compile(r"&(\w+);") +_prolog_re = re.compile(r"<\?xml[^>]+>") +_doctype_re = re.compile(r"<!DOCTYPE[^>]+>") def resolve_entities(s): @@ -113,3 +121,7 @@ def resolve_entities(s): return _entities_re.sub( lambda m: html.entities.entitydefs.get(m.group(1), m.group(1)), s ) + + +def strip_prolog_and_doctype(not_xml): + return _doctype_re.sub("", _prolog_re.sub("", not_xml)).strip() diff --git a/tests/test_evernote_to_sqlite.py b/tests/test_evernote_to_sqlite.py index eb9666e..43775bb 100644 --- a/tests/test_evernote_to_sqlite.py +++ b/tests/test_evernote_to_sqlite.py @@ -39,9 +39,9 @@ def test_enex(tmpdir): } assert list(db["notes"].rows) == [ { - "id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5", + "id": "9db0b9b53af965103eca742e32118808c4b2706c", "title": "Example note with images", - "content": '
This note includes two images. š.

The Python logo



The Evernote logo



This image contains text:


', + "content": '
This note includes two images. š.

The Python logo



The Evernote logo



This image contains text:


', "created": "2020-10-11T21:28:22", "updated": "2020-10-11T23:30:38", "latitude": "37.77742571705006", @@ -96,15 +96,15 @@ def test_enex(tmpdir): ] assert list(db["note_resources"].rows) == [ { - "note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5", + "note_id": "9db0b9b53af965103eca742e32118808c4b2706c", "resource_id": "61098c2c541de7f0a907c301dd6542da", }, { - "note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5", + "note_id": "9db0b9b53af965103eca742e32118808c4b2706c", "resource_id": "91bd26175acac0b2ffdb6efac199f8ca", }, { - "note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5", + "note_id": "9db0b9b53af965103eca742e32118808c4b2706c", "resource_id": "76dd28b07797cc9f3f129c4871c5293c", }, ]