Skip to content

Commit

Permalink
Handle invalid en-note XML, closes #13
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed Aug 26, 2021
1 parent 36a466f commit d877d83
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 8 deletions.
18 changes: 15 additions & 3 deletions evernote_to_sqlite/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,15 @@ def save_note(db, note):
updated = note.find("updated").text
else:
updated = created
# Some content has &nbsp; which breaks the XML parser
content_xml = resolve_entities(note.find("content").text.strip())
content = ET.tostring(ET.fromstring(content_xml)).decode("utf-8")

# At this point content pretends to be XML - it starts with a
# <?xml?> prolog and a <!DOCTYPE and wraps content in <en-note>
# BUT... it's simply not valid! Should treat as HTML instead.
# https://github.com/dogsheep/evernote-to-sqlite/issues/13
# Strip the <?xml?> and <!DOCTYPE but leave the <en-note> wrapper
content = resolve_entities(
strip_prolog_and_doctype(note.find("content").text.strip())
)
row = {
"title": title,
"content": content,
Expand Down Expand Up @@ -106,10 +112,16 @@ def convert_datetime(s):


# Named character reference such as "&nbsp;"; group 1 captures the name
# between the "&" and the ";".
_entities_re = re.compile(r"&(\w+);")
# An "<?xml ...>" declaration, matched up to the first ">".
_prolog_re = re.compile(r"<\?xml[^>]+>")
# A "<!DOCTYPE ...>" declaration, matched up to the first ">".
_doctype_re = re.compile(r"<!DOCTYPE[^>]+>")


def resolve_entities(s):
    """Replace named character references in *s* with their characters.

    Every ``&name;`` sequence whose name is a key of
    ``html.entities.entitydefs`` (for example ``&nbsp;``) is replaced by
    the mapped text. A sequence with an unrecognised name is left exactly
    as written; it used to be reduced to the bare name, silently dropping
    the ``&`` and ``;``.

    Numeric references such as ``&#353;`` are not matched by
    ``_entities_re`` (``#`` is not a word character), so they pass through
    unchanged.
    """
    # NOTE(review): entitydefs also lists markup-significant names (lt, gt,
    # amp, quot), so those are decoded too - confirm callers want that.
    entitydefs = html.entities.entitydefs

    def _replace(match):
        # Fall back to the whole match so unknown names survive verbatim.
        return entitydefs.get(match.group(1), match.group(0))

    return _entities_re.sub(_replace, s)


def strip_prolog_and_doctype(not_xml):
    """Return *not_xml* with prolog and doctype declarations removed.

    Every match of ``_prolog_re`` is deleted first, then every match of
    ``_doctype_re``; leading and trailing whitespace is stripped from
    what remains.
    """
    without_prolog = _prolog_re.sub("", not_xml)
    without_doctype = _doctype_re.sub("", without_prolog)
    return without_doctype.strip()
10 changes: 5 additions & 5 deletions tests/test_evernote_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ def test_enex(tmpdir):
}
assert list(db["notes"].rows) == [
{
"id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5",
"id": "9db0b9b53af965103eca742e32118808c4b2706c",
"title": "Example note with images",
"content": '<en-note><div>This note includes two images. &#353;.</div><div><br /></div><div><span style="font-weight: bold;">The Python logo</span></div><div><br /></div><div><en-media hash="61098c2c541de7f0a907c301dd6542da" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div><span style="font-weight: bold;">The Evernote logo</span></div><div><br /></div><div><en-media hash="91bd26175acac0b2ffdb6efac199f8ca" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div>This image contains text:</div><div><br /></div><div><en-media hash="76dd28b07797cc9f3f129c4871c5293c" type="image/png" /></div><div><br /></div></en-note>',
"content": '<en-note><div>This note includes two images. š.</div><div><br /></div><div><span style="font-weight: bold;">The Python logo</span></div><div><br /></div><div><en-media hash="61098c2c541de7f0a907c301dd6542da" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div><span style="font-weight: bold;">The Evernote logo</span></div><div><br /></div><div><en-media hash="91bd26175acac0b2ffdb6efac199f8ca" type="image/svg+xml" width="125" /><br /></div><div><br /></div><div>This image contains text:</div><div><br /></div><div><en-media hash="76dd28b07797cc9f3f129c4871c5293c" type="image/png" /></div><div><br /></div></en-note>',
"created": "2020-10-11T21:28:22",
"updated": "2020-10-11T23:30:38",
"latitude": "37.77742571705006",
Expand Down Expand Up @@ -96,15 +96,15 @@ def test_enex(tmpdir):
]
assert list(db["note_resources"].rows) == [
{
"note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5",
"note_id": "9db0b9b53af965103eca742e32118808c4b2706c",
"resource_id": "61098c2c541de7f0a907c301dd6542da",
},
{
"note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5",
"note_id": "9db0b9b53af965103eca742e32118808c4b2706c",
"resource_id": "91bd26175acac0b2ffdb6efac199f8ca",
},
{
"note_id": "1ea8b9baeca91343cdb6a12d44f5cdb2edf5f2e5",
"note_id": "9db0b9b53af965103eca742e32118808c4b2706c",
"resource_id": "76dd28b07797cc9f3f129c4871c5293c",
},
]
Expand Down

0 comments on commit d877d83

Please sign in to comment.