From b09e0197720edc7ab5efacf5d4c6afd7194c4446 Mon Sep 17 00:00:00 2001 From: Thore Schoeller Date: Tue, 24 Sep 2024 11:01:22 +0200 Subject: [PATCH] proper testing of data_to_xml and write_file --- .gitignore | 1 + mailcom/inout.py | 2 +- mailcom/parse.py | 14 +++++++------- mailcom/test/data/test.out | 1 + mailcom/test/test_inout.py | 14 +++++++++++--- 5 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 mailcom/test/data/test.out diff --git a/.gitignore b/.gitignore index 6e14643..a00cceb 100644 --- a/.gitignore +++ b/.gitignore @@ -136,6 +136,7 @@ dmypy.json *.xml* *.html* !mailcom/test/data/*.eml +!mailcom/test/data/*.xml # models test/models diff --git a/mailcom/inout.py b/mailcom/inout.py index f870910..15c7252 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -70,7 +70,7 @@ def validate_data(self): def data_to_xml(self, text): my_item_func = lambda x: 'content' - xml = dicttoxml(text, custom_root='email', item_func = my_item_func) # Different options for review + xml = dicttoxml(text, custom_root='email', item_func = my_item_func) return xml.decode() def write_file(self, text: str, name: str)-> None: diff --git a/mailcom/parse.py b/mailcom/parse.py index 5dbc469..a21f881 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -119,8 +119,6 @@ def make_dir(path: str): # process the text io = InoutHandler(path_input) io.list_of_files() - io = InoutHandler(path_input) - io.list_of_files() # html_files = list_of_files(path_input, "html") for file in io.email_list: text = io.get_text(file) @@ -133,16 +131,18 @@ def make_dir(path: str): # print(io.email_content["attachement type"]) # skip this text if email could not be parsed if not text: - continue - # doc_spacy = nlp_spacy(text) + continue + ### nlp = init_spacy(sprache) + # doc_spacy = nlp_spacy(text) ### fehlt - alte version # text = get_sentences(doc_spacy) # start with first line # here you can limit the number of sentences to parse # newlist = [] - # max_i = len(text) + # max_i = len(text) ### weg + ### init transformers # for i in range(0, max_i): - # if tool == "transformers": - # nlps = nlp_transformers(text[i]) + # if tool == "transformers": ### gibt nur eins + # nlps = nlp_transformers(text[i]) ### fehlty bzw process_doc # doc = nlps # newlist.append(process_doc(doc, ner_tool=tool, text=text[i])) # newlist[i] = " ".join(newlist[i]) diff --git a/mailcom/test/data/test.out b/mailcom/test/data/test.out new file mode 100644 index 0000000..d8575e9 --- /dev/null +++ b/mailcom/test/data/test.out @@ -0,0 +1 @@ +This is nothing more than a test2024-04-17T15:13:56+00:002jpg \ No newline at end of file diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py index 3e74afd..1f30ff0 100644 --- a/mailcom/test/test_inout.py +++ b/mailcom/test/test_inout.py @@ -3,10 +3,12 @@ from pathlib import Path from importlib import resources import datetime +import filecmp pkg = resources.files("mailcom") FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml") +XML_PATH = Path(pkg / "test" / "data" / "test.out") TEXT_REF = "J'espère que tu vas bien!" XML_REF = "" @@ -44,7 +46,6 @@ def test_get_text(get_instant): assert get_instant.email_content["attachement type"] == ['jpg', 'jpg'] with pytest.raises(OSError): get_instant.get_text(get_instant.directory_name / "nonexisting.eml") - return text def test_get_html_text(get_instant): html = """Test""" @@ -52,5 +53,12 @@ def test_get_html_text(get_instant): noHtml = """Test""" assert get_instant.get_html_text(noHtml) == 'Test' -def test_data_to_xml(get_instant): - assert get_instant.data_to_xml(test_get_text)[0:66] == XML_REF \ No newline at end of file +def test_data_to_xml(get_instant,tmp_path): + xml_content = {"content": "This is nothing more than a test", + "date": "2024-04-17T15:13:56+00:00", + "attachment": 2, + "attachement type": {'jpg', 'jpg'} + } + xml = get_instant.data_to_xml(xml_content) + get_instant.write_file(xml, tmp_path / "test") + assert filecmp.cmp(XML_PATH, tmp_path / "test.out")