proper testing of data_to_xml and write_file

ssciwr · Sep 24, 2024 · b09e019 · b09e019
1 parent e4e3e56
commit b09e019
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -136,6 +136,7 @@ dmypy.json
 *.xml*
 *.html*
 !mailcom/test/data/*.eml
+!mailcom/test/data/*.xml
 
 # models
 test/models
diff --git a/mailcom/inout.py b/mailcom/inout.py
@@ -70,7 +70,7 @@ def validate_data(self):
 
     def data_to_xml(self, text):
         my_item_func = lambda x: 'content'
-        xml = dicttoxml(text, custom_root='email', item_func = my_item_func)  # Different options for review
+        xml = dicttoxml(text, custom_root='email', item_func = my_item_func)
         return xml.decode()
 
     def write_file(self, text: str, name: str)-> None:

diff --git a/mailcom/parse.py b/mailcom/parse.py
@@ -119,8 +119,6 @@ def make_dir(path: str):
     # process the text
     io = InoutHandler(path_input)
     io.list_of_files()
-    io = InoutHandler(path_input)
-    io.list_of_files()
     # html_files = list_of_files(path_input, "html")
     for file in io.email_list:
         text = io.get_text(file)
@@ -133,16 +131,18 @@ def make_dir(path: str):
         # print(io.email_content["attachement type"])
         # skip this text if email could not be parsed
         if not text:
-            continue    
-        # doc_spacy = nlp_spacy(text)
+            continue 
+        ### nlp = init_spacy(sprache)   
+        # doc_spacy = nlp_spacy(text) ### missing - old version
         # text = get_sentences(doc_spacy)
         # start with first line
         # here you can limit the number of sentences to parse
         # newlist = []
-        # max_i = len(text)
+        # max_i = len(text) ### remove
+        ### init transformers
         # for i in range(0, max_i):
-        #     if tool == "transformers":
-        #         nlps = nlp_transformers(text[i])
+        #     if tool == "transformers": ### there is only one
+        #         nlps = nlp_transformers(text[i]) ### missing, or rather process_doc
         #         doc = nlps
         #     newlist.append(process_doc(doc, ner_tool=tool, text=text[i]))
         #     newlist[i] = " ".join(newlist[i])

diff --git a/mailcom/test/data/test.out b/mailcom/test/data/test.out
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">This is nothing more than a test</content><date type="str">2024-04-17T15:13:56+00:00</date><attachment type="int">2</attachment><attachement_type type="list"><content type="str">jpg</content></attachement_type></email>
diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py
@@ -3,10 +3,12 @@
 from pathlib import Path
 from importlib import resources
 import datetime
+import filecmp
 
 pkg = resources.files("mailcom")
 
 FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml")
+XML_PATH = Path(pkg / "test" / "data" / "test.out")
 
 TEXT_REF = "J'espère que tu vas bien!"
 XML_REF = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><email><content type=\"str\">"
@@ -44,13 +46,19 @@ def test_get_text(get_instant):
     assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
     with pytest.raises(OSError):
         get_instant.get_text(get_instant.directory_name / "nonexisting.eml")
-    return text
 
 def test_get_html_text(get_instant):
     html = """<html><head><title>Test</title></head></html>"""
     assert get_instant.get_html_text(html) == 'Test'
     noHtml = """Test"""
     assert get_instant.get_html_text(noHtml) == 'Test'
 
-def test_data_to_xml(get_instant):
-    assert get_instant.data_to_xml(test_get_text)[0:66] == XML_REF
+def test_data_to_xml(get_instant, tmp_path):
+    """Serialize a fixed email dict, write it out, and compare with the reference file."""
+    xml_content = {
+        "content": "This is nothing more than a test",
+        "date": "2024-04-17T15:13:56+00:00",
+        "attachment": 2,
+        # A list, not a set: the former literal {'jpg', 'jpg'} silently
+        # collapsed to one element, which is what the reference file holds.
+        "attachement type": ["jpg"],
+    }
+    xml = get_instant.data_to_xml(xml_content)
+    # The file is looked up as "test.out" below, so write_file presumably
+    # appends the ".out" suffix to the given name -- confirm in inout.py.
+    get_instant.write_file(xml, tmp_path / "test")
+    # shallow=False forces a byte-by-byte comparison instead of trusting
+    # os.stat() signatures (size/mtime) alone.
+    assert filecmp.cmp(XML_PATH, tmp_path / "test.out", shallow=False)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">This is nothing more than a test</content><date type="str">2024-04-17T15:13:56+00:00</date><attachment type="int">2</attachment><attachement_type type="list"><content type="str">jpg</content></attachement_type></email>