Skip to content

Commit

Permalink
use instance of the class for better interface
Browse files Browse the repository at this point in the history
  • Loading branch information
iulusoy committed Sep 13, 2024
1 parent ec1e25d commit 70e96f3
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 66 deletions.
62 changes: 31 additions & 31 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,27 @@
from bs4 import BeautifulSoup

class InoutHandler:
@staticmethod
def list_of_files(directory_name: str) -> list[Path]:
"""Function to create a list of files that are present in a directory as path objects.
def __init__(self, directory_name: str):
"""Constructor for the InoutHandler class.
Args:
directory_name (str): The directory where the files are located.
Returns:
list[Path]: A list of Path objects that represent the files in the directory."""
if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
raise OSError("Path {} does not exist".format(directory_name))
mypath = Path(directory_name)
pattern = [".eml", ".html"] # we would not change the file type through user input
email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
if len(email_list) == 0:
"""
self.directory_name = directory_name
# presets
self.pattern = [".eml", ".html"]

def list_of_files(self):
"""Method to create a list of Path objects (files) that are present
in a directory."""
if not os.path.exists(self.directory_name): # check if given dir exists raises error otherwise
raise OSError("Path {} does not exist".format(self.directory_name))
mypath = Path(self.directory_name)
self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern]
if len(self.email_list) == 0:
raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
return email_list

@staticmethod
def get_html_text(text_check: str) -> str:
def get_html_text(self, text_check: str) -> str:
"""Clean up a string if it contains html content.
Args:
text_check (str): The string that may contain html content.
Expand All @@ -35,8 +36,7 @@ def get_html_text(text_check: str) -> str:
text_check = soup.get_text()
return text_check

@staticmethod
def get_text(file: Path) -> str:
def get_text(self, file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
Args:
Expand All @@ -57,24 +57,24 @@ def get_text(file: Path) -> str:
# find the types of attachments
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
email_content = {"content": parsed_eml["body"][0]["content"],
self.email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(email_content["content"])
return(self.email_content["content"])

def validate_data():
return
def validate_data(self):
pass

Check warning on line 68 in mailcom/inout.py

View check run for this annotation

Codecov / codecov/patch

mailcom/inout.py#L68

Added line #L68 was not covered by tests

def data_to_xml():
return
def data_to_xml(self):
pass

Check warning on line 71 in mailcom/inout.py

View check run for this annotation

Codecov / codecov/patch

mailcom/inout.py#L71

Added line #L71 was not covered by tests

def write_file(text: str, name: str)-> None:
"""Write the extracted string to a text file.
Args:
text (str): The string to be written to the file.
name (str): The name of the file to be written."""
with open("{}.out".format(name), "w") as file:
file.write(text)
def write_file(self, text: str, name: str)-> None:
"""Write the extracted string to a text file.
Args:
text (str): The string to be written to the file.
name (str): The name of the file to be written."""
with open("{}.out".format(name), "w") as file:
file.write(text)

Check warning on line 80 in mailcom/inout.py

View check run for this annotation

Codecov / codecov/patch

mailcom/inout.py#L79-L80

Added lines #L79 - L80 were not covered by tests
64 changes: 29 additions & 35 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,56 +2,50 @@
import pytest
from pathlib import Path
from importlib import resources
import datetime

pkg = resources.files("mailcom")
io = inout.InoutHandler()

FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml")

TEXT_REF = "J'espère que tu vas bien!"

def test_list_of_files_found(tmp_path):
p = tmp_path / "test.eml"
p.write_text("test")
assert len(io.list_of_files(tmp_path)) != 0
@pytest.fixture()
def get_instant(tmp_path):
return inout.InoutHandler(tmp_path)

def test_list_of_files_empty(tmp_path):
def test_list_of_files(get_instant):
with pytest.raises(ValueError):
io.list_of_files(tmp_path)

def test_list_of_files_dir_not_existing():
with pytest.raises(OSError):
io.list_of_files("nonexistingDir")

def test_list_of_files_correct_format(tmp_path):
p = tmp_path / "test.eml"
get_instant.list_of_files()
p = get_instant.directory_name / "test.eml"
p.write_text("test")
p = tmp_path / "test2.html"
get_instant.list_of_files()
assert len(get_instant.email_list) != 0
get_instant2 = inout.InoutHandler("nonexistingDir")
with pytest.raises(OSError):
get_instant2.list_of_files()
p = get_instant.directory_name / "test2.html"
p.write_text("test2")
p = tmp_path / "test3.xml"
p = get_instant.directory_name / "test3.xml"
p.write_text("test3")
assert tmp_path / "test3.xml" not in io.list_of_files(tmp_path)
get_instant.list_of_files()
assert get_instant.directory_name / "test3.xml" not in get_instant.email_list

def test_get_text(tmp_path):
p = tmp_path / "test.eml"
def test_get_text(get_instant):
p = get_instant.directory_name / "test.eml"
p.write_text("test")
assert io.get_text(p) == 'test'
text = io.get_text(FILE_PATH)
print(text[0:25])
extracted_text = get_instant.get_text(p)
assert extracted_text == 'test'
text = get_instant.get_text(FILE_PATH)
assert text[0:25] == TEXT_REF

def test_get_text_err():
assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc)
assert get_instant.email_content["attachment"] == 2
assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
with pytest.raises(OSError):
io.list_of_files("nonexistingDir")
get_instant.get_text(get_instant.directory_name / "nonexisting.eml")

def test_get_html_text():
def test_get_html_text(get_instant):
html = """<html><head><title>Test</title></head></html>"""
assert io.get_html_text(html) == 'Test'

def test_get_html_text_noHtml():
assert get_instant.get_html_text(html) == 'Test'
noHtml = """Test"""
assert io.get_html_text(noHtml) == 'Test'

def test_get_text_no_file(tmp_path):
p = tmp_path / "test.eml"
with pytest.raises(OSError):
io.get_text(p)
assert get_instant.get_html_text(noHtml) == 'Test'

0 comments on commit 70e96f3

Please sign in to comment.