Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data extraction refactoring #69

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
01a5ed4
Update Dockerfile
Ana-Sovat Dec 2, 2023
c01aa33
Revert "Update Dockerfile"
Ana-Sovat Dec 2, 2023
a0220c0
change podman to docker in build command
Ana-Sovat Dec 2, 2023
e7b41bc
fix typo
Ana-Sovat Dec 2, 2023
934793c
changing podman to docker in makefile for setup
Ana-Sovat Dec 2, 2023
c5f2974
change podman to docker in wait-for
Ana-Sovat Dec 19, 2023
721aff3
change podman to docker in create-pod and destroy-pod
Ana-Sovat Dec 19, 2023
89ba401
connecting containers to 'pod'
Ana-Sovat Dec 19, 2023
65e7f30
remove podman exclusive flag for docker rm
Ana-Sovat Dec 19, 2023
0ae3e7c
remove all --ignore flags
Ana-Sovat Dec 19, 2023
a12a3f9
add sleep command to docker run when creating the 'pod'
Ana-Sovat Dec 19, 2023
f4fb2ac
change podman to docker in wait-for (--pod flag)
Ana-Sovat Dec 19, 2023
0f9187f
remove --ignore from docker stop
Ana-Sovat Dec 19, 2023
5e3ebed
remove 'docker stop' line from stop-apache-tika-server
Ana-Sovat Dec 19, 2023
3c0ecdb
change podman to docker in
Ana-Sovat Dec 19, 2023
e131344
fix port publishing
Ana-Sovat Dec 22, 2023
2f0ea07
change podman to docker in re-run
Ana-Sovat Dec 22, 2023
ba1c9d8
added .env to be moved to querido-diario repo
Ana-Sovat Dec 23, 2023
86f5990
refactoring file type recognition
Ana-Sovat Dec 29, 2023
347689d
more refactoring
Ana-Sovat Dec 29, 2023
977504a
boolean fix for refresh_index
Ana-Sovat Dec 30, 2023
94ae417
Revert "boolean fix for refresh_index"
Ana-Sovat Dec 30, 2023
a4c7f18
revert changes made for developing environment
Ana-Sovat Dec 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 19 additions & 18 deletions data_extraction/text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,44 +22,45 @@ def _return_file_content(self, filepath: str) -> str:
with open(filepath, "r") as file:
return file.read()

def _try_extract_text(self, filepath: str) -> str:
if self.is_txt(filepath):
def _try_extract_text(self, filepath: str, file_type) -> str:
if self.is_txt(file_type):
return self._return_file_content(filepath)
with open(filepath, "rb") as file:
headers = {"Content-Type": self._get_file_type(filepath)}
headers = {"Content-Type": file_type}
response = requests.put(f"{self._url}/tika", data=file, headers=headers)
response.encoding = "UTF-8"
return response.text

def extract_text(self, filepath: str) -> str:
logging.debug(f"Extracting text from {filepath}")
self.check_file_exists(filepath)
self.check_file_type_supported(filepath)
file_type = self.get_file_type(filepath)
self.check_file_type_supported(file_type)
try:
return self._try_extract_text(filepath)
return self._try_extract_text(filepath, file_type)
except Exception as e:
raise Exception("Could not extract file content") from e

def check_file_exists(self, filepath: str):
if not os.path.exists(filepath):
raise Exception(f"File does not exists: {filepath}")

def check_file_type_supported(self, filepath: str) -> None:
def check_file_type_supported(self, found_type) -> None:
if (
not self.is_doc(filepath)
and not self.is_pdf(filepath)
and not self.is_txt(filepath)
not self.is_doc(found_type)
and not self.is_pdf(found_type)
and not self.is_txt(found_type)
):
raise Exception("Unsupported file type: " + self.get_file_type(filepath))
raise Exception("Unsupported file type: " + found_type)

def is_pdf(self, filepath):
def is_pdf(self, found_type):
"""
If the file type is pdf returns True. Otherwise,
returns False
"""
return self.is_file_type(filepath, file_types=["application/pdf"])
return found_type in ["application/pdf"]

def is_doc(self, filepath):
def is_doc(self, found_type):
"""
If the file type is doc or similar returns True. Otherwise,
returns False
Expand All @@ -69,26 +70,26 @@ def is_doc(self, filepath):
"application/vnd.oasis.opendocument.text",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
return self.is_file_type(filepath, file_types)
return found_type in file_types

def is_txt(self, filepath):
def is_txt(self, found_type):
"""
If the file type is txt returns True. Otherwise,
returns False
"""
return self.is_file_type(filepath, file_types=["text/plain"])
return found_type in ["text/plain"]

def get_file_type(self, filepath):
"""
Returns the file's type
"""
return magic.from_file(filepath, mime=True)

def is_file_type(self, filepath, file_types):
def is_file_type(self, found_type, file_types):
"""
Generic method to check if an identified file type matches a given list of types
"""
return self.get_file_type(filepath) in file_types
return found_type in file_types


def get_apache_tika_server_url():
Expand Down