Skip to content

Commit

Permalink
put clean text back in, correct default image data path (#145)
Browse files Browse the repository at this point in the history
  • Loading branch information
iulusoy authored Aug 31, 2023
1 parent e120c10 commit 8eb4fca
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 2 deletions.
21 changes: 21 additions & 0 deletions ammico/test/test_text.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
import ammico.text as tt
import spacy


@pytest.fixture
Expand Down Expand Up @@ -30,6 +31,26 @@ def test_TextDetector(set_testdict):
assert not test_obj.analyse_text


def test_run_spacy(set_testdict, get_path):
    """Check that ``_run_spacy`` stores a Spacy ``Doc`` on the detector.

    The reference text for image ``IMG_3755`` is read from disk and placed
    into ``subdict["text_english"]`` before the pipeline is run.
    """
    detector = tt.TextDetector(set_testdict["IMG_3755"], analyse_text=True)
    with open(get_path + "text_IMG_3755.txt", "r") as handle:
        detector.subdict["text_english"] = handle.read()
    detector._run_spacy()
    assert isinstance(detector.doc, spacy.tokens.doc.Doc)


def test_clean_text(set_testdict):
    """Check that ``clean_text`` drops the made-up word from the sample text.

    A Spacy doc is built directly from a short sentence and attached to the
    detector, so no text extraction is needed for this test.
    """
    pipeline = spacy.load("en_core_web_md")
    sample_doc = pipeline("I like cats and fjejg")
    detector = tt.TextDetector(set_testdict["IMG_3755"])
    detector.doc = sample_doc
    detector.clean_text()
    # "fjejg" is expected to be removed from the cleaned text.
    assert detector.subdict["text_clean"] == "I like cats and"


def test_init_revision_numbers_and_models():
test_obj = tt.TextDetector({})
# check the default options
Expand Down
25 changes: 25 additions & 0 deletions ammico/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def __init__(
if not isinstance(analyse_text, bool):
raise ValueError("analyse_text needs to be set to true or false")
self.analyse_text = analyse_text
if self.analyse_text:
self._initialize_spacy()
if model_names:
self._check_valid_models(model_names)
if revision_numbers:
Expand Down Expand Up @@ -139,6 +141,14 @@ def set_keys(self) -> dict:
params = {"text": None, "text_language": None, "text_english": None}
return params

def _initialize_spacy(self):
    """Load the Spacy ``en_core_web_md`` pipeline into ``self.nlp``.

    If loading fails for any reason, the model is downloaded through the
    Spacy CLI and the load is attempted a second time.
    """
    model_name = "en_core_web_md"
    try:
        self.nlp = spacy.load(model_name)
    except Exception:
        # First load failed: fetch the model, then retry once. A failure of
        # the second load is not caught here and propagates to the caller.
        spacy.cli.download(model_name)
        self.nlp = spacy.load(model_name)

def analyse_image(self) -> dict:
"""Perform text extraction and analysis of the text.
Expand All @@ -149,6 +159,8 @@ def analyse_image(self) -> dict:
self.translate_text()
self.remove_linebreaks()
if self.analyse_text:
self._run_spacy()
self.clean_text()
self.text_summary()
self.text_sentiment_transformers()
self.text_ner()
Expand Down Expand Up @@ -200,6 +212,19 @@ def remove_linebreaks(self):
"\n", " "
)

def _run_spacy(self):
    """Run the Spacy pipeline on the English text and keep the result.

    Calls ``self.nlp`` on ``self.subdict["text_english"]`` and stores the
    returned object in ``self.doc`` for the later analysis steps.
    """
    pipeline = self.nlp
    self.doc = pipeline(self.subdict["text_english"])

def clean_text(self):
    """Clean the text from unrecognized words and any numbers.

    Keeps the text of every token in ``self.doc`` whose part-of-speech tag
    is not ``"NUM"`` and for which ``has_vector`` is true. The kept tokens
    are joined with single spaces, surrounding whitespace is stripped, and
    the result is stored in ``self.subdict["text_clean"]``.
    """
    # A filtered comprehension replaces the former conditional expression
    # that was evaluated only for its ``append`` side effect.
    kept_tokens = [
        token.text
        for token in self.doc
        if token.pos_ != "NUM" and token.has_vector
    ]
    self.subdict["text_clean"] = " ".join(kept_tokens).strip()

def text_summary(self):
"""Generate a summary of the text using the Transformers pipeline."""
# use the transformers pipeline to summarize the text
Expand Down
4 changes: 2 additions & 2 deletions ammico/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def find_files(
Args:
path (str, optional): The base directory where we are looking for the images. Defaults
to None, which uses the XDG data directory if set or the current
to None, which uses the ammico data directory if set or the current
working directory otherwise.
pattern (str|list, optional): The naming pattern that the filename should match.
Use either '.ext' or just 'ext'
Expand All @@ -122,7 +122,7 @@ def find_files(
"""

if path is None:
path = os.environ.get("XDG_DATA_HOME", ".")
path = os.environ.get("AMMICO_DATA_HOME", ".")

if isinstance(pattern, str):
pattern = [pattern]
Expand Down

0 comments on commit 8eb4fca

Please sign in to comment.