put clean text back in, correct default image data path (#145)

ssciwr · Aug 31, 2023 · 8eb4fca · 8eb4fca
1 parent e120c10
commit 8eb4fca
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 2 deletions.
diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py
@@ -1,5 +1,6 @@
 import pytest
 import ammico.text as tt
+import spacy
 
 
 @pytest.fixture
@@ -30,6 +31,26 @@ def test_TextDetector(set_testdict):
         assert not test_obj.analyse_text
 
 
+def test_run_spacy(set_testdict, get_path):
+    """_run_spacy turns the reference English text into a spaCy Doc."""
+    detector = tt.TextDetector(set_testdict["IMG_3755"], analyse_text=True)
+    # explicit encoding: the platform default is not UTF-8 everywhere
+    with open(get_path + "text_IMG_3755.txt", encoding="utf-8") as file:
+        detector.subdict["text_english"] = file.read()
+    detector._run_spacy()
+    assert isinstance(detector.doc, spacy.tokens.doc.Doc)
+
+
+def test_clean_text(set_testdict):
+    """clean_text keeps the regular words and drops the made-up "fjejg"."""
+    detector = tt.TextDetector(set_testdict["IMG_3755"])
+    # assign the Doc by hand so the test does not depend on _run_spacy
+    detector.doc = spacy.load("en_core_web_md")("I like cats and fjejg")
+    detector.clean_text()
+    expected = "I like cats and"
+    assert detector.subdict["text_clean"] == expected
+
+
 def test_init_revision_numbers_and_models():
     test_obj = tt.TextDetector({})
     # check the default options

diff --git a/ammico/text.py b/ammico/text.py
@@ -45,6 +45,8 @@ def __init__(
         if not isinstance(analyse_text, bool):
             raise ValueError("analyse_text needs to be set to true or false")
         self.analyse_text = analyse_text
+        if self.analyse_text:
+            self._initialize_spacy()
         if model_names:
             self._check_valid_models(model_names)
         if revision_numbers:
@@ -139,6 +141,14 @@ def set_keys(self) -> dict:
         params = {"text": None, "text_language": None, "text_english": None}
         return params
 
+    def _initialize_spacy(self):
+        """Load the en_core_web_md pipeline into self.nlp, downloading it if missing."""
+        try:
+            self.nlp = spacy.load("en_core_web_md")
+        except OSError:  # spacy.load raises OSError when the model is not installed
+            spacy.cli.download("en_core_web_md")
+            self.nlp = spacy.load("en_core_web_md")
+
     def analyse_image(self) -> dict:
         """Perform text extraction and analysis of the text.
 
@@ -149,6 +159,8 @@ def analyse_image(self) -> dict:
         self.translate_text()
         self.remove_linebreaks()
         if self.analyse_text:
+            self._run_spacy()
+            self.clean_text()
             self.text_summary()
             self.text_sentiment_transformers()
             self.text_ner()
@@ -200,6 +212,19 @@ def remove_linebreaks(self):
                 "\n", " "
             )
 
+    def _run_spacy(self):
+        """Run spaCy on "text_english" and store the resulting Doc in self.doc."""
+        self.doc = self.nlp(self.subdict["text_english"])
+
+    def clean_text(self):
+        """Drop NUM-tagged and vector-less tokens; store the rest in "text_clean"."""
+        # Tokens without a word vector are treated as unrecognized words, so
+        # they are filtered out together with the numbers.
+        kept_words = [
+            tok.text for tok in self.doc if tok.pos_ != "NUM" and tok.has_vector
+        ]
+        self.subdict["text_clean"] = " ".join(kept_words).strip()
+
     def text_summary(self):
         """Generate a summary of the text using the Transformers pipeline."""
         # use the transformers pipeline to summarize the text

diff --git a/ammico/utils.py b/ammico/utils.py
@@ -105,7 +105,7 @@ def find_files(
 
     Args:
         path (str, optional): The base directory where we are looking for the images. Defaults
-            to None, which uses the XDG data directory if set or the current
+            to None, which uses the ammico data directory if set or the current
             working directory otherwise.
         pattern (str|list, optional): The naming pattern that the filename should match.
                 Use either '.ext' or just 'ext'
@@ -122,7 +122,7 @@ def find_files(
     """
 
     if path is None:
-        path = os.environ.get("XDG_DATA_HOME", ".")
+        path = os.environ.get("AMMICO_DATA_HOME", ".")
 
     if isinstance(pattern, str):
         pattern = [pattern]