From c163376f86b24f9c3b29222def88d63e6d922f46 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 2 Dec 2024 13:33:06 +0100 Subject: [PATCH 1/3] manage occurrence of full stops in a better way --- ammico/test/test_text.py | 13 ++++++++++ ammico/text.py | 51 ++++++++++++++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py index 67ae0c76..5ebb00df 100644 --- a/ammico/test/test_text.py +++ b/ammico/test/test_text.py @@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted): tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted) +def test_check_add_space_after_full_stop(accepted): + test_obj = tt.TextDetector({}, accept_privacy=accepted) + test_obj.subdict["text"] = "I like cats. I like dogs." + test_obj._check_add_space_after_full_stop() + assert test_obj.subdict["text"] == "I like cats. I like dogs." + test_obj.subdict["text"] = "I like cats." + test_obj._check_add_space_after_full_stop() + assert test_obj.subdict["text"] == "I like cats." + test_obj.subdict["text"] = "www.icanhascheezburger.com" + test_obj._check_add_space_after_full_stop() + assert test_obj.subdict["text"] == "www. icanhascheezburger. com" + + @pytest.mark.gcv def test_analyse_image(set_testdict, set_environ, accepted): for item in set_testdict: diff --git a/ammico/text.py b/ammico/text.py index 61499022..b3260bb4 100644 --- a/ammico/text.py +++ b/ammico/text.py @@ -4,6 +4,7 @@ import spacy import io import os +import re from ammico.utils import AnalysisMethod import grpc import pandas as pd @@ -225,6 +226,48 @@ def _initialize_spacy(self): spacy.cli.download("en_core_web_md") self.nlp = spacy.load("en_core_web_md") + def _check_add_space_after_full_stop(self): + """Add a space after a full stop. 
Required by googletrans.""" + # we have found text, now we check for full stops + index_stop = [ + i.start() for i in re.finditer("\.", self.subdict["text"]) # noqa + ] + if not index_stop: # no full stops found + return + # check if this includes the last string item + end_of_list = False + if len(self.subdict["text"]) <= (index_stop[-1] + 1): + # the last found full stop is at the end of the string + # but we can include all others + if len(index_stop) == 1: + end_of_list = True + else: + index_stop.pop() + print( + "End of list", + end_of_list, + len(self.subdict["text"]), + index_stop, + index_stop[-1] + 1, + "text", + self.subdict["text"], + ) + if end_of_list: # only one full stop at end of string + return + # if this is not the end of the list, check if there is a space after the full stop + no_space = [i for i in index_stop if self.subdict["text"][i + 1] != " "] + if not no_space: # all full stops have a space after them + return + # else, amend the text + add_one = 1 + for i in no_space: + self.subdict["text"] = ( + self.subdict["text"][: i + add_one] + + " " + + self.subdict["text"][i + add_one :] + ) + add_one += 1 + def analyse_image(self) -> dict: """Perform text extraction and analysis of the text. 
@@ -239,13 +282,7 @@ def analyse_image(self) -> dict: else: # make sure all full stops are followed by whitespace # otherwise googletrans breaks - index_stop = self.subdict["text"].find(".") - if self.subdict["text"][index_stop + 1] != " ": - self.subdict["text"] = ( - self.subdict["text"][: index_stop + 1] - + " " - + self.subdict["text"][index_stop + 1 :] - ) + self._check_add_space_after_full_stop() self.translate_text() self.remove_linebreaks() if self.analyse_text: From 9b1eaa3fac7f58be589dc271bdb8e6bfe4b64765 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 2 Dec 2024 14:30:05 +0100 Subject: [PATCH 2/3] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ff1e8516..cbd55bea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "ammico" -version = "0.2.3" +version = "0.2.4" description = "AI Media and Misinformation Content Analysis Tool" readme = "README.md" maintainers = [ From fd48428d5ce9544cea757999220513424576fb93 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 2 Dec 2024 14:33:24 +0100 Subject: [PATCH 3/3] cleanup --- ammico/text.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/ammico/text.py b/ammico/text.py index b3260bb4..0d020afc 100644 --- a/ammico/text.py +++ b/ammico/text.py @@ -243,15 +243,6 @@ def _check_add_space_after_full_stop(self): end_of_list = True else: index_stop.pop() - print( - "End of list", - end_of_list, - len(self.subdict["text"]), - index_stop, - index_stop[-1] + 1, - "text", - self.subdict["text"], - ) if end_of_list: # only one full stop at end of string return # if this is not the end of the list, check if there is a space after the full stop