Toufool · Avasam · Jun 16, 2024 · Jan 17, 2024 · Jan 17, 2024 · Jan 17, 2024
diff --git a/README.md b/README.md
@@ -25,6 +25,7 @@ This program can be used to automatically start, split, and reset your preferred
 - Download the [latest version](/../../releases/latest)
 - You can also check out the [latest dev builds](/../../actions/workflows/lint-and-build.yml?query=event%3Apush+is%3Asuccess) (requires a GitHub account)  
   (If you don't have a GitHub account, you can try [nightly.link](https://nightly.link/Toufool/AutoSplit/workflows/lint-and-build/dev))
+- Tesseract-OCR (optional; required for text recognition as an alternative comparison method). See [Tesseract install](#tesseract-install) below for installation instructions.
 
 - Linux users must ensure they are in the `tty` and `input` groups and have write access to `/dev/uinput`. You can run the following commands to do so:
 
@@ -226,6 +227,57 @@ You can have one (and only one) image with the keyword `reset` in its name. Auto
 
 The Start Image is similar to the Reset Image. You can only have one Start Image with the keyword `start_auto_splitter`.You can reload the image using the "`Reload Start Image`" button. The pause time is the amount of seconds AutoSplit will wait before starting comparisons of the first split image. Delay times will be used to delay starting your timer after the threshold is met.
 
+### Text Recognition (OCR)
+
+You can use text recognition as an alternative comparison method.
+
+#### Tesseract install
+
+First, you need to install Tesseract and make sure its install location is included in your system or user `PATH` environment variable.
+- See <https://tesseract-ocr.github.io/tessdoc/Installation.html> for installation instructions on all platforms.
+- For Windows:
+  1. You can go directly to <https://github.com/UB-Mannheim/tesseract/wiki> to find the installer.
+  2. If you change the "Destination Folder" during install, then you'll also need to add it to your `PATH` environment variable.
+
+#### Usage
+
+To use this feature you need to place a text file (.txt) in your splits folder instead of an image file.
+
+An example file name and content could look like this:
+
+Filename: `001_start_auto_splitter.txt`
+
+Content:
+
+```toml
+texts = ["complete any 2 encounters"]
+top_left = 275
+top_right = 540
+bottom_left = 70
+bottom_right = 95
+method = 0
+fps_limit = 1
+```
+
+The `texts` field is an array and can take more than one text to look for:
+
+```toml
+texts = ["look for me", "or this text"]
+```
+
+Note: for now we only use lowercase letters in the comparison. All uppercase letters are converted to lowercase before the comparison.
+
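+The `top_left` and `top_right` options are the left and right edges (X-axis) and the `bottom_left` and `bottom_right` options are the top and bottom edges (Y-axis) of the rectangle where the text you are looking for is expected to appear in the image. `top_right` must be greater than `top_left`, and `bottom_right` must be greater than `bottom_left`.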
+
+Currently there are three comparison methods:
+
+* `0` - uses the Levenshtein distance (the default)
+* `1` - checks if the OCR text contains the searched text
+* `2` - looks for a perfect 1:1 match
+
+Note: This method can cause high CPU usage at the standard comparison FPS. You should therefore limit the comparison FPS to 1 or 2 when you use this method, using the `fps_limit` option (it defaults to 1 when omitted).
+The size of the selected rectangle can also impact the CPU load (bigger = more CPU load).
+
 ### Profiles
 
 <!-- TODO: Profiles are saved under `%appdata%\AutoSplit\profiles` and -->

diff --git a/scripts/requirements.txt b/scripts/requirements.txt
@@ -4,6 +4,7 @@
 #
 # Dependencies:
 git+https://github.com/boppreh/keyboard.git#egg=keyboard  # Fix install on macos and linux-ci https://github.com/boppreh/keyboard/pull/568
+Levenshtein
 numpy>=1.26  # Python 3.12 support
 opencv-python-headless>=4.9.0.80  # Typing fixes
 packaging

diff --git a/src/AutoSplit.py b/src/AutoSplit.py
@@ -307,7 +307,8 @@ def __reload_start_image(self, started_by_button: bool = False, wait_for_delay:
         self.highest_similarity = 0.0
         self.reset_highest_similarity = 0.0
         self.split_below_threshold = False
-        self.timer_start_image.start(int(ONE_SECOND / self.settings_dict["fps_limit"]))
+
+        self.timer_start_image.start(int(ONE_SECOND / self.start_image.get_fps_limit(self)))
 
         QApplication.processEvents()
 
@@ -683,7 +684,7 @@ def __similarity_threshold_loop(self, number_of_split_images: int, dummy_splits_
             QApplication.processEvents()
 
             # Limit the number of time the comparison runs to reduce cpu usage
-            frame_interval = 1 / self.settings_dict["fps_limit"]
+            frame_interval = 1 / self.split_image.get_fps_limit(self)
             # Use a time delta to have a consistant check interval
             wait_delta_ms = int((frame_interval - (time() - start) % frame_interval) * ONE_SECOND)
 
@@ -867,7 +868,11 @@ def __update_split_image(self, specific_image: AutoSplitImage | None = None):
 
         # Get split image
         self.split_image = specific_image or self.split_images_and_loop_number[0 + self.split_image_number][0]
-        if is_valid_image(self.split_image.byte_array):
+        if self.split_image.is_ocr:
+            # TODO: test if setText clears a set image
+            text = "\nor\n".join(self.split_image.texts)
+            self.current_split_image.setText(f"Looking for OCR text:\n{text}")
+        elif is_valid_image(self.split_image.byte_array):
             set_preview_image(self.current_split_image, self.split_image.byte_array)
 
         self.current_image_file_label.setText(self.split_image.filename)

diff --git a/src/AutoSplitImage.py b/src/AutoSplitImage.py
@@ -5,11 +5,12 @@
 
 import cv2
 import numpy as np
+import toml
 from cv2.typing import MatLike
 
 import error_messages
-from compare import check_if_image_has_transparency, get_comparison_method_by_index
-from utils import BGR_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image
+from compare import check_if_image_has_transparency, extract_and_compare_text, get_comparison_method_by_index
+from utils import BGR_CHANNEL_COUNT, MAXBYTE, TESSERACT_PATH, ColorChannel, ImageShape, is_valid_image
 
 if TYPE_CHECKING:
     from AutoSplit import AutoSplit
@@ -40,13 +41,28 @@ class AutoSplitImage:
     image_type: ImageType
     byte_array: MatLike | None = None
     mask: MatLike | None = None
+    texts: list[str] = []
     # This value is internal, check for mask instead
     _has_transparency = False
     # These values should be overriden by some Defaults if None. Use getters instead
     __delay_time: float | None = None
     __comparison_method: int | None = None
     __pause_time: float | None = None
     __similarity_threshold: float | None = None
+    __x: int = 0
+    __xx: int = 0
+    __y: int = 0
+    __yy: int = 0
+    __ocr_comparison_method: int = 0
+    __fps_limit: int = 0
+
+    @property
+    def is_ocr(self):
+        """
+        Whether a "split image" is actually for Optical Text Recognition
+        based on whether there's any text strings to search for.
+        """
+        return bool(self.texts)
 
     def get_delay_time(self, default: "AutoSplit | int"):
         """Get image's delay time or fallback to the default value from spinbox."""
@@ -80,6 +96,12 @@ def get_similarity_threshold(self, default: "AutoSplit | float"):
             return default
         return default.settings_dict["default_similarity_threshold"]
 
+    def get_fps_limit(self, default: "AutoSplit"):
+        """Get image's fps limit or fallback to the default value from spinbox."""
+        if self.__fps_limit != 0:
+            return self.__fps_limit
+        return default.settings_dict["fps_limit"]
+
     def __init__(self, path: str):
         self.path = path
         self.filename = os.path.split(path)[-1].lower()
@@ -89,7 +111,10 @@ def __init__(self, path: str):
         self.__comparison_method = comparison_method_from_filename(self.filename)
         self.__pause_time = pause_from_filename(self.filename)
         self.__similarity_threshold = threshold_from_filename(self.filename)
-        self.__read_image_bytes(path)
+        if path.endswith("txt"):
+            self.__parse_text_file(path)
+        else:
+            self.__read_image_bytes(path)
 
         if START_KEYWORD in self.filename:
             self.image_type = ImageType.START
@@ -98,6 +123,28 @@ def __init__(self, path: str):
         else:
             self.image_type = ImageType.SPLIT
 
+    def __parse_text_file(self, path: str):
+        if not TESSERACT_PATH:
+            error_messages.tesseract_missing(path)
+            return
+
+        with open(path, encoding="utf-8") as f:
+            data = toml.load(f)
+            self.texts = [text.lower().strip() for text in data["texts"]]
+            self.__x = abs(data["top_left"])
+            self.__xx = abs(data["top_right"])
+            self.__y = abs(data["bottom_left"])
+            self.__yy = abs(data["bottom_right"])
+            if "method" in data:
+                self.__ocr_comparison_method = abs(data["method"])
+            self.__fps_limit = 1
+            if "fps_limit" in data:
+                self.__fps_limit = abs(data["fps_limit"])
+
+        if self.__xx <= self.__x or self.__yy <= self.__y:
+            error_messages.wrong_ocr_coordinates(path)
+            return
+
     def __read_image_bytes(self, path: str):
         image = cv2.imread(path, cv2.IMREAD_UNCHANGED)
         if not is_valid_image(image):
@@ -140,8 +187,19 @@ def compare_with_capture(
         default: "AutoSplit | int",
         capture: MatLike | None,
     ):
-        """Compare image with capture using image's comparison method. Falls back to combobox."""
-        if not is_valid_image(self.byte_array) or not is_valid_image(capture):
+        """
+        Compare image with capture using image's comparison method. Falls back to combobox.
+        For OCR text files: extract image text from rectangle position and compare it with the expected string.
+        """
+        if not is_valid_image(capture):
+            return 0.0
+
+        if self.is_ocr:
+            return extract_and_compare_text(
+                capture[self.__y:self.__yy, self.__x:self.__xx], self.texts, self.__ocr_comparison_method,
+            )
+
+        if not is_valid_image(self.byte_array):
             return 0.0
         resized_capture = cv2.resize(capture, self.byte_array.shape[1::-1])
 

diff --git a/src/compare.py b/src/compare.py
@@ -1,17 +1,19 @@
 from math import sqrt
 
 import cv2
+import Levenshtein
 import numpy as np
 from cv2.typing import MatLike
 from scipy import fft
 
-from utils import BGRA_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image
+from utils import BGRA_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image, run_tesseract
 
 MAXRANGE = MAXBYTE + 1
 CHANNELS = [ColorChannel.Red.value, ColorChannel.Green.value, ColorChannel.Blue.value]
 HISTOGRAM_SIZE = [8, 8, 8]
 RANGES = [0, MAXRANGE, 0, MAXRANGE, 0, MAXRANGE]
 MASK_SIZE_MULTIPLIER = ColorChannel.Alpha * MAXBYTE * MAXBYTE
+MAX_VALUE = 1.0
 
 
 def compare_histograms(source: MatLike, capture: MatLike, mask: MatLike | None = None):
@@ -126,10 +128,65 @@ def compare_phash(source: MatLike, capture: MatLike, mask: MatLike | None = None
     return 1 - (hash_diff / 64.0)
 
 
+def extract_and_compare_text(capture: MatLike, texts: list[str], method_index: int):
+    """
+    Compares the extracted text of the given image and returns the similarity between the two texts.
+    The best match of all texts is returned.
+
+    @param capture: Image of any given shape as a numpy array
+    @param texts: a list of strings to match for
+    @param method_index: the comparison method index to use
+    @return: The similarity between the text in the image and the text supplied as a number 0 to 1.
+    """
+    method = get_ocr_comparison_method_by_index(method_index)
+    png = np.array(cv2.imencode(".png", capture)[1]).tobytes()
+    # Especially with stylised characters, OCR could conceivably get the right
+    # letter, but mix up the casing (m/M, o/O, t/T, etc.)
+    image_string = run_tesseract(png).lower().strip()
+
+    ratio = 0.0
+    for text in texts:
+        ratio = max(ratio, method(text, image_string))
+        if ratio == MAX_VALUE:
+            break
+    # TODO: debug: remove me
+    if ratio > 0.9:  # noqa: PLR2004
+        print(f"text from image ({ratio:,.2f}): {image_string}")
+    return ratio
+
+
+def compare_levenshtein(a: str, b: str):
+    """
+    Compare two strings using `Levenshtein.ratio`.
+
+    @param a: The text to look for
+    @param b: The text to compare against (the text extracted from the image)
+    @return: The result of `Levenshtein.ratio(a, b)`, presumably a similarity from 0 to 1 (confirm in the library docs)
+    """
+    return Levenshtein.ratio(a, b)  # pyright: ignore [reportUnknownMemberType]
+
+
+def compare_submatch(a: str, b: str):
+    if a in b:
+        return MAX_VALUE
+    return 0.0
+
+
+def compare_one_to_one(a: str, b: str):
+    if a == b:
+        return MAX_VALUE
+    return 0.0
+
+
 def __compare_dummy(*_: object):
     """Placeholder comparison method: ignores all of its arguments and always returns 0.0."""
     return 0.0
 
 
+def get_ocr_comparison_method_by_index(comparison_method_index: int):
+    match comparison_method_index:
+        case 0:
+            return compare_levenshtein
+        case 1:
+            return compare_submatch
+        case 2:
+            return compare_one_to_one
+        case _:
+            return __compare_dummy
+
+
 def get_comparison_method_by_index(comparison_method_index: int):
     match comparison_method_index:
         case 0:

diff --git a/src/error_messages.py b/src/error_messages.py
@@ -228,3 +228,19 @@ def handle_top_level_exceptions(exception: Exception) -> NoReturn:
     else:
         traceback.print_exception(type(exception), exception, exception.__traceback__)
     sys.exit(1)
+
+
+def tesseract_missing(ocr_split_file_path: str):
+    set_text_message(
+        f"{ocr_split_file_path!r} is an Optical Character Recognition split file but tesseract couldn't be found."
+        + f'\nPlease read <a href="https://github.com/{GITHUB_REPOSITORY}#install-tesseract">'
+        + f"github.com/{GITHUB_REPOSITORY}#install-tesseract</a> for installation instructions.",
+    )
+
+
+def wrong_ocr_coordinates(ocr_split_file_path: str):
+    set_text_message(
+        f"{ocr_split_file_path!r} has invalid coordinates."
+        + "\nPlease make sure that the 'top_right' and 'bottom_right' coordinates are not euqal to or lower then the "
+        + "'top_left' and 'bottom_left' coordinates.",
+    )
diff --git a/src/split_parser.py b/src/split_parser.py
@@ -209,7 +209,7 @@ def parse_and_validate_images(autosplit: "AutoSplit"):
     else:
         for image in split_images:
             # Test for image without transparency
-            if not is_valid_image(image.byte_array):
+            if not image.is_ocr and not is_valid_image(image.byte_array):
                 error_message = partial(error_messages.image_validity, image.filename)
                 break