From 4f47abb7efc27c575431bd381ff915db672509f8 Mon Sep 17 00:00:00 2001 From: ston1th Date: Sun, 16 Jun 2024 19:29:14 +0200 Subject: [PATCH] implemented text recognition (ocr) (#272) This could be a useful addition in cases where modern game rendering and visual effects (clutter) make it difficult to find good comparison images. It currently depends on pytesseract and Tesseract-OCR but tests with EasyOCR have also been conducted. Both seem to get similarly good recognition results. EasyOCR appears to cause higher CPU load than Tesseract. Tesseract on the other hand is an external dependency that needs to be installed separately. The text comparison of the expected and recognized string has two modes: either the Levenshtein ratio or a partial string match * display text that is searched for * call tesseract ourselves which ditches all python binding libraries to not include Pillow * include PATH variable use --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Avasam --- docs/tutorial.md | 70 +++++++++++ scripts/requirements.txt | 1 + src/AutoSplit.py | 11 +- src/AutoSplitImage.py | 79 ++++++++++-- .../VideoCaptureDeviceCaptureMethod.py | 3 +- src/capture_method/__init__.py | 34 ++---- src/compare.py | 44 ++++++- src/error_messages.py | 16 +++ src/split_parser.py | 2 +- src/utils.py | 112 +++++++++++++++++- 10 files changed, 326 insertions(+), 46 deletions(-) diff --git a/docs/tutorial.md b/docs/tutorial.md index 3167d377..c1722a51 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -174,6 +174,76 @@ You can have one (and only one) image with the keyword `reset` in its name. Auto The Start Image is similar to the Reset Image. You can only have one Start Image with the keyword `start_auto_splitter`.You can reload the image using the "`Reload Start Image`" button. The pause time is the amount of seconds AutoSplit will wait before starting comparisons of the first split image. 
Delay times will be used to delay starting your timer after the threshold is met. +### Text Recognition / Optical Character Recognition (OCR) ⚠️EXPERIMENTAL⚠️ + +You can use text recognition as an alternative comparison method. + +#### Tesseract install + +First you need to install tesseract and include it in your system or user environment variables. + +- See <https://tesseract-ocr.github.io/tessdoc/Installation.html> for installation instructions on all platforms. +- For Windows: + 1. You can go directly to <https://github.com/UB-Mannheim/tesseract/wiki#tesseract-installer-for-windows> to find the installer. + 2. If you change the "Destination Folder" during install, then you'll also need to add it to your `PATH` environment variable. + +#### Usage + +To use this feature you need to place a text file (`.txt`) in your splits folder instead of an image file. + +An example file name and content could look like this: + +Filename: `001_start_auto_splitter.txt` + +Content: + +```toml +texts = ["complete any 2 encounters"] +left = 275 +right = 540 +top = 70 +bottom = 95 +methods = [0] +fps_limit = 1 +``` + +The `texts` field is an array and can take more than one text to look for: + +```toml +texts = ["look for me", "or this text"] +``` + +Note: for now we only use lowercase letters in the comparison. All uppercase letters are converted to lowercase before the comparison. + +The rectangle coordinates where the text you are looking for is expected to appear in the image are configured as follows: + +```toml +left = 275 +right = 540 +top = 70 +bottom = 95 +``` + +If you're used to working in corner coordinates, you can think of `top_left = [left, top]` and `bottom_right = [right, bottom]`. + +Currently there are two comparison methods: + +- `0` - uses the Levenshtein ratio (the default) +- `1` - checks if the OCR text contains the searched text (results in matches of either `0.0` or `1.0`) + +If you only want a perfect full match, use "Levenshtein" with a threshold of `(1.0)` on your file name. 
+ +You can also chain multiple comparison methods using the array notation: + +```toml +methods = [1, 0] +``` + +The methods are then checked in the order you defined and the best match among them wins. + +Note: This method can cause high CPU usage at the standard comparison FPS. You should therefore limit the comparison FPS to 1 or 2 FPS using the `fps_limit` option when you use this method. +The size of the selected rectangle can also impact the CPU load (bigger = more CPU load). + ### Profiles diff --git a/scripts/requirements.txt b/scripts/requirements.txt index e89669ef..93a6a541 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -4,6 +4,7 @@ # # Dependencies: git+https://github.com/boppreh/keyboard.git#egg=keyboard # Fix install on macos and linux-ci https://github.com/boppreh/keyboard/pull/568 +Levenshtein>=0.25 numpy>=1.26 # Python 3.12 support opencv-python-headless>=4.9.0.80 # Typing fixes packaging diff --git a/src/AutoSplit.py b/src/AutoSplit.py index 20bbfab0..db50e5e6 100644 --- a/src/AutoSplit.py +++ b/src/AutoSplit.py @@ -307,7 +307,8 @@ def __reload_start_image(self, *, started_by_button: bool = False, wait_for_dela self.highest_similarity = 0.0 self.reset_highest_similarity = 0.0 self.split_below_threshold = False - self.timer_start_image.start(int(ONE_SECOND / self.settings_dict["fps_limit"])) + + self.timer_start_image.start(int(ONE_SECOND / self.start_image.get_fps_limit(self))) QApplication.processEvents() @@ -687,7 +688,7 @@ def __similarity_threshold_loop(self, number_of_split_images: int, dummy_splits_ QApplication.processEvents() # Limit the number of time the comparison runs to reduce cpu usage - frame_interval = 1 / self.settings_dict["fps_limit"] + frame_interval = 1 / self.split_image.get_fps_limit(self) # Use a time delta to have a consistant check interval wait_delta_ms = int((frame_interval - (time() - start) % frame_interval) * ONE_SECOND) @@ -870,7 +871,11 @@ def __update_split_image(self, specific_image: 
AutoSplitImage | None = None): # Get split image self.split_image = specific_image or self.split_images_and_loop_number[0 + self.split_image_number][0] - if is_valid_image(self.split_image.byte_array): + if self.split_image.is_ocr: + # TODO: test if setText clears a set image + text = "\nor\n".join(self.split_image.texts) + self.current_split_image.setText(f"Looking for OCR text:\n{text}") + elif is_valid_image(self.split_image.byte_array): set_preview_image(self.current_split_image, self.split_image.byte_array) self.current_image_file_label.setText(self.split_image.filename) diff --git a/src/AutoSplitImage.py b/src/AutoSplitImage.py index d4176560..052edd1e 100644 --- a/src/AutoSplitImage.py +++ b/src/AutoSplitImage.py @@ -5,11 +5,12 @@ import cv2 import numpy as np +import toml from cv2.typing import MatLike import error_messages -from compare import check_if_image_has_transparency, get_comparison_method_by_index -from utils import BGR_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image +from compare import check_if_image_has_transparency, extract_and_compare_text, get_comparison_method_by_index +from utils import BGR_CHANNEL_COUNT, MAXBYTE, TESSERACT_PATH, ColorChannel, ImageShape, is_valid_image if TYPE_CHECKING: from AutoSplit import AutoSplit @@ -33,20 +34,26 @@ class ImageType(IntEnum): class AutoSplitImage: - path: str - filename: str - flags: int - loops: int image_type: ImageType byte_array: MatLike | None = None mask: MatLike | None = None # This value is internal, check for mask instead _has_transparency = False - # These values should be overriden by some Defaults if None. Use getters instead + # These values should be overridden by some Defaults if None. 
Use getters instead __delay_time: float | None = None __comparison_method: int | None = None __pause_time: float | None = None __similarity_threshold: float | None = None + __rect = (0, 0, 1, 1) + __fps_limit = 0 + + @property + def is_ocr(self): + """ + Whether a "split image" is actually for Optical Text Recognition + based on whether there's any text strings to search for. + """ + return bool(self.texts) def get_delay_time(self, default: "AutoSplit | int"): """Get image's delay time or fallback to the default value from spinbox.""" @@ -80,6 +87,12 @@ def get_similarity_threshold(self, default: "AutoSplit | float"): return default return default.settings_dict["default_similarity_threshold"] + def get_fps_limit(self, default: "AutoSplit"): + """Get image's fps limit or fallback to the default value from spinbox.""" + if self.__fps_limit != 0: + return self.__fps_limit + return default.settings_dict["fps_limit"] + def __init__(self, path: str): self.path = path self.filename = os.path.split(path)[-1].lower() @@ -89,7 +102,12 @@ def __init__(self, path: str): self.__comparison_method = comparison_method_from_filename(self.filename) self.__pause_time = pause_from_filename(self.filename) self.__similarity_threshold = threshold_from_filename(self.filename) - self.__read_image_bytes(path) + self.texts: list[str] = [] + self. 
__ocr_comparison_methods: list[int] = [] + if path.endswith("txt"): + self.__parse_text_file(path) + else: + self.__read_image_bytes(path) if START_KEYWORD in self.filename: self.image_type = ImageType.START @@ -98,6 +116,31 @@ def __init__(self, path: str): else: self.image_type = ImageType.SPLIT + def __parse_text_file(self, path: str): + if not TESSERACT_PATH: + error_messages.tesseract_missing(path) + return + + with open(path, encoding="utf-8") as f: + data = toml.load(f) + + self.texts = [text.lower().strip() for text in data["texts"]] + self.__rect = (data["left"], data["right"], data["top"], data["bottom"]) + self.__ocr_comparison_methods = data.get("methods", [0]) + self.__fps_limit = data.get("fps_limit", 0) + + if self.__validate_ocr(): + error_messages.wrong_ocr_values(path) + return + + def __validate_ocr(self): + values = [*self.__rect, *self.__ocr_comparison_methods, self.__fps_limit] + return ( + all(value >= 0 for value in values) # Check for invalid negative values + and self.__rect[1] > self.__rect[0] + and self.__rect[3] > self.__rect[2] + ) + def __read_image_bytes(self, path: str): image = cv2.imread(path, cv2.IMREAD_UNCHANGED) if not is_valid_image(image): @@ -140,8 +183,24 @@ def compare_with_capture( default: "AutoSplit | int", capture: MatLike | None, ): - """Compare image with capture using image's comparison method. Falls back to combobox.""" - if not is_valid_image(self.byte_array) or not is_valid_image(capture): + """ + Compare image with capture using image's comparison method. Falls back to combobox. + For OCR text files: extract image text from rectangle position and compare it with the expected string. 
+ """ + if not is_valid_image(capture): + return 0.0 + + if self.is_ocr: + return extract_and_compare_text( + capture[ + self.__rect[2]:self.__rect[3], + self.__rect[0]:self.__rect[1], + ], + self.texts, + self.__ocr_comparison_methods, + ) + + if not is_valid_image(self.byte_array): return 0.0 resized_capture = cv2.resize(capture, self.byte_array.shape[1::-1]) diff --git a/src/capture_method/VideoCaptureDeviceCaptureMethod.py b/src/capture_method/VideoCaptureDeviceCaptureMethod.py index 29606f95..e87a19ca 100644 --- a/src/capture_method/VideoCaptureDeviceCaptureMethod.py +++ b/src/capture_method/VideoCaptureDeviceCaptureMethod.py @@ -7,10 +7,9 @@ from cv2.typing import MatLike from typing_extensions import override -from capture_method import get_input_device_resolution from capture_method.CaptureMethodBase import CaptureMethodBase from error_messages import CREATE_NEW_ISSUE_MESSAGE, exception_traceback -from utils import ImageShape, is_valid_image +from utils import ImageShape, get_input_device_resolution, is_valid_image if TYPE_CHECKING: from AutoSplit import AutoSplit diff --git a/src/capture_method/__init__.py b/src/capture_method/__init__.py index e099d444..9e5ea0b5 100644 --- a/src/capture_method/__init__.py +++ b/src/capture_method/__init__.py @@ -10,10 +10,10 @@ from capture_method.CaptureMethodBase import CaptureMethodBase from capture_method.VideoCaptureDeviceCaptureMethod import VideoCaptureDeviceCaptureMethod -from utils import WGC_MIN_BUILD, WINDOWS_BUILD_NUMBER, first, try_get_direct3d_device +from utils import WGC_MIN_BUILD, WINDOWS_BUILD_NUMBER, first, get_input_device_resolution, try_get_direct3d_device if sys.platform == "win32": - from _ctypes import COMError # noqa: PLC2701 + from _ctypes import COMError # noqa: PLC2701 # comtypes is untyped from pygrabber.dshow_graph import FilterGraph @@ -76,7 +76,12 @@ def __hash__(self): @override @staticmethod - def _generate_next_value_(name: str, start: int, count: int, last_values: list["str | 
CaptureMethodEnum"]): + def _generate_next_value_( + name: "str | CaptureMethodEnum", + start: int, + count: int, + last_values: list["str | CaptureMethodEnum"], + ): return name NONE = "" @@ -200,29 +205,6 @@ def get_input_devices(): return cameras -def get_input_device_resolution(index: int) -> tuple[int, int] | None: - if sys.platform != "win32": - return (0, 0) - filter_graph = FilterGraph() - try: - filter_graph.add_video_input_device(index) - # This can happen with virtual cameras throwing errors. - # For example since OBS 29.1 updated FFMPEG breaking VirtualCam 3.0 - # https://github.com/Toufool/AutoSplit/issues/238 - except COMError: - return None - - try: - resolution = filter_graph.get_input_device().get_current_format() - # For unknown reasons, some devices can raise "ValueError: NULL pointer access". - # For instance, Oh_DeeR's AVerMedia HD Capture C985 Bus 12 - except ValueError: - return None - finally: - filter_graph.remove_filters() - return resolution - - def get_all_video_capture_devices(): named_video_inputs = get_input_devices() diff --git a/src/compare.py b/src/compare.py index f93788da..fefc2056 100644 --- a/src/compare.py +++ b/src/compare.py @@ -1,17 +1,20 @@ +from collections.abc import Iterable from math import sqrt import cv2 +import Levenshtein import numpy as np from cv2.typing import MatLike from scipy import fft -from utils import BGRA_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image +from utils import BGRA_CHANNEL_COUNT, MAXBYTE, ColorChannel, ImageShape, is_valid_image, run_tesseract MAXRANGE = MAXBYTE + 1 CHANNELS = (ColorChannel.Red.value, ColorChannel.Green.value, ColorChannel.Blue.value) HISTOGRAM_SIZE = (8, 8, 8) RANGES = (0, MAXRANGE, 0, MAXRANGE, 0, MAXRANGE) MASK_SIZE_MULTIPLIER = ColorChannel.Alpha * MAXBYTE * MAXBYTE +MAX_VALUE = 1.0 def compare_histograms(source: MatLike, capture: MatLike, mask: MatLike | None = None): @@ -126,10 +129,49 @@ def compare_phash(source: MatLike, capture: MatLike, mask: MatLike 
| None = None return 1 - (hash_diff / 64.0) +def extract_and_compare_text(capture: MatLike, texts: Iterable[str], methods_index: Iterable[int]): + """ + Compares the extracted text of the given image and returns the similarity between the two texts. + The best match of all texts and methods is returned. + + @param capture: Image of any given shape as a numpy array + @param texts: a list of strings to match for + @param methods_index: a list of comparison methods to use in order + @return: The similarity between the text in the image and the text supplied as a number 0 to 1. + """ + methods = [get_ocr_comparison_method_by_index(i) for i in methods_index] + png = np.array(cv2.imencode(".png", capture)[1]).tobytes() + # Especially with stylised characters, OCR could conceivably get the right + # letter, but mix up the casing (m/M, o/O, t/T, etc.) + image_string = run_tesseract(png).lower().strip() + + ratio = 0.0 + for text in texts: + for method in methods: + ratio = max(ratio, method(text, image_string)) + if ratio == MAX_VALUE: + return ratio # we found the best match; try to return early + return ratio + + +def compare_submatch(a: str, b: str): + return float(a in b) + + def __compare_dummy(*_: object): return 0.0 +def get_ocr_comparison_method_by_index(comparison_method_index: int): + match comparison_method_index: + case 0: + return Levenshtein.ratio + case 1: + return compare_submatch + case _: + return __compare_dummy + + def get_comparison_method_by_index(comparison_method_index: int): match comparison_method_index: case 0: diff --git a/src/error_messages.py b/src/error_messages.py index 218cb3b7..ceb79df8 100644 --- a/src/error_messages.py +++ b/src/error_messages.py @@ -228,3 +228,19 @@ def handle_top_level_exceptions(exception: Exception) -> NoReturn: else: traceback.print_exception(type(exception), exception, exception.__traceback__) sys.exit(1) + + +def tesseract_missing(ocr_split_file_path: str): + set_text_message( + f"{ocr_split_file_path!r} is an 
Optical Character Recognition split file but tesseract couldn't be found." + + f'\nPlease read ' + + f"github.com/{GITHUB_REPOSITORY}#install-tesseract for installation instructions.", + ) + + +def wrong_ocr_values(ocr_split_file_path: str): + set_text_message( + f"{ocr_split_file_path!r} has invalid values." + + "\nPlease make sure that `left < right` and `top < bottom`. " + + "Also check for negative values in the 'methods' or 'fps_limit' settings", + ) diff --git a/src/split_parser.py b/src/split_parser.py index 374d0a25..b49fa99b 100644 --- a/src/split_parser.py +++ b/src/split_parser.py @@ -225,7 +225,7 @@ def parse_and_validate_images(autosplit: "AutoSplit"): else: for image in split_images: # Test for image without transparency - if not is_valid_image(image.byte_array): + if not image.is_ocr and not is_valid_image(image.byte_array): error_message = partial(error_messages.image_validity, image.filename) break diff --git a/src/utils.py b/src/utils.py index f8b492ac..50d3f47f 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,5 +1,7 @@ import asyncio import os +import shutil +import subprocess # noqa: S404 import sys from collections.abc import Callable, Iterable from enum import IntEnum @@ -7,7 +9,7 @@ from itertools import chain from platform import version from threading import Thread -from typing import TYPE_CHECKING, Any, TypeGuard, TypeVar +from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, TypeGuard, TypeVar from cv2.typing import MatLike @@ -16,12 +18,18 @@ if sys.platform == "win32": import ctypes import ctypes.wintypes + from _ctypes import COMError # noqa: PLC2701 # comtypes is untyped import win32gui import win32ui + from pygrabber.dshow_graph import FilterGraph from winsdk.windows.ai.machinelearning import LearningModelDevice, LearningModelDeviceKind from winsdk.windows.media.capture import MediaCapture + STARTUPINFO: TypeAlias = subprocess.STARTUPINFO +else: + STARTUPINFO: TypeAlias = None + if sys.platform == "linux": import fcntl @@ 
-37,6 +45,19 @@ T = TypeVar("T") + +def find_tesseract_path(): + search_path = os.environ.get("PATH", os.defpath) + if sys.platform == "win32": + search_path += r";C:\Program Files\Tesseract-OCR;C:\Program Files (x86)\Tesseract-OCR" + return shutil.which(TESSERACT_EXE, path=search_path) + + +TESSERACT_EXE = "tesseract" +TESSERACT_PATH = find_tesseract_path() +"""The path to execute tesseract. `None` if it can't be found.""" +TESSERACT_CMD = (TESSERACT_PATH or TESSERACT_EXE, "-", "-", "--oem", "1", "--psm", "6") + DWMWA_EXTENDED_FRAME_BOUNDS = 9 MAXBYTE = 255 ONE_SECOND = 1000 @@ -60,6 +81,14 @@ class ColorChannel(IntEnum): Alpha = 3 +class SubprocessKWArgs(TypedDict): + stdin: int + stdout: int + stderr: int + startupinfo: STARTUPINFO | None + env: os._Environ[str] | None # pyright: ignore[reportPrivateUsage] + + def decimal(value: float): # Using ljust instead of :2f because of python float rounding errors return f"{int(value * 100) / 100}".ljust(4, "0") @@ -122,12 +151,34 @@ def get_window_bounds(hwnd: int) -> tuple[int, int, int, int]: return window_left_bounds, window_top_bounds, window_width, window_height +# Note: maybe reorganize capture_method module to have different helper modules and a methods submodule +def get_input_device_resolution(index: int) -> tuple[int, int] | None: + if sys.platform != "win32": + return (0, 0) + filter_graph = FilterGraph() + try: + filter_graph.add_video_input_device(index) + # This can happen with virtual cameras throwing errors. + # For example since OBS 29.1 updated FFMPEG breaking VirtualCam 3.0 + # https://github.com/Toufool/AutoSplit/issues/238 + except COMError: + return None + + try: + resolution = filter_graph.get_input_device().get_current_format() + # For unknown reasons, some devices can raise "ValueError: NULL pointer access". 
+ # For instance, Oh_DeeR's AVerMedia HD Capture C985 Bus 12 + except ValueError: + return None + finally: + filter_graph.remove_filters() + return resolution + + def open_file(file_path: str | bytes | os.PathLike[str] | os.PathLike[bytes]): if sys.platform == "win32": os.startfile(file_path) # noqa: S606 else: - import subprocess # noqa: PLC0415, S404 - opener = "xdg-open" if sys.platform == "linux" else "open" subprocess.call([opener, file_path]) # noqa: S603 @@ -209,6 +260,61 @@ def flatten(nested_iterable: Iterable[Iterable[T]]) -> chain[T]: return chain.from_iterable(nested_iterable) +def subprocess_kwargs(): + """ + Create a set of arguments which make a ``subprocess.Popen`` (and + variants) call work with or without Pyinstaller, ``--noconsole`` or + not, on Windows and Linux. + + Typical use: + ```python + subprocess.call(['program_to_run', 'arg_1'], **subprocess_args()) + ``` + --- + Originally found in https://github.com/madmaze/pytesseract/blob/master/pytesseract/pytesseract.py + Recipe from https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess + which itself is taken from https://github.com/bjones1/enki/blob/master/enki/lib/get_console_output.py + """ + # The following is true only on Windows. + if sys.platform == "win32": + # On Windows, subprocess calls will pop up a command window by default when run from + # Pyinstaller with the ``--noconsole`` option. Avoid this distraction. + startupinfo = STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + # https://github.com/madmaze/pytesseract/blob/88839f03590578a10e806a5244704437c9d477da/pytesseract/pytesseract.py#L236 + startupinfo.wShowWindow = subprocess.SW_HIDE + # Windows doesn't search the path by default. Pass it an environment so it will. 
+ env = os.environ + else: + startupinfo = None + env = None + # On Windows, running this from the binary produced by Pyinstaller + # with the ``--noconsole`` option requires redirecting everything + # (stdin, stdout, stderr) to avoid an OSError exception + # "[Error 6] the handle is invalid." + return SubprocessKWArgs( + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + startupinfo=startupinfo, + env=env, + ) + + +def run_tesseract(png: bytes): + """ + Executes the tesseract CLI and pipes a PNG encoded image to it. + @param png: PNG encoded image as byte array + @return: The recognized output string from tesseract. + """ + return ( + subprocess + .Popen(TESSERACT_CMD, **subprocess_kwargs()) # noqa: S603 # Only using known literal strings + .communicate(input=png)[0] + .decode() + ) + + # Environment specifics WINDOWS_BUILD_NUMBER = int(version().split(".")[-1]) if sys.platform == "win32" else -1 FIRST_WIN_11_BUILD = 22000