diff --git a/README.md b/README.md index 56e8007..8240839 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,33 @@ -Attempting to read a portion of the screen and plot its data real time. +Attempting to read portions of a screen or a video and exporting its data in a CSV file. -> Blazingly slow, but it kinda works +> *Blazingly slow, but it kinda works* + +**Contents:** + +1. [Features](#features) +2. [Install](#install) +3. [Run](#run) +4. [Notes](#notes) +5. [Dependencies](#dependencies) ## Features - Real time recognitions of numbers of the screen. - Offline number recognition in a video file. - Works on easily configurable areas, as many as one wants. -- Easy to integrate new OCR methods (see below TODO) +- Integrated Tesseract OCR and EasyOCR. +- Easy to add new OCR methods (see `src/ocr.py`). +- Dead simple to use! ## Install +This has been mainly developed and tested on Ubuntu 22.04, with Python 3.10. + ```bash -# Install Python version (if necessary) +# Install Python version >= 3.10 (necessary on lower Ubuntu versions) sudo add-apt-repository ppa:deadsnakes/ppa sudo apt update sudo apt install python3.13 python3.13-venv python3.13-tk @@ -48,22 +60,31 @@ python gui.py ## Notes +- Known OCR issues: + - Characters confusion, depending on the font: `0` and `8`, `1` and `7`, `5` and `9`. + - Missing dot (*e.g.* `42.42` turned into `4242`). +- Tips to improve OCR reliability: + - Upscale the detected area, to get a better characters resolution. + - Use min and max bounds to filter outliers out. + - Don't trust the OCR output too much. Potentially implement post-filtering based on knowledge of the recorded data. For example if measuring a variable that can only evolve slowly, big jumps in the output value can be marked as outliers and discarded. - When processing a video, enabling the preview can induce up to 20% overhead. -- EasyOCR requires PyTorch and Scipy, so isn't lightweight. 
The first time the program is started, it will download necessary model weights (stored in `~/.EasyOCR/model`). See more details on the EasyOCR GitHub ([link](https://github.com/JaidedAI/EasyOCR)). +- EasyOCR requires PyTorch and SciPy, so isn't lightweight. The first time the program is started, it will download necessary model weights (stored in `~/.EasyOCR/model`). See more details on the EasyOCR GitHub ([link](https://github.com/JaidedAI/EasyOCR)). With this application, it seems that EasyOCR is slower than Tesseract. ## Dependencies -- https://github.com/opencv/opencv-python -- https://github.com/tomschimansky/customtkinter -- https://github.com/tesseract-ocr/tesseract -- https://github.com/sirfz/tesserocr -- https://github.com/JaidedAI/EasyOCR +This work is merely a wrapper and a graphical interface for some already existing OCR implementations. It heavily uses Tkinter and CustomTkinter for the interface. + +- [OpenCV](https://github.com/opencv/opencv-python) (MIT license): image processing. +- [CustomTkinter](https://github.com/tomschimansky/customtkinter) (MIT license): beautiful interface and GUI. +- [Tesseract](https://github.com/tesseract-ocr/tesseract) (Apache 2.0): OCR API. +- [Tesserocr](https://github.com/sirfz/tesserocr) (MIT license): Python wrapper for Tesseract. +- [EasyOCR](https://github.com/JaidedAI/EasyOCR) (Apache 2.0): another OCR API. 
## TODO - [x] Loading and processing videos - [x] Saving/loading configuration - [x] Multi threading, for less blazing slowness -- [ ] Make it easy to add new OCR methods, and documenting it +- [ ] Make it easier to add new OCR methods, and documenting it - [ ] Logging not only in the Python terminal, but also in the logging text box - [ ] Real time graphing diff --git a/gui.py b/gui.py index a7ad9a2..c728d7b 100644 --- a/gui.py +++ b/gui.py @@ -1,5 +1,10 @@ +#!/usr/bin/env python3 """ -TODO +Main entry point for the application + +Author: CorentinChauvin +Year: 2024 +License: Apache 2.0 """ from src.ocr import BaseOcrEngine, OcrMethod @@ -44,8 +49,8 @@ def __init__(self): self.title("DemOCRatos - OCR for the people") # self.geometry(f"{1100}x{580}") self.iconphoto(True, ImageTk.PhotoImage(file="assets/logo_low.png")) - self.bind("", lambda _: sys.exit()) # FIXME: for development only - self.bind("q", lambda _: sys.exit()) # FIXME: for development only + # self.bind("", lambda _: sys.exit()) # FIXME: for development only + # self.bind("q", lambda _: sys.exit()) # FIXME: for development only self.protocol("WM_DELETE_WINDOW", self._on_closing_cb) self._rect_selec_window = None # reference to the window to select the capture zone @@ -85,12 +90,12 @@ def __init__(self): self.grid_rowconfigure(0, weight=0) # Set default values and statuses - self._stop_btn.configure(state="disabled") # TODO: change state dynamically + self._stop_btn.configure(state="disabled") self._status_txt.configure(text="10:03 (12 FPS)") self._status_txt.configure( fg_color="green", text_color="white" - ) # TODO: change the colour depending on the status + ) self._fps_settings_menu.set("10") self._ocr_settings_menu.set("Tesseract") @@ -261,11 +266,23 @@ def __load_settings_cb(): with open(path) as file: try: config = json.load(file) - except (UnicodeDecodeError, json.decoder.JSONDecodeError): + except (UnicodeDecodeError, json.decoder.JSONDecodeError) as e: print("ERROR: coudn't parse JSON config") 
+ print(e) return - self._captures.load_config(config) + try: + new_captures = Captures(self._output_frame) + new_captures.load_config(config["captures"]) + self._fps_settings_menu.set(config["fps"]) + self._ocr_settings_menu.set(config["ocr_method"]) + self._max_threads_entry.set_value(config["max_threads"]) + except KeyError as e: + print("ERROR: couldn't parse JSON config") + print(e) + return + + self._captures = new_captures self._data_recorder.reset_fields(self._captures.get_names()) self._selected_capture = self._captures.get_first() self._update_capture_options() @@ -282,7 +299,11 @@ def __save_settings_cb(): return with open(path, "w") as file: - config = self._captures.get_config() + config = {} + config["captures"] = self._captures.get_config() + config["fps"] = self._fps_settings_menu.get() + config["ocr_method"] = self._ocr_settings_menu.get() + config["max_threads"] = self._max_threads_entry.get_value() json.dump(config, file) print(config) @@ -720,13 +741,15 @@ def _update_capture_options(self, selected: None | str = None): def __update_entry_text(entry: ctk.CTkEntry, text): entry.delete(0, tk.END) - entry.insert(0, text) + entry.insert(0, text if text is not None else "") self._selected_capture.toggle_edit(False) __update_entry_text(self._rect_xmin_entry, self._selected_capture.x_min) __update_entry_text(self._rect_xmax_entry, self._selected_capture.x_max) __update_entry_text(self._rect_ymin_entry, self._selected_capture.y_min) __update_entry_text(self._rect_ymax_entry, self._selected_capture.y_max) + __update_entry_text(self._min_entry, self._selected_capture.min_value) + __update_entry_text(self._max_entry, self._selected_capture.max_value) self._selected_capture.toggle_edit(True) self._pre_process_config_frame.update_elements( diff --git a/main.py b/main.py deleted file mode 100644 index fa0fe5f..0000000 --- a/main.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python3 -""" - Run OCR on a motor video to extract useful information -""" - 
-import pytesseract -from cv2.typing import MatLike -import cv2 -import numpy as np -import csv - - -class DataConfig: - def __init__( - self, min_value: float, max_value: float, slice_x: slice, slice_y: slice - ): - """ - Args: - - min_value: Minimum value the data field can take - - max_value: Maximum value the data field can take - - position_mask: Pixel location of the data field - """ - self.min_value = min_value - self.max_value = max_value - self.slice_x = slice_x - self.slice_y = slice_y - - -# ========================================================= -# CONFIGURATION -# -video_path = "resources/long.mp4" -image_path = "resources/frame_raw.png" -output_file = "output/output.csv" -max_cnt = 100000 - -data_config = { - "temperature": DataConfig(20.0, 100.0, slice(295, 311), slice(749, 770)), - "current": DataConfig(0.0, 20.0, slice(295, 311), slice(576, 620)), - "speed": DataConfig(-10.0, 1500, slice(295, 311), slice(402, 450)), -} - - -# ========================================================= -# OCR -# -def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0): - """ - Return a sharpened version of the image, using an unsharp mask - """ - blurred = cv2.GaussianBlur(image, kernel_size, sigma) - sharpened = float(amount + 1) * image - float(amount) * blurred - sharpened = np.maximum(sharpened, np.zeros(sharpened.shape)) - sharpened = np.minimum(sharpened, 255 * np.ones(sharpened.shape)) - sharpened = sharpened.round().astype(np.uint8) - - if threshold > 0: - low_contrast_mask = np.absolute(image - blurred) < threshold - np.copyto(sharpened, image, where=low_contrast_mask) - - return sharpened - - -def preprocess_frame(frame: MatLike, slice_x: slice, slice_y: slice) -> MatLike: - """ - Crops a frame and preprocesses it ahead of OCR (upscaling, sharpening and - thresholding) - """ - cropped = frame[slice_x, slice_y] - shape = np.shape(cropped) - cropped = cv2.resize( - cropped, (shape[1] * 2, shape[0] * 2), interpolation=cv2.INTER_LINEAR 
- ) - - cropped = unsharp_mask(cropped, kernel_size=(5, 5), amount=2, sigma=1) - cropped = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) - cropped = cv2.threshold( - cropped, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU - )[1] - - return cropped - - -def ocr(image: MatLike) -> str: - """ - Reads the portion of image, and returns the raw output - """ - img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - output = pytesseract.image_to_string( - img_rgb, config="--dpi 20 --psm 13 -c tessedit_char_whitelist=-0123456789." - ) - - return output.strip() - - -# ========================================================= -# POST PROCESSING -# -def filter(raw_string: str, min_value: float, max_value: float) -> float | None: - """ - Tries to convert a string into a float, and returns only if the resulting value - is within some given bounds. - """ - try: - value = float(raw_string) - except ValueError: - return None - - if value < min_value or value > max_value: - return None - - return value - - -# ========================================================= -# FILE PROCESSING -# -def run_video(): - cap = cv2.VideoCapture(video_path) - cnt = 0 - outputs = {key: [] for key in data_config} - outputs["t"] = [] - - while cap.isOpened(): - # Read video frame - ret, frame = cap.read() - - cnt += 1 - if cnt % 25 != 0: - continue - if cnt >= max_cnt: - break - - if not ret: - print("Can't receive frame (stream end?). 
Exiting...") - break - - print(cnt) - - # Perform OCR on the various fields - for key in data_config: - config = data_config[key] - cropped = preprocess_frame(frame, config.slice_x, config.slice_y) - raw_output = ocr(cropped) - outputs[key].append(filter(raw_output, config.min_value, config.max_value)) - - outputs["t"].append(cnt) - - # Display the information nicely - for key in data_config: - config = data_config[key] - cv2.rectangle( - frame, - (config.slice_y.start, config.slice_x.start), - (config.slice_y.stop - 1, config.slice_x.stop - 1), - (0, 255, 0), - 1, - ) - font = cv2.FONT_HERSHEY_SIMPLEX - value = str(outputs[key][-1]) - - if outputs[key][-1] is None: - colour = (255, 0, 0) - value = "None" - else: - colour = (0, 255, 0) - - position = [config.slice_y.start, config.slice_x.start - 8] - cv2.putText(frame, value, position, font, 0.5, colour, 1, cv2.LINE_AA) - - cv2.imshow("frame", frame) - - if cv2.waitKey(1) == ord("q"): - break - - cap.release() - cv2.destroyAllWindows() - - return outputs - - -def run_image(): - img = cv2.imread(image_path) - img = img[295:311, 748:770] - # img = img[287:311, 661:760] - # img = img[287:311, 661:826] - - ocr(img) - - cv2.imshow(image_path, img) - cv2.waitKey(0) - - -# ========================================================= -# RESULTS PROCESSING -# - -def export_outputs(outputs: dict): - keys = list(outputs.keys()) - del keys[keys.index("t")] - keys.insert(0, "t") - - with open(output_file, "w", newline="") as csv_file: - writer = csv.DictWriter( - csv_file, fieldnames=keys, quotechar='"', quoting=csv.QUOTE_ALL - ) - writer.writeheader() - - for k in range(len(outputs["t"])): - writer.writerow({key: outputs[key][k] for key in keys}) - - -if __name__ == "__main__": - outputs = run_video() - export_outputs(outputs) - - print(outputs) diff --git a/src/capture.py b/src/capture.py index 37979a3..398243d 100644 --- a/src/capture.py +++ b/src/capture.py @@ -1,5 +1,9 @@ """ Class storing configuration data about a capture + 
+Author: CorentinChauvin +Year: 2024 +License: Apache 2.0 """ from src.gui_elements import TkImage2 @@ -30,8 +34,8 @@ def __init__(self, name: str, img_root: ctk.CTkBaseClass, ocr_method: OcrMethod) self.show_preview = True # whether to draw a preview of the captured area self._output_img = TkImage2(img_root) # displayed output image self._output_txt = ctk.CTkLabel(img_root, text="-") - self._min_value = None # minimum acceptable value for post-processing (not used if None) - self._max_value = None # maximum acceptable value for post-processing (not used if None) + self.min_value = None # minimum acceptable value for post-processing (not used if None) + self.max_value = None # maximum acceptable value for post-processing (not used if None) self.set_ocr_method(ocr_method) @@ -51,8 +55,9 @@ def set_min_max_values(self, min_value: float | None, max_value: float | None): If None, the extremum won't be used. """ - self._min_value = min_value - self._max_value = max_value + if self._can_edit: + self.min_value = min_value + self.max_value = max_value def display(self, column_idx: int): """ @@ -94,7 +99,6 @@ def update(self, output: str, processed_img: np.ndarray): self._output_img.update(processed_img) self._output_txt.configure(text=f"{self.name}: {output}") - def slice_area(self, array: np.ndarray) -> np.ndarray: """ Slices a Numpy array according to the capture area coordinates, along @@ -155,10 +159,10 @@ def post_process(self, output_str: str | None): except ValueError: return None - if self._min_value is not None and value < self._min_value: + if self.min_value is not None and value < self.min_value: return None - if self._max_value is not None and value > self._max_value: + if self.max_value is not None and value > self.max_value: return None return value @@ -211,6 +215,8 @@ def get_config(self): config[name] = {} config[name]["area"] = [capture.x_min, capture.y_min, capture.x_max, capture.y_max] + config[name]["min_value"] = capture.min_value + 
config[name]["max_value"] = capture.max_value config[name]["is_enabled"] = capture.is_enabled config[name]["show_preview"] = capture.show_preview config[name]["ocr"] = {} @@ -231,6 +237,8 @@ def load_config(self, config): capture = self.add_capture() capture.name = name capture.set_area(*config[name]["area"]) + capture.min_value = config[name]["min_value"] + capture.max_value = config[name]["max_value"] capture.show_preview = config[name]["show_preview"] ocr = BaseOcrEngine.PreProcessConfig() diff --git a/src/data_recorder.py b/src/data_recorder.py index 4acaef8..569a3ff 100644 --- a/src/data_recorder.py +++ b/src/data_recorder.py @@ -1,5 +1,9 @@ """ - Class to record real time data, and keep track of time +Class to record real time data, and keep track of time + +Author: CorentinChauvin +Year: 2024 +License: Apache 2.0 """ import numpy as np @@ -88,7 +92,7 @@ def toggle_recording(self, is_recording: bool) -> str: self._data = {key: [] for key in self._data} self._start_time = time() self._last_times = [] - else: + elif self._is_recording: path = self._save_data() self._is_recording = is_recording diff --git a/src/gui_elements.py b/src/gui_elements.py index 6fff4e8..53f07f1 100644 --- a/src/gui_elements.py +++ b/src/gui_elements.py @@ -1,5 +1,9 @@ """ Helper functions and classes for the TK app + +Author: CorentinChauvin +Year: 2024 +License: Apache 2.0 """ from __future__ import annotations @@ -82,9 +86,6 @@ class TkImage: """ def __init__(self, master: ctk.CTkBaseClass): - """ - TODO - """ self._fig = Figure(layout="tight") self._fig.patch.set_facecolor("xkcd:mint green") self._ax: Axes = self._fig.add_subplot(111) @@ -108,28 +109,28 @@ def update(self, img: np.ndarray): def get_tk_canvas(self) -> tk.Canvas: """ - TODO + Returns a reference to the drawable canvas """ return self._tk_canvas.get_tk_widget() class TkImage2: """ - TODO + Utility class to draw images in tkinter (more optimised than `TkImage`) """ def __init__(self, master: ctk.CTkBaseClass): - """ - 
TODO - """ self._img = None self._canvas_img = None self._canvas = tk.Canvas(master) - def update(self, img: np.ndarray): + def update(self, img: np.ndarray | None): """ Updates the image given a raw image (Numpy array) """ + if img is None: + return + if self._canvas_img is not None: self._canvas.delete(self._canvas_img) @@ -139,7 +140,7 @@ def update(self, img: np.ndarray): def get_tk_canvas(self) -> tk.Canvas: """ - TODO + Returns a reference to the drawable canvas """ return self._canvas @@ -154,7 +155,7 @@ def __init__(self, screen_img, *args, **kwargs): # Start new window super().__init__(*args, **kwargs) - self.attributes("-fullscreen", True) # TODO: bring this back + self.attributes("-fullscreen", True) self.config(cursor="cross") self.bind("", lambda _: self.destroy()) diff --git a/src/ocr.py b/src/ocr.py index 532c118..f45b223 100644 --- a/src/ocr.py +++ b/src/ocr.py @@ -1,6 +1,9 @@ -#!/usr/bin/env python3 """ - Runs OCR on an image to extract numbers +Runs OCR on an image to extract numbers + +Author: CorentinChauvin +Year: 2024 +License: Apache 2.0 """ from cv2.typing import MatLike diff --git a/src/video_processor.py b/src/video_processor.py index d6b9962..c06bf59 100644 --- a/src/video_processor.py +++ b/src/video_processor.py @@ -1,5 +1,9 @@ """ Processes a video and runs OCR on it + +Author: CorentinChauvin +Year: 2024 +License: Apache 2.0 """ from src.data_recorder import DataRecorder