diff --git a/README.md b/README.md
index 56e8007..8240839 100644
--- a/README.md
+++ b/README.md
@@ -8,21 +8,33 @@
-Attempting to read a portion of the screen and plot its data real time.
+Attempting to read portions of a screen or a video and export their data to a CSV file.
-> Blazingly slow, but it kinda works
+> *Blazingly slow, but it kinda works*
+
+**Contents:**
+
+1. [Features](#features)
+2. [Install](#install)
+3. [Run](#run)
+4. [Notes](#notes)
+5. [Dependencies](#dependencies)
## Features
- Real time recognitions of numbers of the screen.
- Offline number recognition in a video file.
- Works on easily configurable areas, as many as one wants.
-- Easy to integrate new OCR methods (see below TODO)
+- Integrated Tesseract OCR and EasyOCR.
+- Easy to add new OCR methods (see `src/ocr.py`).
+- Dead simple to use!
## Install
+This has been mainly developed and tested on Ubuntu 22.04, with Python 3.10.
+
```bash
-# Install Python version (if necessary)
+# Install Python version >= 3.10 (necessary on older Ubuntu versions)
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install python3.13 python3.13-venv python3.13-tk
@@ -48,22 +60,31 @@ python gui.py
## Notes
+- Known OCR issues:
+ - Character confusion, depending on the font: `0` and `8`, `1` and `7`, `5` and `9`.
+ - Missing dot (*e.g.* `42.42` turned into `4242`).
+- Tips to improve OCR reliability:
+ - Upscale the detected area to get a better character resolution.
+ - Use min and max bounds to filter outliers out.
+ - Don't trust the OCR output too much. Consider implementing post-filtering based on knowledge of the recorded data. For example, when measuring a variable that can only evolve slowly, big jumps in the output value can be marked as outliers and discarded.
- When processing a video, enabling the preview can induce up to 20% overhead.
-- EasyOCR requires PyTorch and Scipy, so isn't lightweight. The first time the program is started, it will download necessary model weights (stored in `~/.EasyOCR/model`). See more details on the EasyOCR GitHub ([link](https://github.com/JaidedAI/EasyOCR)).
+- EasyOCR requires PyTorch and SciPy, so isn't lightweight. The first time the program is started, it will download necessary model weights (stored in `~/.EasyOCR/model`). See more details on the EasyOCR GitHub ([link](https://github.com/JaidedAI/EasyOCR)). In this application, EasyOCR seems to be slower than Tesseract.
## Dependencies
-- https://github.com/opencv/opencv-python
-- https://github.com/tomschimansky/customtkinter
-- https://github.com/tesseract-ocr/tesseract
-- https://github.com/sirfz/tesserocr
-- https://github.com/JaidedAI/EasyOCR
+This work is merely a wrapper and a graphical interface for some already existing OCR implementations. It heavily uses Tkinter and CustomTkinter for the interface.
+
+- [OpenCV](https://github.com/opencv/opencv-python) (MIT license): image processing.
+- [CustomTkinter](https://github.com/tomschimansky/customtkinter) (MIT license): beautiful interface and GUI.
+- [Tesseract](https://github.com/tesseract-ocr/tesseract) (Apache 2.0): OCR API.
+- [Tesserocr](https://github.com/sirfz/tesserocr) (MIT license): Python wrapper for Tesseract.
+- [EasyOCR](https://github.com/JaidedAI/EasyOCR) (Apache 2.0): another OCR API.
## TODO
- [x] Loading and processing videos
- [x] Saving/loading configuration
- [x] Multi threading, for less blazing slowness
-- [ ] Make it easy to add new OCR methods, and documenting it
+- [ ] Make it easier to add new OCR methods, and document it
- [ ] Logging not only in the Python terminal, but also in the logging text box
- [ ] Real time graphing
diff --git a/gui.py b/gui.py
index a7ad9a2..c728d7b 100644
--- a/gui.py
+++ b/gui.py
@@ -1,5 +1,10 @@
+#!/usr/bin/env python3
"""
-TODO
+Main entry point for the application
+
+Author: CorentinChauvin
+Year: 2024
+License: Apache 2.0
"""
from src.ocr import BaseOcrEngine, OcrMethod
@@ -44,8 +49,8 @@ def __init__(self):
self.title("DemOCRatos - OCR for the people")
# self.geometry(f"{1100}x{580}")
self.iconphoto(True, ImageTk.PhotoImage(file="assets/logo_low.png"))
- self.bind("", lambda _: sys.exit()) # FIXME: for development only
- self.bind("q", lambda _: sys.exit()) # FIXME: for development only
+ # self.bind("", lambda _: sys.exit()) # FIXME: for development only
+ # self.bind("q", lambda _: sys.exit()) # FIXME: for development only
self.protocol("WM_DELETE_WINDOW", self._on_closing_cb)
self._rect_selec_window = None # reference to the window to select the capture zone
@@ -85,12 +90,12 @@ def __init__(self):
self.grid_rowconfigure(0, weight=0)
# Set default values and statuses
- self._stop_btn.configure(state="disabled") # TODO: change state dynamically
+ self._stop_btn.configure(state="disabled")
self._status_txt.configure(text="10:03 (12 FPS)")
self._status_txt.configure(
fg_color="green", text_color="white"
- ) # TODO: change the colour depending on the status
+ )
self._fps_settings_menu.set("10")
self._ocr_settings_menu.set("Tesseract")
@@ -261,11 +266,23 @@ def __load_settings_cb():
with open(path) as file:
try:
config = json.load(file)
- except (UnicodeDecodeError, json.decoder.JSONDecodeError):
+ except (UnicodeDecodeError, json.decoder.JSONDecodeError) as e:
print("ERROR: coudn't parse JSON config")
+ print(e)
return
- self._captures.load_config(config)
+ try:
+ new_captures = Captures(self._output_frame)
+ new_captures.load_config(config["captures"])
+ self._fps_settings_menu.set(config["fps"])
+ self._ocr_settings_menu.set(config["ocr_method"])
+ self._max_threads_entry.set_value(config["max_threads"])
+ except KeyError as e:
+ print("ERROR: couldn't parse JSON config")
+ print(e)
+ return
+
+ self._captures = new_captures
self._data_recorder.reset_fields(self._captures.get_names())
self._selected_capture = self._captures.get_first()
self._update_capture_options()
@@ -282,7 +299,11 @@ def __save_settings_cb():
return
with open(path, "w") as file:
- config = self._captures.get_config()
+ config = {}
+ config["captures"] = self._captures.get_config()
+ config["fps"] = self._fps_settings_menu.get()
+ config["ocr_method"] = self._ocr_settings_menu.get()
+ config["max_threads"] = self._max_threads_entry.get_value()
json.dump(config, file)
print(config)
@@ -720,13 +741,15 @@ def _update_capture_options(self, selected: None | str = None):
def __update_entry_text(entry: ctk.CTkEntry, text):
entry.delete(0, tk.END)
- entry.insert(0, text)
+ entry.insert(0, text if text is not None else "")
self._selected_capture.toggle_edit(False)
__update_entry_text(self._rect_xmin_entry, self._selected_capture.x_min)
__update_entry_text(self._rect_xmax_entry, self._selected_capture.x_max)
__update_entry_text(self._rect_ymin_entry, self._selected_capture.y_min)
__update_entry_text(self._rect_ymax_entry, self._selected_capture.y_max)
+ __update_entry_text(self._min_entry, self._selected_capture.min_value)
+ __update_entry_text(self._max_entry, self._selected_capture.max_value)
self._selected_capture.toggle_edit(True)
self._pre_process_config_frame.update_elements(
diff --git a/main.py b/main.py
deleted file mode 100644
index fa0fe5f..0000000
--- a/main.py
+++ /dev/null
@@ -1,217 +0,0 @@
-#!/usr/bin/env python3
-"""
- Run OCR on a motor video to extract useful information
-"""
-
-import pytesseract
-from cv2.typing import MatLike
-import cv2
-import numpy as np
-import csv
-
-
-class DataConfig:
- def __init__(
- self, min_value: float, max_value: float, slice_x: slice, slice_y: slice
- ):
- """
- Args:
- - min_value: Minimum value the data field can take
- - max_value: Maximum value the data field can take
- - position_mask: Pixel location of the data field
- """
- self.min_value = min_value
- self.max_value = max_value
- self.slice_x = slice_x
- self.slice_y = slice_y
-
-
-# =========================================================
-# CONFIGURATION
-#
-video_path = "resources/long.mp4"
-image_path = "resources/frame_raw.png"
-output_file = "output/output.csv"
-max_cnt = 100000
-
-data_config = {
- "temperature": DataConfig(20.0, 100.0, slice(295, 311), slice(749, 770)),
- "current": DataConfig(0.0, 20.0, slice(295, 311), slice(576, 620)),
- "speed": DataConfig(-10.0, 1500, slice(295, 311), slice(402, 450)),
-}
-
-
-# =========================================================
-# OCR
-#
-def unsharp_mask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
- """
- Return a sharpened version of the image, using an unsharp mask
- """
- blurred = cv2.GaussianBlur(image, kernel_size, sigma)
- sharpened = float(amount + 1) * image - float(amount) * blurred
- sharpened = np.maximum(sharpened, np.zeros(sharpened.shape))
- sharpened = np.minimum(sharpened, 255 * np.ones(sharpened.shape))
- sharpened = sharpened.round().astype(np.uint8)
-
- if threshold > 0:
- low_contrast_mask = np.absolute(image - blurred) < threshold
- np.copyto(sharpened, image, where=low_contrast_mask)
-
- return sharpened
-
-
-def preprocess_frame(frame: MatLike, slice_x: slice, slice_y: slice) -> MatLike:
- """
- Crops a frame and preprocesses it ahead of OCR (upscaling, sharpening and
- thresholding)
- """
- cropped = frame[slice_x, slice_y]
- shape = np.shape(cropped)
- cropped = cv2.resize(
- cropped, (shape[1] * 2, shape[0] * 2), interpolation=cv2.INTER_LINEAR
- )
-
- cropped = unsharp_mask(cropped, kernel_size=(5, 5), amount=2, sigma=1)
- cropped = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
- cropped = cv2.threshold(
- cropped, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
- )[1]
-
- return cropped
-
-
-def ocr(image: MatLike) -> str:
- """
- Reads the portion of image, and returns the raw output
- """
- img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
- output = pytesseract.image_to_string(
- img_rgb, config="--dpi 20 --psm 13 -c tessedit_char_whitelist=-0123456789."
- )
-
- return output.strip()
-
-
-# =========================================================
-# POST PROCESSING
-#
-def filter(raw_string: str, min_value: float, max_value: float) -> float | None:
- """
- Tries to convert a string into a float, and returns only if the resulting value
- is within some given bounds.
- """
- try:
- value = float(raw_string)
- except ValueError:
- return None
-
- if value < min_value or value > max_value:
- return None
-
- return value
-
-
-# =========================================================
-# FILE PROCESSING
-#
-def run_video():
- cap = cv2.VideoCapture(video_path)
- cnt = 0
- outputs = {key: [] for key in data_config}
- outputs["t"] = []
-
- while cap.isOpened():
- # Read video frame
- ret, frame = cap.read()
-
- cnt += 1
- if cnt % 25 != 0:
- continue
- if cnt >= max_cnt:
- break
-
- if not ret:
- print("Can't receive frame (stream end?). Exiting...")
- break
-
- print(cnt)
-
- # Perform OCR on the various fields
- for key in data_config:
- config = data_config[key]
- cropped = preprocess_frame(frame, config.slice_x, config.slice_y)
- raw_output = ocr(cropped)
- outputs[key].append(filter(raw_output, config.min_value, config.max_value))
-
- outputs["t"].append(cnt)
-
- # Display the information nicely
- for key in data_config:
- config = data_config[key]
- cv2.rectangle(
- frame,
- (config.slice_y.start, config.slice_x.start),
- (config.slice_y.stop - 1, config.slice_x.stop - 1),
- (0, 255, 0),
- 1,
- )
- font = cv2.FONT_HERSHEY_SIMPLEX
- value = str(outputs[key][-1])
-
- if outputs[key][-1] is None:
- colour = (255, 0, 0)
- value = "None"
- else:
- colour = (0, 255, 0)
-
- position = [config.slice_y.start, config.slice_x.start - 8]
- cv2.putText(frame, value, position, font, 0.5, colour, 1, cv2.LINE_AA)
-
- cv2.imshow("frame", frame)
-
- if cv2.waitKey(1) == ord("q"):
- break
-
- cap.release()
- cv2.destroyAllWindows()
-
- return outputs
-
-
-def run_image():
- img = cv2.imread(image_path)
- img = img[295:311, 748:770]
- # img = img[287:311, 661:760]
- # img = img[287:311, 661:826]
-
- ocr(img)
-
- cv2.imshow(image_path, img)
- cv2.waitKey(0)
-
-
-# =========================================================
-# RESULTS PROCESSING
-#
-
-def export_outputs(outputs: dict):
- keys = list(outputs.keys())
- del keys[keys.index("t")]
- keys.insert(0, "t")
-
- with open(output_file, "w", newline="") as csv_file:
- writer = csv.DictWriter(
- csv_file, fieldnames=keys, quotechar='"', quoting=csv.QUOTE_ALL
- )
- writer.writeheader()
-
- for k in range(len(outputs["t"])):
- writer.writerow({key: outputs[key][k] for key in keys})
-
-
-if __name__ == "__main__":
- outputs = run_video()
- export_outputs(outputs)
-
- print(outputs)
diff --git a/src/capture.py b/src/capture.py
index 37979a3..398243d 100644
--- a/src/capture.py
+++ b/src/capture.py
@@ -1,5 +1,9 @@
"""
Class storing configuration data about a capture
+
+Author: CorentinChauvin
+Year: 2024
+License: Apache 2.0
"""
from src.gui_elements import TkImage2
@@ -30,8 +34,8 @@ def __init__(self, name: str, img_root: ctk.CTkBaseClass, ocr_method: OcrMethod)
self.show_preview = True # whether to draw a preview of the captured area
self._output_img = TkImage2(img_root) # displayed output image
self._output_txt = ctk.CTkLabel(img_root, text="-")
- self._min_value = None # minimum acceptable value for post-processing (not used if None)
- self._max_value = None # maximum acceptable value for post-processing (not used if None)
+ self.min_value = None # minimum acceptable value for post-processing (not used if None)
+ self.max_value = None # maximum acceptable value for post-processing (not used if None)
self.set_ocr_method(ocr_method)
@@ -51,8 +55,9 @@ def set_min_max_values(self, min_value: float | None, max_value: float | None):
If None, the extremum won't be used.
"""
- self._min_value = min_value
- self._max_value = max_value
+ if self._can_edit:
+ self.min_value = min_value
+ self.max_value = max_value
def display(self, column_idx: int):
"""
@@ -94,7 +99,6 @@ def update(self, output: str, processed_img: np.ndarray):
self._output_img.update(processed_img)
self._output_txt.configure(text=f"{self.name}: {output}")
-
def slice_area(self, array: np.ndarray) -> np.ndarray:
"""
Slices a Numpy array according to the capture area coordinates, along
@@ -155,10 +159,10 @@ def post_process(self, output_str: str | None):
except ValueError:
return None
- if self._min_value is not None and value < self._min_value:
+ if self.min_value is not None and value < self.min_value:
return None
- if self._max_value is not None and value > self._max_value:
+ if self.max_value is not None and value > self.max_value:
return None
return value
@@ -211,6 +215,8 @@ def get_config(self):
config[name] = {}
config[name]["area"] = [capture.x_min, capture.y_min, capture.x_max, capture.y_max]
+ config[name]["min_value"] = capture.min_value
+ config[name]["max_value"] = capture.max_value
config[name]["is_enabled"] = capture.is_enabled
config[name]["show_preview"] = capture.show_preview
config[name]["ocr"] = {}
@@ -231,6 +237,8 @@ def load_config(self, config):
capture = self.add_capture()
capture.name = name
capture.set_area(*config[name]["area"])
+ capture.min_value = config[name]["min_value"]
+ capture.max_value = config[name]["max_value"]
capture.show_preview = config[name]["show_preview"]
ocr = BaseOcrEngine.PreProcessConfig()
diff --git a/src/data_recorder.py b/src/data_recorder.py
index 4acaef8..569a3ff 100644
--- a/src/data_recorder.py
+++ b/src/data_recorder.py
@@ -1,5 +1,9 @@
"""
- Class to record real time data, and keep track of time
+Class to record real time data, and keep track of time
+
+Author: CorentinChauvin
+Year: 2024
+License: Apache 2.0
"""
import numpy as np
@@ -88,7 +92,7 @@ def toggle_recording(self, is_recording: bool) -> str:
self._data = {key: [] for key in self._data}
self._start_time = time()
self._last_times = []
- else:
+ elif self._is_recording:
path = self._save_data()
self._is_recording = is_recording
diff --git a/src/gui_elements.py b/src/gui_elements.py
index 6fff4e8..53f07f1 100644
--- a/src/gui_elements.py
+++ b/src/gui_elements.py
@@ -1,5 +1,9 @@
"""
Helper functions and classes for the TK app
+
+Author: CorentinChauvin
+Year: 2024
+License: Apache 2.0
"""
from __future__ import annotations
@@ -82,9 +86,6 @@ class TkImage:
"""
def __init__(self, master: ctk.CTkBaseClass):
- """
- TODO
- """
self._fig = Figure(layout="tight")
self._fig.patch.set_facecolor("xkcd:mint green")
self._ax: Axes = self._fig.add_subplot(111)
@@ -108,28 +109,28 @@ def update(self, img: np.ndarray):
def get_tk_canvas(self) -> tk.Canvas:
"""
- TODO
+ Returns a reference to the drawable canvas
"""
return self._tk_canvas.get_tk_widget()
class TkImage2:
"""
- TODO
+ Utility class to draw images in tkinter (more optimised than `TkImage`)
"""
def __init__(self, master: ctk.CTkBaseClass):
- """
- TODO
- """
self._img = None
self._canvas_img = None
self._canvas = tk.Canvas(master)
- def update(self, img: np.ndarray):
+ def update(self, img: np.ndarray | None):
"""
Updates the image given a raw image (Numpy array)
"""
+ if img is None:
+ return
+
if self._canvas_img is not None:
self._canvas.delete(self._canvas_img)
@@ -139,7 +140,7 @@ def update(self, img: np.ndarray):
def get_tk_canvas(self) -> tk.Canvas:
"""
- TODO
+ Returns a reference to the drawable canvas
"""
return self._canvas
@@ -154,7 +155,7 @@ def __init__(self, screen_img, *args, **kwargs):
# Start new window
super().__init__(*args, **kwargs)
- self.attributes("-fullscreen", True) # TODO: bring this back
+ self.attributes("-fullscreen", True)
self.config(cursor="cross")
self.bind("", lambda _: self.destroy())
diff --git a/src/ocr.py b/src/ocr.py
index 532c118..f45b223 100644
--- a/src/ocr.py
+++ b/src/ocr.py
@@ -1,6 +1,9 @@
-#!/usr/bin/env python3
"""
- Runs OCR on an image to extract numbers
+Runs OCR on an image to extract numbers
+
+Author: CorentinChauvin
+Year: 2024
+License: Apache 2.0
"""
from cv2.typing import MatLike
diff --git a/src/video_processor.py b/src/video_processor.py
index d6b9962..c06bf59 100644
--- a/src/video_processor.py
+++ b/src/video_processor.py
@@ -1,5 +1,9 @@
"""
Processes a video and runs OCR on it
+
+Author: CorentinChauvin
+Year: 2024
+License: Apache 2.0
"""
from src.data_recorder import DataRecorder