Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Adding Differentiable Binarization #2306

Draft
wants to merge 2 commits into
base: OCR
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions discrete-binarization/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
## Download the dataset from Kaggle

```shell
kaggle datasets download -d torkiasalem/text-detection-icdar2015
unzip -qq text-detection-icdar2015.zip
```

## Reference

- https://github.com/MhLiao/DB
89 changes: 89 additions & 0 deletions discrete-binarization/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import json
from pathlib import Path

import cv2
import keras
import numpy as np
from label_generator import generate_text_probability_map
from label_generator import generate_threshold_label


def read_label_file(file_path):
with open(file_path, "r") as f:
data = f.readlines()
return data


def decode_image(image_path):
try:
image = cv2.imread(image_path)
if image.shape[-1] == 1:
image = np.dstack([image, image, image])
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
return image
except:
return None


def pad_polygons(polygons):
max_num_points = 0
for polygon in polygons:
if len(polygon) > max_num_points:
max_num_points = len(polygon)
padded_polygons = list()
for polygon in polygons:
polygon = polygon + [polygon[-1]] * (max_num_points - len(polygon))
padded_polygons.append(polygon)
return padded_polygons


def decode_label(label):
label = json.loads(label)
polygons, texts, ignore_flags = list(), list(), list()
for info in label:
text = info["transcription"]
if text in ["*", "###"]:
ignore_flags.append(True)
else:
ignore_flags.append(False)
polygon = info["points"]
polygons.append(polygon)
texts.append(text)
polygons = pad_polygons(polygons)
return polygons, texts, ignore_flags


class ICDARPyDataset(keras.utils.PyDataset):
def __init__(self, image_dir, file_path, **kwargs):
super().__init__(**kwargs)
self.image_dir = image_dir
self.file_path = file_path
self.data_lines = read_label_file(file_path)

def __len__(self):
return len(self.data_lines)

def __getitem__(self, idx):
# 1. Grab a single line from the data_lines list
line = self.data_lines[idx]
# 2. Split the line into image_path and label
image_path, label = line.strip().split("\t")
image_path = str(Path(self.image_dir) / Path(image_path).name)
# 3. Decode the image
image = decode_image(image_path)
if image is None:
return dict()
# 4. Decode the labels
polygons, texts, ignore_flags = decode_label(label)
if len(polygons) == 0:
return dict()
else:
data = {
"image": image,
"polygons": np.array(polygons),
"texts": np.array(texts),
"ignore_flags": np.array(ignore_flags),
}
generate_text_probability_map(data)
generate_threshold_label(data)
return data
279 changes: 279 additions & 0 deletions discrete-binarization/label_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
import cv2
import numpy as np
import pyclipper
from shapely.geometry import Polygon


######################################################################
# Lable Generation for Threshold Maps
######################################################################
def calculate_distance(grid_x, grid_y, segment_start, segment_end):
"""
Calculate the perpendicular distance from each point in a grid to a line segment.

Args:
- grid_x (np.ndarray): The x-coordinates of the grid.
- grid_y (np.ndarray): The y-coordinates of the grid.
- segment_start (tuple): The starting point of the line segment.
- segment_end (tuple): The ending point of the line segment.

Returns:
- np.ndarray: An array of distances.
"""
# Calculating the square of distances from the grid points to the line segment points
distance_to_start_sq = np.square(grid_x - segment_start[0]) + np.square(
grid_y - segment_start[1]
)
distance_to_end_sq = np.square(grid_x - segment_end[0]) + np.square(
grid_y - segment_end[1]
)

# Calculating the square of the distance between the line segment points
segment_length_sq = np.square(
segment_start[0] - segment_end[0]
) + np.square(segment_start[1] - segment_end[1])

# Calculating cosine of the angle at the grid points
cosine_angle = (
segment_length_sq - distance_to_start_sq - distance_to_end_sq
) / (2 * np.sqrt(distance_to_start_sq * distance_to_end_sq) + 1e-6)

# Calculating the square of sine of the angle
sine_square = 1 - np.square(cosine_angle)
sine_square = np.nan_to_num(sine_square)

# Calculating the perpendicular distance
perpendicular_distance = np.sqrt(
distance_to_start_sq
* distance_to_end_sq
* sine_square
/ segment_length_sq
)
perpendicular_distance[cosine_angle < 0] = np.sqrt(
np.fmin(distance_to_start_sq, distance_to_end_sq)
)[cosine_angle < 0]

return perpendicular_distance


def draw_border_map_on_canvas(polygon_points, canvas, mask, shrink_ratio):
"""
Draw a border map on a canvas based on a polygon and a shrink ratio.

Args:
- polygon_points (list): Points of the polygon.
- canvas (np.ndarray): The canvas to draw on.
- mask (np.ndarray): The mask to use for drawing.
- shrink_ratio (float): The ratio to shrink the polygon by.
"""
polygon = np.array(polygon_points)
polygon_shape = Polygon(polygon)

# Exit if the polygon area is not positive
if polygon_shape.area <= 0:
return

# Calculating the distance to shrink the polygon by
shrink_distance = (
polygon_shape.area
* (1 - np.power(shrink_ratio, 2))
/ polygon_shape.length
)

# Creating the shrunk polygon
subject_polygon = [tuple(l) for l in polygon]
polygon_padder = pyclipper.PyclipperOffset()
polygon_padder.AddPath(
subject_polygon, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON
)
shrunk_polygon = np.array(polygon_padder.Execute(shrink_distance)[0])

# Filling the polygon in the mask
cv2.fillPoly(mask, [shrunk_polygon.astype(np.int32)], 1.0)

# Calculating bounding box for the shrunk polygon
xmin, xmax = shrunk_polygon[:, 0].min(), shrunk_polygon[:, 0].max()
ymin, ymax = shrunk_polygon[:, 1].min(), shrunk_polygon[:, 1].max()
width, height = xmax - xmin + 1, ymax - ymin + 1

# Adjusting polygon points relative to the bounding box
adjusted_polygon = polygon.copy()
adjusted_polygon[:, 0] -= xmin
adjusted_polygon[:, 1] -= ymin

# Creating grids for distance calculation
grid_x = np.broadcast_to(
np.linspace(0, width - 1, num=width).reshape(1, width), (height, width)
)
grid_y = np.broadcast_to(
np.linspace(0, height - 1, num=height).reshape(height, 1),
(height, width),
)

# Initializing the distance map
distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)

# Calculating the distance to each edge of the polygon
for i in range(polygon.shape[0]):
j = (i + 1) % polygon.shape[0]
absolute_distance = calculate_distance(
grid_x, grid_y, adjusted_polygon[i], adjusted_polygon[j]
)
distance_map[i] = np.clip(absolute_distance / shrink_distance, 0, 1)

# Taking the minimum distance to any edge
distance_map = distance_map.min(axis=0)

# Validating and adjusting the bounding box coordinates
xmin_valid, xmax_valid = min(max(0, xmin), canvas.shape[1] - 1), min(
max(0, xmax), canvas.shape[1] - 1
)
ymin_valid, ymax_valid = min(max(0, ymin), canvas.shape[0] - 1), min(
max(0, ymax), canvas.shape[0] - 1
)

# Applying the distance map to the canvas
canvas_slice = canvas[
ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1
]
distance_map_slice = distance_map[
ymin_valid - ymin : ymax_valid - ymax + height,
xmin_valid - xmin : xmax_valid - xmax + width,
]
canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1] = np.fmax(
1 - distance_map_slice, canvas_slice
)


def generate_threshold_label(
data, shrink_ratio=0.4, thresh_min=0.3, thresh_max=0.7
):
"""
Generate a threshold label for an image based on given polygons and ignore tags.

Args:
- data (dict): A dictionary containing 'image', 'polygons', and 'ignore_tags'.
- shrink_ratio (float): Ratio to shrink polygons by.
- thresh_min (float): Minimum threshold value.
- thresh_max (float): Maximum threshold value.
"""
image = data["image"]
polygons = data["polygons"]
ignore_flags = data["ignore_flags"]

# Initializing canvas and mask
canvas = np.zeros(image.shape[:2], dtype=np.float32)
mask = np.zeros(image.shape[:2], dtype=np.float32)

# Drawing each polygon on the canvas
for polygon, ignore_flag in zip(polygons, ignore_flags):
if not ignore_flag:
draw_border_map_on_canvas(
polygon, canvas, mask=mask, shrink_ratio=shrink_ratio
)

# Adjusting the canvas based on thresholds
canvas = canvas * (thresh_max - thresh_min) + thresh_min

# Updating the data dictionary
data["threshold_map"] = canvas
data["threshold_mask"] = mask


######################################################################
# Label Generation for Probability and Binary Maps
######################################################################
def adjust_polygons_within_image_bounds(
polygons, ignore_flags, image_height, image_width
):
"""
Adjusts polygons to ensure they fit within the image bounds and calculates
their area to update ignore flags for invalid polygons.

Parameters:
- polygons (np.ndarray): Array of polygons coordinates.
- ignore_flags (list): List of boolean flags to ignore certain polygons.
- image_height (int): Height of the image.
- image_width (int): Width of the image.

Returns:
- Tuple: (Adjusted polygons, Updated ignore flags)
"""
for i, polygon in enumerate(polygons):
# Clip polygon coordinates to be within image dimensions
polygons[i] = np.clip(
polygon, [0, 0], [image_width - 1, image_height - 1]
)

# If polygon area is too small, mark it as ignored
if Polygon(polygon).area < 1:
ignore_flags[i] = True
polygons[i] = polygons[i][::-1] # Reverse polygon coordinates

return polygons, ignore_flags


def shrink_polygon(polygon, shrink_ratio):
"""
Shrinks a given polygon according to the specified shrink ratio.

Parameters:
- polygon (Polygon): A Shapely Polygon object.
- shrink_ratio (float): Ratio to shrink the polygon.

Returns:
- Polygon: The shrunk polygon.
"""
if polygon.area == 0:
return Polygon()

# Calculate shrinking distance based on area and perimeter
shrink_distance = polygon.area * (1 - shrink_ratio**2) / polygon.length
return polygon.buffer(-shrink_distance, join_style=2)


def generate_text_probability_map(data, min_text_size=8, shrink_ratio=0.4):
"""
Generates a probability map for text detection in an image by processing
the provided polygons.

Parameters:
- data (dict): Contains the image, polygons, and ignore tags.
- min_text_size (int): Minimum size of text to be detected.
- shrink_ratio (float): Ratio for shrinking the polygons.

Modifies:
- data (dict): Adds the shrink_map and shrink_mask to the data.
"""
image = data["image"]
polygons = data["polygons"]
ignore_flags = data["ignore_flags"]
image_height, image_width = image.shape[:2]

polygons, ignore_flags = adjust_polygons_within_image_bounds(
polygons, ignore_flags, image_height, image_width
)
text_region_map = np.zeros((image_height, image_width), dtype=np.float32)
mask_map = np.ones((image_height, image_width), dtype=np.float32)

for i, polygon in enumerate(polygons):
# Check for ignored polygons or those too small to consider
if (
ignore_flags[i]
or min(polygon[:, 1].ptp(), polygon[:, 0].ptp()) < min_text_size
):
cv2.fillPoly(mask_map, [polygon.astype(np.int32)], 0)
continue

shrunk_polygon = shrink_polygon(Polygon(polygon), shrink_ratio)
if shrunk_polygon.is_empty:
cv2.fillPoly(mask_map, [polygon.astype(np.int32)], 0)
else:
cv2.fillPoly(
text_region_map,
[np.array(shrunk_polygon.exterior.coords).astype(np.int32)],
1,
)

data["shrink_map"] = text_region_map
data["shrink_mask"] = mask_map
Loading
Loading