-
Notifications
You must be signed in to change notification settings - Fork 1
/
imagenet_convert_bbox_data.py
270 lines (231 loc) · 9.45 KB
/
imagenet_convert_bbox_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
"""
This script converts the imagenet bounding box format to json
and crops the imagenet images given the biggest box.
ImageNet format:
ImageNet/bboxes/val/ILSVRC2012_val_00000001.xml
...
A single XML contains the following data:
{'filename': 'ILSVRC2012_val_00050000',
'folder': 'val',
'object': [{'bndbox': {'xmax': '446',
'xmin': '237',
'ymax': '374',
'ymin': '109'},
'difficult': '0',
'name': 'n02437616',
'pose': 'Unspecified',
'truncated': '0'},
{'bndbox': {'xmax': '474',
'xmin': '175',
'ymax': '342',
'ymin': '89'},
'difficult': '0',
'name': 'n02437616',
'pose': 'Unspecified',
'truncated': '0'}],
'segmented': '0',
'size': {'depth': '3', 'height': '375', 'width': '500'},
'source': {'database': 'ILSVRC_2012'}}
The final output file will contain all data as follows, with coco bbox format (x, y, w, h):
{
"val_00050000": {"image_h": 375, "image_w": 500, "objects": [
{"synset": "n02437616", "bbox": [237, 109, 209, 265]},
{"synset": "n02437616", "bbox": [175, 89, 299, 253]}]},
}
Boxes downloaded from:
https://image-net.org/challenges/LSVRC/2012/2012-downloads.php
Validation bounding box annotations (all tasks) . 2.2MB. MD5: f4cd18b5ea29fe6bbea62ec9c20d80f0
https://image-net.org/data/ILSVRC/2012/ILSVRC2012_bbox_val_v3.tgz
"""
import os
from PIL import Image
from attr import define
from lxml import etree
from pathlib import Path
from tqdm import tqdm
from typing import Optional
from ovqa.datasets.interface_metadata import ClsMetadataInterface
from ovqa.datasets.meta_loading import meta_loader
from ovqa.paths import get_ovqa_annotations_dir
from packg.iotools.jsonext import dump_json, load_json
from packg.log import SHORTEST_FORMAT, configure_logger, get_logger_level_from_args
from packg.paths import get_data_dir
from typedparser import VerboseQuietArgs, TypedParser, add_argument
from visiontext.bboxes import convert_bbox_abs_to_rel, convert_bbox_rel_to_abs, get_bbox_bounds
@define(slots=False)
class Args(VerboseQuietArgs):
    # Base directory of the dataset. When omitted (None), main() derives the
    # directory from get_data_dir() instead.
    data_dir: Optional[Path] = add_argument(
        shortcut="-d",
        type=str,
        default=None,
        help="Source base dir",
    )

    # Forwarded by main() to crop_images() as ``crop_min_size``.
    crop_min_size: int = add_argument(
        shortcut="-c",
        type=int,
        default=64,
        help="Crop min size",
    )

    # Flag: crop rectangles instead of squares (forwarded as ``crop_rect``).
    crop_rect: bool = add_argument(
        shortcut="-r",
        action="store_true",
        help="Crop rectangles instead of squares",
    )
def main():
    """Script entry point: build (or reuse) the box annotation json, then crop the images.

    The json file is written to ``<annotations dir>/imagenet1k/generated/bboxes_val.json``.
    If that file already exists it is loaded instead of being regenerated. Afterwards
    ``crop_images`` is called with the command-line crop settings.
    """
    parser = TypedParser.create_parser(Args, description=__doc__)
    args: Args = parser.parse_args()
    configure_logger(level=get_logger_level_from_args(args), format=SHORTEST_FORMAT)

    dataset_name = "imagenet1k"
    split = "val"

    if args.data_dir is None:
        data_dir = get_data_dir() / dataset_name
    else:
        # The -d option is declared with type=str; coerce it so the helpers below,
        # which join paths with the "/" operator, always receive a Path.
        data_dir = Path(args.data_dir)

    anno_dir = get_ovqa_annotations_dir() / dataset_name
    output_file = anno_dir / "generated" / f"bboxes_{split}.json"

    if output_file.is_file():
        print(f"File {output_file} already exists, skipping saving.")
        data = load_json(output_file)
    else:
        data = read_2012_boxes(data_dir)
        os.makedirs(output_file.parent, exist_ok=True)
        dump_json(data, output_file)

    crop_images(
        data,
        data_dir,
        crop_min_size=args.crop_min_size,
        crop_rect=args.crop_rect,
        split=split,
    )
def read_2012_boxes(data_dir):
    """Read the official ILSVRC2012 validation box annotations.

    Args:
        data_dir: Dataset base directory (``str`` or ``Path``). The XML files are read
            from ``<data_dir>/bboxes/val``, e.g. ``ILSVRC2012_val_00036658.xml``.

    Returns:
        Dict mapping the image key (file name without the ``ILSVRC2012_`` prefix and the
        ``.xml`` suffix, e.g. ``val_00036658``) to the result of ``read_box_data``.

    Raises:
        AssertionError: If a file in the directory does not follow the expected naming.
    """
    prefix, suffix = "ILSVRC2012_", ".xml"
    bbox_dir = Path(data_dir) / "bboxes/val"
    data_out = {}
    for file in tqdm(sorted(os.listdir(bbox_dir)), desc="Reading box data"):
        assert file.startswith("ILSVRC2012_val_")
        # The key is derived by slicing off the suffix, so make sure it really is there;
        # otherwise e.g. a stray "*.xml.bak" would silently produce a wrong key.
        assert file.endswith(suffix), f"Expected an .xml file, got {file}"
        key = file[len(prefix) : -len(suffix)]
        # Pass a plain string path to the XML parser.
        root = etree.parse(str(bbox_dir / file)).getroot()
        data_out[key] = read_box_data(tree_to_dict(root))
    return data_out
def crop_images(data_out, data_dir, crop_min_size: int = 64, crop_rect: bool = False, split="val"):
    """Crop every image of the split to its biggest annotated box and save the crop.

    Args:
        data_out: Box data per image key, with the fields ``image_h``, ``image_w`` and
            ``objects`` (each object holding ``synset`` and a ``bbox`` of x, y, w, h).
        data_dir: Dataset base directory (``str`` or ``Path``). Crops are written to
            ``<data_dir>/rect`` if ``crop_rect`` is set, else to ``<data_dir>/square``,
            using the ``image_file`` entry of each metadata annotation as relative path.
        crop_min_size: Passed to ``get_bbox_bounds`` as both ``min_w`` and ``min_h``.
        crop_rect: If false, ``get_bbox_bounds`` is called with ``create_squares=True``.
        split: Split name passed to the ``imagenet1k`` metadata loader.

    Raises:
        FileExistsError: If the output directory already exists.
        AssertionError: If a box class differs from the image class, or an image has
            no usable box.
    """
    meta: ClsMetadataInterface = meta_loader.load_metadata("imagenet1k", split)
    synset_to_cls = {v["synset"]: v for v in meta.classes_data}

    crop_name = "rect" if crop_rect else "square"
    output_dir = Path(data_dir) / crop_name
    if output_dir.is_dir():
        raise FileExistsError(f"Output dir {output_dir} already exists")

    # The context manager closes the progress bar even if an image fails.
    with tqdm(total=len(meta.annotations), desc=f"Cropping images to {output_dir}") as pbar:
        for key, meta_item in meta.annotations.items():
            image_class_idx = meta_item["class_idx"]
            bbox_item = data_out[key]
            image_h, image_w = bbox_item["image_h"], bbox_item["image_w"]

            # Select the box with the biggest area; on ties the later box wins.
            biggest_area, biggest_box = 0, None
            for box_obj in bbox_item["objects"]:
                class_idx = synset_to_cls[box_obj["synset"]]["class_idx"]
                assert (
                    class_idx == image_class_idx
                ), f"Class mismatch: Box {class_idx} != Image {image_class_idx}"
                bx, by, bw, bh = box_obj["bbox"]
                area = bw * bh
                if area >= biggest_area:
                    biggest_area, biggest_box = area, (bx, by, bw, bh)
            assert biggest_box is not None, f"Could not find coords for {key}"

            # Open the image in a with-block so the file handle is released right away
            # instead of lingering while the remaining images are processed.
            with Image.open(meta.get_image_file(key)) as image:
                real_image_w, real_image_h = image.size
                # The size stored in the annotation may differ from the actual image
                # size, so go through relative coordinates to express the box in the
                # actual image (presumably pixel units - see visiontext.bboxes).
                rel_box = convert_bbox_abs_to_rel(*biggest_box, image_w, image_h)
                fx, fy, fw, fh = convert_bbox_rel_to_abs(*rel_box, real_image_w, real_image_h)
                x1, y1, x2, y2 = get_bbox_bounds(
                    fx,
                    fy,
                    fw,
                    fh,
                    real_image_w,
                    real_image_h,
                    min_w=crop_min_size,
                    min_h=crop_min_size,
                    create_squares=not crop_rect,
                )
                cropped_image = image.crop((x1, y1, x2, y2))
                cropped_file = output_dir / meta_item["image_file"]
                os.makedirs(cropped_file.parent, exist_ok=True)
                cropped_image.save(cropped_file)
            pbar.update(1)
def read_box_data(data):
    """Convert one parsed annotation dict into the compact output format.

    Args:
        data: Annotation dict with the string-valued fields ``size`` (``height``,
            ``width``), ``segmented`` and a list ``object`` whose entries contain
            ``bndbox`` (``xmin``, ``xmax``, ``ymin``, ``ymax``), ``difficult``,
            ``name``, ``pose`` and ``truncated``.

    Returns:
        Dict with ``image_h``, ``image_w`` and ``objects``; each object holds the
        ``synset`` name and a coco-style ``bbox`` ``[x, y, w, h]``.

    Raises:
        AssertionError: If ``segmented``, ``difficult`` or ``truncated`` is not 0, or
            the pose is not ``Unspecified``.
    """
    size = data["size"]
    image_h, image_w = int(size["height"]), int(size["width"])
    assert int(data["segmented"]) == 0, f"Expected segmented=0, got {data['segmented']}"

    def to_coco(entry):
        # Turn a single annotated object into {"synset": ..., "bbox": [x, y, w, h]}.
        corners = entry["bndbox"]
        left, right, top, bottom = (int(corners[k]) for k in ("xmin", "xmax", "ymin", "ymax"))
        assert (
            int(entry["difficult"]) == 0
        ), f"Expected difficult=0, got {entry['difficult']}"
        synset = entry["name"]
        pose = entry["pose"]
        assert pose == "Unspecified", f"Expected pose=Unspecified, got {pose}"
        truncated = int(entry["truncated"])
        assert truncated == 0, f"Expected truncated=0, got {truncated}"
        # coco format stores the top-left corner plus width and height
        return {"synset": synset, "bbox": [left, top, right - left, bottom - top]}

    converted = [to_coco(entry) for entry in data["object"]]
    return {"image_h": image_h, "image_w": image_w, "objects": converted}
# In XML we do not know a priori whether an element occurs once or multiple times.
# Define here that all fields are singular, except the "object" field directly below
# the root (depth 0), which is always collected into a list.
_multiple_values = {0: ["object"]}


def tree_to_dict(root, depth=0):
    """Recursively convert the children of an XML element into a nested dict.

    Leaf elements become their stripped text, elements with children become nested
    dicts. Fields listed in ``_multiple_values`` for the current depth are always
    present as lists (possibly empty); every other tag may occur only once.

    Args:
        root: Element whose children are converted. Anything offering the usual element
            interface works (iteration, ``len``, ``tag``, ``attrib``, ``text``, ``tail``).
        depth: Nesting depth of ``root``; selects the list-valued fields.

    Returns:
        Dict mapping each child tag to a string, a nested dict, or a list of those.

    Raises:
        AssertionError: On attributes, namespace prefixes, non-whitespace tail text,
            leaf elements without text, text next to child elements, or duplicate
            singular tags.
    """
    out = {}
    multi_fields = _multiple_values.get(depth, [])
    for field in multi_fields:
        out[field] = []
    for child in root:
        assert len(child.attrib) == 0, f"Expected no attributes, got {child.attrib}"
        key = child.tag
        # Elements without a ``prefix`` attribute (e.g. xml.etree.ElementTree) are
        # treated as unprefixed.
        prefix = getattr(child, "prefix", None)
        assert prefix is None, f"Expected no prefix, got {prefix}"
        # text and tail are None (not "") when nothing follows the tag, e.g. in XML
        # written without whitespace between elements, so guard before stripping.
        assert (child.tail or "").strip() == "", f"Expected no tail, got {child.tail}"
        text = (child.text or "").strip()  # should have value only if there is no children
        if len(child) == 0:
            assert text != "", "Expected text, got no text"
            value = text
        else:
            assert text == "", f"Expected no text, got {text}"
            value = tree_to_dict(child, depth=depth + 1)
        if key in multi_fields:
            out[key].append(value)
        else:
            assert key not in out, f"Expected no duplicate keys, got {key}"
            out[key] = value
    return out
# Run the conversion and cropping only when executed as a script, not on import.
if __name__ == "__main__":
    main()