-
Notifications
You must be signed in to change notification settings - Fork 4
/
objects365_part.py
140 lines (118 loc) · 6.28 KB
/
objects365_part.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import shutil
from loguru import logger
from cerberusdet.utils.general import Path, download, np, xyxy2xywhn
# check_requirements('pycocotools>=2.0')
from pycocotools.coco import COCO
from tqdm import tqdm
# Selects what the script builds when run as __main__:
#   True  -> only the "animals" and "tableware" category subsets, under /data/Objects365_part
#   False -> a single "all" subset with no category filter, under /data/Objects365_full
DOWNLOAD_SUBSETS = True # False to download full Objects365 dataset
def download_archive(urls, dir, max_retries=None):
    """Download every archive in ``urls`` into ``dir``, retrying failed attempts.

    Each URL is fetched on its own through ``download(..., curl=True,
    delete=True, threads=1)``. If a file named like the archive still exists
    in ``dir`` afterwards, the attempt is treated as failed: the leftover file
    is removed and the same URL is downloaded again.

    NOTE(review): this relies on ``download(..., delete=True)`` removing the
    archive once it has been unpacked successfully -- confirm against
    ``cerberusdet.utils.general.download``.

    The retry is a loop rather than self-recursion, so a URL that keeps
    failing no longer grows the call stack until ``RecursionError``.

    Args:
        urls: Iterable of archive URLs. An empty iterable is a no-op.
        dir: Destination directory (``str`` or path-like).
        max_retries: Maximum number of *additional* attempts per URL after the
            first one. ``None`` (default) retries until the archive is
            consumed, matching the previous behaviour. Zero or a negative
            value means a single attempt.

    Raises:
        RuntimeError: If ``max_retries`` is set and the archive is still
            present after the last allowed attempt. The leftover file is kept
            on disk for inspection.
    """
    for url in urls:
        archive_name = os.path.basename(url)
        archive_path = os.path.join(dir, archive_name)
        retries = 0
        while True:
            download([url], dir=dir, curl=True, delete=True, threads=1)
            if not os.path.exists(archive_path):
                break  # nothing left behind -> this URL is done
            if max_retries is not None and retries >= max_retries:
                raise RuntimeError(
                    f"Archive {archive_name} is still present after {retries + 1} download attempt(s): {url}"
                )
            retries += 1
            logger.warning(f"Downloading archive again: {archive_name}")
            # Drop the leftover file so the next attempt starts from scratch.
            os.remove(archive_path)
if __name__ == '__main__':
    # Dataset root: one location for the category subsets, another for the full dataset.
    dataset_cfg = {"path": "/data/Objects365_part" if DOWNLOAD_SUBSETS else "/data/Objects365_full"}

    # The ids below are compared against positions in the annotation file's category list
    # (the `cid` produced by enumerate(names)); a class's position inside its subset list
    # becomes its label id in the written label files.
    # ['Monkey', 'Rabbit', 'Yak', 'Antelope', 'Pig', 'Bear', 'Deer', 'Giraffe', 'Zebra', 'Elephant',
    # 'Lion', 'Donkey', 'Camel', 'Jellyfish', 'Other Fish', 'Dolphin', 'Crab', 'Seal', 'Goldfish']
    animals_categories_ids = [
        341, 342, 344, 318, 300, 295, 240, 180, 178, 144,
        324, 323, 307, 330, 103, 326, 311, 320, 273,
    ]
    # ['Cup', 'Plate', 'Wine Glass', 'Pot', 'Knife', 'Fork', 'Spoon', 'Chopsticks', 'Cutting/chopping Board', 'Tea pot',
    # 'Kettle', 'Tong']
    tableware = [10, 15, 35, 95, 84, 88, 93, 162, 166, 122, 209, 203]

    # Subset name -> list of wanted class ids; None means "keep every class".
    subsets = (
        {"animals": animals_categories_ids, "tableware": tableware}
        if DOWNLOAD_SUBSETS
        else {"all": None}
    )

    out_images_dir_names = [f"images/{subset_name}" for subset_name in subsets]
    out_labels_dir_names = [f"labels/{subset_name}" for subset_name in subsets]

    # Create <root>/{tmp_images, images/<subset>, labels/<subset>}/{train, val}.
    root = Path(dataset_cfg["path"])  # dataset root dir
    for rel_dir in ["tmp_images", *out_images_dir_names, *out_labels_dir_names]:
        base_dir = root / rel_dir
        base_dir.mkdir(parents=True, exist_ok=True)
        for split_dir in ("train", "val"):
            (base_dir / split_dir).mkdir(parents=True, exist_ok=True)

    # Process the validation split first, then the training split.
    for split, patches in (("val", 43 + 1), ("train", 50 + 1)):
        print(f"Processing {split} in {patches} patches ...")
        tmp_images = root / "tmp_images" / split

        # --- Download annotations and image patches -------------------------------------
        url = f"https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/{split}/"
        if split == "val":
            # Annotations come as a plain json; image patches are split across v1 (0..15) and v2 (16..).
            download([f"{url}zhiyuan_objv2_{split}.json"], dir=root, delete=False)
            v1_patches = [f"{url}images/v1/patch{i}.tar.gz" for i in range(15 + 1)]
            download_archive(v1_patches, tmp_images)
            v2_patches = [f"{url}images/v2/patch{i}.tar.gz" for i in range(16, patches)]
            download_archive(v2_patches, tmp_images)
        elif split == "train":
            # Annotations come as a tar.gz archive; image patches sit directly under the split URL.
            download([f"{url}zhiyuan_objv2_{split}.tar.gz"], dir=root, delete=True)
            train_patches = [f"{url}patch{i}.tar.gz" for i in range(patches)]
            download_archive(train_patches, tmp_images)

        # --- Flatten: move every extracted jpg directly into tmp_images/<split> ----------
        for f in tqdm(tmp_images.rglob("*.jpg"), desc=f"Moving {split} images"):
            flat_target = tmp_images / f.name
            f.rename(flat_target)

        # --- Load annotations ------------------------------------------------------------
        coco = COCO(root / f"zhiyuan_objv2_{split}.json")
        category_records = coco.loadCats(coco.getCatIds())
        names = [record["name"] for record in category_records]

        # --- Pass 1: collect the images that any requested subset needs -------------------
        images_to_save = set()
        for wanted_ids in subsets.values():
            for cid, cat in enumerate(names):
                if wanted_ids is None or cid in wanted_ids:
                    cat_ids = coco.getCatIds(catNms=[cat])
                    img_ids = coco.getImgIds(catIds=cat_ids)
                    images_to_save.update(
                        str(tmp_images / Path(im["file_name"]).name)
                        for im in tqdm(coco.loadImgs(img_ids), desc=f"Scan {cid + 1}/{len(names)} {cat}")
                    )

        # Delete downloaded images that no subset references.
        for f in tqdm(tmp_images.rglob("*.jpg")):
            if str(f) in images_to_save:
                continue
            os.remove(str(f))
            print(f"Remove {f}")

        # --- Pass 2: write label files and copy images, subset by subset ------------------
        for subset_name, wanted_ids in subsets.items():
            missed = 0
            n_images = 0
            images = root / f"images/{subset_name}" / split
            labels_dir = root / f"labels/{subset_name}" / split

            for cid, cat in enumerate(names):
                # Map the dataset-wide class index to the subset-local label id.
                if wanted_ids is None:
                    new_cat_id = cid
                elif cid in wanted_ids:
                    new_cat_id = wanted_ids.index(cid)
                else:
                    continue

                cat_ids = coco.getCatIds(catNms=[cat])
                img_ids = coco.getImgIds(catIds=cat_ids)
                for im in tqdm(coco.loadImgs(img_ids), desc=f"Class {cid + 1}/{len(names)} {cat}"):
                    img_w, img_h = im["width"], im["height"]
                    path = tmp_images / Path(im["file_name"]).name  # image filename
                    try:
                        ann_ids = coco.getAnnIds(imgIds=im["id"], catIds=cat_ids, iscrowd=False)
                        anns = coco.loadAnns(ann_ids)
                        if not anns:
                            continue
                        if not path.exists():
                            missed += 1
                            continue
                        label_file = labels_dir / path.with_suffix(".txt").name
                        # Append: the same image receives lines from every class it contains.
                        with open(label_file, "a") as out:
                            for ann in anns:
                                x, y, w, h = ann["bbox"]  # bounding box in xywh (xy top-left corner)
                                corners = np.array([x, y, x + w, y + h])[None]  # pixels(1,4)
                                # normalized and clipped
                                x, y, w, h = xyxy2xywhn(corners, w=img_w, h=img_h, clip=True)[0]
                                out.write(f"{new_cat_id} {x:.5f} {y:.5f} {w:.5f} {h:.5f}\n")
                    except Exception as err:
                        # Best-effort: report the problem and carry on with the image copy below.
                        print(err)

                    # Copy the image into the subset once; later classes find it already there.
                    copied_image = images / path.name
                    if not copied_image.exists():
                        print(f"rename {path} to {images / path.name}")
                        shutil.copyfile(str(path), str(copied_image))
                        n_images += 1

            print(f"{subset_name}{split} Missed images: {missed} Get images: {n_images}")

        # --- Cleanup: every needed image has been copied, drop the temporary ones ---------
        for f in tqdm(tmp_images.rglob("*.jpg"), desc=f"Removing tmp {split} images"):
            os.remove(str(f))
            print(f"Remove {f}")