import argparse
import os
import sys
from shutil import copy2

import cv2
import numpy as np
import tensorflow as tf


def download_dataset():
    BASE_PATH = tf.keras.utils.get_file(
        'flower_photos',
        'http://download.tensorflow.org/example_images/flower_photos.tgz',
        untar=True, cache_dir='.')
    print(f"Downloaded and extracted at {BASE_PATH}")
    return BASE_PATH
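
# NOTE: with cache_dir='.', tf.keras.utils.get_file places downloads under the
# default 'datasets' cache subdirectory, so BASE_PATH typically resolves to
# './datasets/flower_photos'.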

def split_dataset(BASE_PATH='flower_photos', DATASET_PATH='dataset', train_images=300, val_images=50):
    # Class folders inside the downloaded dataset
    classes = os.listdir(BASE_PATH)

    # Root folder for the train and val splits
    os.makedirs(DATASET_PATH, exist_ok=True)

    # Create the train directory
    train_dir = os.path.join(DATASET_PATH, 'train')
    os.makedirs(train_dir, exist_ok=True)

    # Create the val directory
    val_dir = os.path.join(DATASET_PATH, 'val')
    os.makedirs(val_dir, exist_ok=True)

    # Copy images from the original folder into the train and val splits
    for class_name in classes:
        # Skip plain files such as LICENSE.txt; class entries are
        # extension-less directories
        if len(class_name.split('.')) >= 2:
            continue

        print(f"Copying images for {class_name}...")

        # Create the per-class destination folders (train and val)
        class_train_dir = os.path.join(train_dir, class_name)
        os.makedirs(class_train_dir, exist_ok=True)
        class_val_dir = os.path.join(val_dir, class_name)
        os.makedirs(class_val_dir, exist_ok=True)

        # Shuffle the image list so the split is random
        class_path = os.path.join(BASE_PATH, class_name)
        class_images = os.listdir(class_path)
        np.random.shuffle(class_images)

        for image in class_images[:train_images]:
            copy2(os.path.join(class_path, image), class_train_dir)
        for image in class_images[train_images:train_images + val_images]:
            copy2(os.path.join(class_path, image), class_val_dir)
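
# Resulting layout (for illustration):
#   dataset/
#       train/<class_name>/...   (up to train_images files per class)
#       val/<class_name>/...     (up to val_images files per class)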

def get_dataset_stats(DATASET_PATH='dataset'):
    """
    This utility reports the following stats for the dataset:
        TOTAL_IMAGES: total number of images for each class in the train and val sets
        AVG_IMG_HEIGHT: average image height across the complete dataset (train and val)
        AVG_IMG_WIDTH: average image width across the complete dataset (train and val)
        MIN_HEIGHT: minimum image height across the complete dataset (train and val)
        MIN_WIDTH: minimum image width across the complete dataset (train and val)
        MAX_HEIGHT: maximum image height across the complete dataset (train and val)
        MAX_WIDTH: maximum image width across the complete dataset (train and val)
    NOTE: every image is read from disk once, so this can be slow on large datasets.
    """
    train_dir = os.path.join(DATASET_PATH, 'train')
    val_dir = os.path.join(DATASET_PATH, 'val')

    len_classes = len(os.listdir(train_dir))
    assert len(os.listdir(train_dir)) == len(os.listdir(val_dir))

    avg_height = 0
    min_height = np.inf
    max_height = 0

    avg_width = 0
    min_width = np.inf
    max_width = 0

    total_train = 0
    print('\nTraining dataset stats:')
    for class_name in os.listdir(train_dir):
        class_path = os.path.join(train_dir, class_name)
        class_images = os.listdir(class_path)
        for img_name in class_images:
            h, w, c = cv2.imread(os.path.join(class_path, img_name)).shape
            avg_height += h
            avg_width += w
            min_height = min(min_height, h)
            min_width = min(min_width, w)
            max_height = max(max_height, h)
            max_width = max(max_width, w)
        total_train += len(class_images)
        print(f'--> Images in {class_name}: {len(class_images)}')

    total_val = 0
    print('\nValidation dataset stats:')
    for class_name in os.listdir(val_dir):
        class_path = os.path.join(val_dir, class_name)
        class_images = os.listdir(class_path)
        for img_name in class_images:
            h, w, c = cv2.imread(os.path.join(class_path, img_name)).shape
            avg_height += h
            avg_width += w
            min_height = min(min_height, h)
            min_width = min(min_width, w)
            max_height = max(max_height, h)
            max_width = max(max_width, w)
        total_val += len(class_images)
        print(f'--> Images in {class_name}: {len(class_images)}')

    # The height/width sums accumulate over both splits, so the averages must
    # divide by the full image count, not just the training count.
    IMG_HEIGHT = avg_height // (total_train + total_val)
    IMG_WIDTH = avg_width // (total_train + total_val)

    print()
    print(f'AVG_IMG_HEIGHT: {IMG_HEIGHT}')
    print(f'AVG_IMG_WIDTH: {IMG_WIDTH}')
    print(f'MIN_HEIGHT: {min_height}')
    print(f'MIN_WIDTH: {min_width}')
    print(f'MAX_HEIGHT: {max_height}')
    print(f'MAX_WIDTH: {max_width}')
    print()

    return len_classes, train_dir, val_dir, IMG_HEIGHT, IMG_WIDTH, total_train, total_val
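
# Example (hypothetical): unpack the stats when wiring up a training script.
#   len_classes, train_dir, val_dir, IMG_HEIGHT, IMG_WIDTH, n_train, n_val = \
#       get_dataset_stats('dataset')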

def parse_args(args):
    """
    Example command:
        $ python data.py --train-count 500 --val-count 100
    """
    parser = argparse.ArgumentParser(description='Download the flower_photos dataset and split it into train and val sets')
    parser.add_argument('--train-count', type=int, default=300, help='Number of training images to be used for each class.')
    parser.add_argument('--val-count', type=int, default=50, help='Number of validation images to be used for each class.')
    return parser.parse_args(args)

def main(args=None):
    # Parse command line arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    BASE_PATH = download_dataset()

    # Number of images required in the train and val sets
    train_images = args.train_count
    val_images = args.val_count

    split_dataset(BASE_PATH=BASE_PATH, train_images=train_images, val_images=val_images)
    get_dataset_stats()


if __name__ == "__main__":
    main()