Sharding of data for Horovod distributed training for SSD and YOLOv3 #1610

Open · wants to merge 2 commits into master
gluoncv/data/pascal_voc/detection.py (11 additions, 0 deletions)
@@ -54,6 +54,7 @@ def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'voc'),
                  transform=None, index_map=None, preload_label=True):
         super(VOCDetection, self).__init__(root)
         self._im_shapes = {}
+        self._im_aspect_ratios = None
         self._root = os.path.expanduser(root)
         self._transform = transform
         self._splits = splits
@@ -151,6 +152,16 @@ def _preload_labels(self):
         logging.debug("Preloading %s labels into memory...", str(self))
         return [self._load_label(idx) for idx in range(len(self))]
 
+    def get_im_aspect_ratio(self):
+        """Return the aspect ratio of each image in the order of the raw data."""
+        if self._im_aspect_ratios is not None:
+            return self._im_aspect_ratios
+        self._im_aspect_ratios = [None] * len(self._im_shapes)
+        for i, im_shape in self._im_shapes.items():
+            self._im_aspect_ratios[i] = 1.0 * im_shape[0] / im_shape[1]
+
+        return self._im_aspect_ratios
+
 
 class CustomVOCDetection(VOCDetection):
     """Custom Pascal VOC detection Dataset.
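The new accessor exposes one aspect ratio per image without re-reading the images: the shapes are cached in self._im_shapes as labels are loaded. A hypothetical usage sketch, not part of this PR, of how a caller might order indices by shape so batches of similar images pad less:

    # Hypothetical sketch (not in this PR); assumes preload_label=True so that
    # self._im_shapes is already fully populated before the call.
    dataset = VOCDetection(splits=[(2007, 'trainval'), (2012, 'trainval')])
    ratios = dataset.get_im_aspect_ratio()   # one float per image, raw-data order
    order = sorted(range(len(ratios)), key=ratios.__getitem__)  # group similar shapes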
scripts/detection/ssd/train_ssd.py (12 additions, 4 deletions)
@@ -126,9 +126,15 @@ def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_
     _, _, anchors = net(mx.nd.zeros((1, 3, height, width), ctx))
     anchors = anchors.as_in_context(mx.cpu())
     batchify_fn = Tuple(Stack(), Stack(), Stack())  # stack image, cls_targets, box_targets
+    train_sampler = \
+        gcv.nn.sampler.SplitSortedBucketSampler(train_dataset.get_im_aspect_ratio(),
+                                                batch_size,
+                                                num_parts=hvd.size() if args.horovod else 1,
+                                                part_index=hvd.rank() if args.horovod else 0,
+                                                shuffle=True)
     train_loader = gluon.data.DataLoader(
         train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)),
-        batch_size, True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
+        batch_sampler=train_sampler, batchify_fn=batchify_fn, num_workers=num_workers)
     val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
     val_loader = gluon.data.DataLoader(
         val_dataset.transform(SSDDefaultValTransform(width, height)),
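SplitSortedBucketSampler replaces the previous shuffle-and-rollover DataLoader arguments: it shards the index list across Horovod workers via num_parts/part_index and sorts within buckets by aspect ratio. A simplified sketch of the sharding contract, my own illustration rather than the gluoncv implementation:

    # Illustration only: each of num_parts workers gets a disjoint, equal-sized
    # slice of the indices, so no sample is trained twice per epoch across ranks.
    # (Remainder handling may differ in the real sampler.)
    def shard(indices, num_parts, part_index):
        part_len = len(indices) // num_parts
        start = part_len * part_index
        return indices[start:start + part_len]

With hvd.size() == 4, rank 2 would iterate only shard(indices, 4, 2).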
@@ -341,15 +347,17 @@ def train(net, train_data, val_data, eval_metric, ctx, args):
                 name1, loss1 = ce_metric.get()
                 name2, loss2 = smoothl1_metric.get()
                 logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
-                    epoch, i, args.batch_size/(time.time()-btic), name1, loss1, name2, loss2))
-            btic = time.time()
+                    epoch, i, args.log_interval * args.batch_size / (time.time() - btic),
+                    name1, loss1, name2, loss2))
+                btic = time.time()
 
         if (not args.horovod or hvd.rank() == 0):
             name1, loss1 = ce_metric.get()
             name2, loss2 = smoothl1_metric.get()
             logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
                 epoch, (time.time()-tic), name1, loss1, name2, loss2))
-            if (epoch % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0):
+            if ((epoch + 1) % args.val_interval == 0) or \
+                    (args.save_interval and (epoch + 1) % args.save_interval == 0):
                 # consider reduce the frequency of validation to save time
                 map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                 val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
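Two logging fixes ride along. First, the btic reset moves inside the logging branch, so the elapsed time now covers log_interval batches rather than one; the samples/sec numerator scales by the same factor, turning a noisy single-batch reading into an average over the whole logging window. A quick consistency check with assumed numbers:

    # Assumed example values: logging every 100 batches of 32 samples, 10 s apart.
    log_interval, batch_size, window = 100, 32, 10.0
    avg_speed = log_interval * batch_size / window   # 320.0 samples/sec over the window
    # The old code reset btic every batch, so batch_size / (time for one batch)
    # gave a single-batch reading (32 / 0.1 s == 320 samples/sec here, but noisier).

Second, switching to (epoch + 1) stops validation and checkpointing from firing at epoch 0 and runs them at the end of each interval instead.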
scripts/detection/yolo/train_yolo3.py (18 additions, 5 deletions)
@@ -130,15 +130,26 @@ def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_
     """Get dataloader."""
     width, height = data_shape, data_shape
     batchify_fn = Tuple(*([Stack() for _ in range(6)] + [Pad(axis=0, pad_val=-1) for _ in range(1)]))  # stack image, all targets generated
+
+    if args.mixup:
+        im_aspect_ratio = train_dataset._dataset.get_im_aspect_ratio()
+    else:
+        im_aspect_ratio = train_dataset.get_im_aspect_ratio()
+    train_sampler = \
+        gcv.nn.sampler.SplitSortedBucketSampler(im_aspect_ratio,
+                                                batch_size,
+                                                num_parts=hvd.size() if args.horovod else 1,
+                                                part_index=hvd.rank() if args.horovod else 0,
+                                                shuffle=True)
     if args.no_random_shape:
         train_loader = gluon.data.DataLoader(
             train_dataset.transform(YOLO3DefaultTrainTransform(width, height, net, mixup=args.mixup)),
-            batch_size, True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
+            batch_sampler=train_sampler, batchify_fn=batchify_fn, num_workers=num_workers)
     else:
         transform_fns = [YOLO3DefaultTrainTransform(x * 32, x * 32, net, mixup=args.mixup) for x in range(10, 20)]
         train_loader = RandomTransformDataLoader(
-            transform_fns, train_dataset, batch_size=batch_size, interval=10, last_batch='rollover',
-            shuffle=True, batchify_fn=batchify_fn, num_workers=num_workers)
+            transform_fns, train_dataset, interval=10,
+            batch_sampler=train_sampler, batchify_fn=batchify_fn, num_workers=num_workers)
     val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
     val_loader = gluon.data.DataLoader(
         val_dataset.transform(YOLO3DefaultValTransform(width, height)),
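The _dataset indirection in the mixup branch exists because --mixup wraps the raw dataset in a mixup object (MixupDetection in gluoncv), and only the inner dataset carries the cached image shapes. A minimal sketch of that wrapper pattern, with a hypothetical class name:

    # Illustration only; hypothetical stand-in for the real mixup wrapper.
    class MixupWrapper:
        def __init__(self, dataset):
            self._dataset = dataset        # inner dataset keeps get_im_aspect_ratio()

        def __getitem__(self, idx):
            return self._dataset[idx]      # the real wrapper would blend two samples

    dataset = MixupWrapper(VOCDetection(splits=[(2007, 'trainval')]))
    ratios = dataset._dataset.get_im_aspect_ratio()   # mirrors the args.mixup branch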
@@ -302,8 +313,10 @@ def train(net, train_data, val_data, eval_metric, ctx, args):
                 name3, loss3 = scale_metrics.get()
                 name4, loss4 = cls_metrics.get()
                 logger.info('[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
-                    epoch, i, trainer.learning_rate, args.batch_size/(time.time()-btic), name1, loss1, name2, loss2, name3, loss3, name4, loss4))
-            btic = time.time()
+                    epoch, i, trainer.learning_rate,
+                    args.log_interval * args.batch_size / (time.time() - btic),
+                    name1, loss1, name2, loss2, name3, loss3, name4, loss4))
+                btic = time.time()
 
         if (not args.horovod or hvd.rank() == 0):
             name1, loss1 = obj_metrics.get()