Skip to content

Commit

Permalink
[Enhancement] Support deepspeed with flexible runner (open-mmlab#1673)
Browse files Browse the repository at this point in the history
* [Feature] Support deepspeed with flexible runner

* [Fix] Reformat with yapf

* [Refactor] Rename configs

* [Fix] Reformat with yapf

* [Refactor] Remove unused keys

* [Refactor] Change the _base_ path

* [Refactor] Reformat
  • Loading branch information
fanqiNO1 authored Jun 29, 2023
1 parent 68758db commit 658db80
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
_base_ = ['./vit-huge-p14_8xb128-coslr-50e_in1k.py']

# Optimizer wrapper: delegate optimizer stepping/scaling to DeepSpeed.
optim_wrapper = {'type': 'DeepSpeedOptimWrapper'}

# Training strategy: DeepSpeed ZeRO stage 3 combined with fp16
# mixed precision (dynamic loss scaling enabled via loss_scale=0).
strategy = {
    'type': 'DeepSpeedStrategy',
    # fp16 settings — dynamic loss scaling with an initial scale of 2**15.
    'fp16': {
        'enabled': True,
        'fp16_master_weights_and_grads': False,
        'loss_scale': 0,
        'loss_scale_window': 500,
        'hysteresis': 2,
        'min_loss_scale': 1,
        'initial_scale_power': 15,
    },
    # Cast the listed data-batch entries to half precision before forward.
    'inputs_to_half': ['inputs'],
    # ZeRO-3: partition optimizer states, gradients and parameters.
    'zero_optimization': {
        'stage': 3,
        'allgather_partitions': True,
        'reduce_scatter': True,
        'allgather_bucket_size': 50000000,
        'reduce_bucket_size': 50000000,
        'overlap_comm': True,
        'contiguous_gradients': True,
        'cpu_offload': False,
    },
}

# Use the FlexibleRunner, which understands the `strategy` field above.
runner_type = 'FlexibleRunner'
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
_base_ = ['./vit-large-p16_8xb128-coslr-50e_in1k.py']

# Optimizer wrapper: DeepSpeed manages the optimizer step and grad scaling.
optim_wrapper = {'type': 'DeepSpeedOptimWrapper'}

# Training strategy: DeepSpeed ZeRO-3 + fp16 mixed precision.
strategy = {
    'type': 'DeepSpeedStrategy',
    # Dynamic fp16 loss scaling (loss_scale=0 means dynamic), starting
    # from 2**15 and adjusted over a 500-step window.
    'fp16': {
        'enabled': True,
        'fp16_master_weights_and_grads': False,
        'loss_scale': 0,
        'loss_scale_window': 500,
        'hysteresis': 2,
        'min_loss_scale': 1,
        'initial_scale_power': 15,
    },
    # Convert these batch keys to half precision before the forward pass.
    'inputs_to_half': ['inputs'],
    # ZeRO stage 3 shards optimizer states, gradients and parameters.
    'zero_optimization': {
        'stage': 3,
        'allgather_partitions': True,
        'reduce_scatter': True,
        'allgather_bucket_size': 50000000,
        'reduce_bucket_size': 50000000,
        'overlap_comm': True,
        'contiguous_gradients': True,
        'cpu_offload': False,
    },
}

# The FlexibleRunner is required for strategy-based training.
runner_type = 'FlexibleRunner'
9 changes: 8 additions & 1 deletion tools/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import mmengine
from mmengine.config import Config, ConfigDict, DictAction
from mmengine.evaluator import DumpResults
from mmengine.registry import RUNNERS
from mmengine.runner import Runner


Expand Down Expand Up @@ -169,7 +170,13 @@ def main():
cfg = merge_args(cfg, args)

# build the runner from config
runner = Runner.from_cfg(cfg)
if 'runner_type' not in cfg:
# build the default runner
runner = Runner.from_cfg(cfg)
else:
# build customized runner from the registry
# if 'runner_type' is set in the cfg
runner = RUNNERS.build(cfg)

if args.out and args.out_item in ['pred', None]:
runner.test_evaluator.metrics.append(
Expand Down
9 changes: 8 additions & 1 deletion tools/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from copy import deepcopy

from mmengine.config import Config, ConfigDict, DictAction
from mmengine.registry import RUNNERS
from mmengine.runner import Runner
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION
Expand Down Expand Up @@ -149,7 +150,13 @@ def main():
cfg = merge_args(cfg, args)

# build the runner from config
runner = Runner.from_cfg(cfg)
if 'runner_type' not in cfg:
# build the default runner
runner = Runner.from_cfg(cfg)
else:
# build customized runner from the registry
# if 'runner_type' is set in the cfg
runner = RUNNERS.build(cfg)

# start training
runner.train()
Expand Down

0 comments on commit 658db80

Please sign in to comment.