Skip to content

Commit

Permalink
scripts
Browse files: browse the repository at this point in the history
  • Loading branch information
Panlichen committed Apr 11, 2023
1 parent 0be3a6d commit 791de39
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 21 deletions.
10 changes: 5 additions & 5 deletions configs/vit_imagenet_acc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import os
host = os.environ.get('HOST')
if (host == "oneflow-25" or host == "oneflow-27"):
from .common.models.vit.vit_base_patch16_224 import model
# from .common.models.vit.vit_base_patch16_224 import model
# from .common.models.vit.vit_huge_patch14_224 import model
# from .common.models.vit.vit_large_patch16_224 import model
from .common.models.vit.vit_large_patch16_224 import model
else:
from .common.models.vit.vit_small_patch16_224 import model

Expand Down Expand Up @@ -91,11 +91,11 @@
# Distributed Settings
train.dist.pipeline_num_layers = model.cfg.depth
train.dist.tensor_parallel_size = 2
train.dist.data_parallel_size = 2
train.dist.data_parallel_size = 4
train.dist.pipeline_parallel_size = 2

train.train_micro_batch_size = 128

train.num_accumulation_steps = train.dist.pipeline_parallel_size # global_batch_size = micro_batch_size * num_grad_acc * data_parallel_groups
train.train_micro_batch_size = 128 // train.num_accumulation_steps
# train.num_accumulation_steps = train.dist.pipeline_parallel_size # global_batch_size = micro_batch_size * num_grad_acc * data_parallel_groups
# train.train_micro_batch_size = 128 // train.num_accumulation_steps

26 changes: 13 additions & 13 deletions tools/train_27.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,12 @@ elif [ $GPUS = 4 ]; then
elif [ $GPUS = 8 ]; then

# pure dp (data parallelism only)
# export ONEFLOW_OFCCL_SKIP_NEGO=0
# export RECV_SUCCESS_FACTOR=30
# export RECV_SUCCESS_THRESHOLD=100000000
# export BASE_CTX_SWITCH_THRESHOLD=120000
# export TOLERANT_UNPROGRESSED_CNT=180000
# export NUM_TRY_TASKQ_HEAD=240
export ONEFLOW_OFCCL_SKIP_NEGO=0
export RECV_SUCCESS_FACTOR=30
export RECV_SUCCESS_THRESHOLD=100000000
export BASE_CTX_SWITCH_THRESHOLD=80000
export TOLERANT_UNPROGRESSED_CNT=500000
export NUM_TRY_TASKQ_HEAD=240

# pure tp (tensor parallelism only)
# export ONEFLOW_OFCCL_SKIP_NEGO=0
Expand Down Expand Up @@ -147,13 +147,13 @@ elif [ $GPUS = 8 ]; then
# export TOLERANT_UNPROGRESSED_CNT=80000
# export NUM_TRY_TASKQ_HEAD=10

# 3d-acc_grad-base
export ONEFLOW_OFCCL_SKIP_NEGO=0
export RECV_SUCCESS_FACTOR=5
export RECV_SUCCESS_THRESHOLD=10000000
export BASE_CTX_SWITCH_THRESHOLD=90000
export TOLERANT_UNPROGRESSED_CNT=80000
export NUM_TRY_TASKQ_HEAD=10
# 3d-acc_grad-base-2
# export ONEFLOW_OFCCL_SKIP_NEGO=0
# export RECV_SUCCESS_FACTOR=5
# export RECV_SUCCESS_THRESHOLD=10000000
# export BASE_CTX_SWITCH_THRESHOLD=150000
# export TOLERANT_UNPROGRESSED_CNT=120000
# export NUM_TRY_TASKQ_HEAD=10


fi
Expand Down
6 changes: 3 additions & 3 deletions tools/train_27_25.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@ if [ -z $RUN_TYPE ];then
# RUN_TYPE="NSYS"
fi

# export ONEFLOW_ENABLE_OFCCL=1
export ONEFLOW_ENABLE_OFCCL=1
export DISABLE_NCCL_COMPUTE_STREAM=1
# export ONEFLOW_TIME_SHAPE=1
export ONEFLOW_DEBUG_MODE=1
export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1

export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1
# export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1
# nn_graph*=1,
# export GLOG_v=1
export GLOG_v=1

export SHOW_ALL_PREPARED_COLL=1

Expand Down

0 comments on commit 791de39

Please sign in to comment.