From 791de39eb2137b8335e63dcea2924fa3771ed708 Mon Sep 17 00:00:00 2001 From: Panlichen Date: Tue, 11 Apr 2023 05:48:49 +0000 Subject: [PATCH] scripts --- configs/vit_imagenet_acc.py | 10 +++++----- tools/train_27.sh | 26 +++++++++++++------------- tools/train_27_25.sh | 6 +++--- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/configs/vit_imagenet_acc.py b/configs/vit_imagenet_acc.py index 6ed34531a..86ec34728 100644 --- a/configs/vit_imagenet_acc.py +++ b/configs/vit_imagenet_acc.py @@ -3,9 +3,9 @@ import os host = os.environ.get('HOST') if (host == "oneflow-25" or host == "oneflow-27"): - from .common.models.vit.vit_base_patch16_224 import model + # from .common.models.vit.vit_base_patch16_224 import model # from .common.models.vit.vit_huge_patch14_224 import model - # from .common.models.vit.vit_large_patch16_224 import model + from .common.models.vit.vit_large_patch16_224 import model else: from .common.models.vit.vit_small_patch16_224 import model @@ -91,11 +91,11 @@ # Distributed Settings train.dist.pipeline_num_layers = model.cfg.depth train.dist.tensor_parallel_size = 2 -train.dist.data_parallel_size = 2 +train.dist.data_parallel_size = 4 train.dist.pipeline_parallel_size = 2 train.train_micro_batch_size = 128 -train.num_accumulation_steps = train.dist.pipeline_parallel_size # global_batch_size = micro_batch_size * num_grad_acc * data_parallel_groups -train.train_micro_batch_size = 128 // train.num_accumulation_steps +# train.num_accumulation_steps = train.dist.pipeline_parallel_size # global_batch_size = micro_batch_size * num_grad_acc * data_parallel_groups +# train.train_micro_batch_size = 128 // train.num_accumulation_steps diff --git a/tools/train_27.sh b/tools/train_27.sh index 94deff3c1..661706701 100755 --- a/tools/train_27.sh +++ b/tools/train_27.sh @@ -84,12 +84,12 @@ elif [ $GPUS = 4 ]; then elif [ $GPUS = 8 ]; then #pure dp - # export ONEFLOW_OFCCL_SKIP_NEGO=0 - # export RECV_SUCCESS_FACTOR=30 - # export RECV_SUCCESS_THRESHOLD=100000000 - # export BASE_CTX_SWITCH_THRESHOLD=120000 - # export TOLERANT_UNPROGRESSED_CNT=180000 - # export NUM_TRY_TASKQ_HEAD=240 + export ONEFLOW_OFCCL_SKIP_NEGO=0 + export RECV_SUCCESS_FACTOR=30 + export RECV_SUCCESS_THRESHOLD=100000000 + export BASE_CTX_SWITCH_THRESHOLD=80000 + export TOLERANT_UNPROGRESSED_CNT=500000 + export NUM_TRY_TASKQ_HEAD=240 #pure tp # export ONEFLOW_OFCCL_SKIP_NEGO=0 @@ -147,13 +147,13 @@ elif [ $GPUS = 8 ]; then # export TOLERANT_UNPROGRESSED_CNT=80000 # export NUM_TRY_TASKQ_HEAD=10 - # 3d-acc_grad-base - export ONEFLOW_OFCCL_SKIP_NEGO=0 - export RECV_SUCCESS_FACTOR=5 - export RECV_SUCCESS_THRESHOLD=10000000 - export BASE_CTX_SWITCH_THRESHOLD=90000 - export TOLERANT_UNPROGRESSED_CNT=80000 - export NUM_TRY_TASKQ_HEAD=10 + # 3d-acc_grad-base-2 + # export ONEFLOW_OFCCL_SKIP_NEGO=0 + # export RECV_SUCCESS_FACTOR=5 + # export RECV_SUCCESS_THRESHOLD=10000000 + # export BASE_CTX_SWITCH_THRESHOLD=150000 + # export TOLERANT_UNPROGRESSED_CNT=120000 + # export NUM_TRY_TASKQ_HEAD=10 fi diff --git a/tools/train_27_25.sh b/tools/train_27_25.sh index 785c00041..f02b166d5 100755 --- a/tools/train_27_25.sh +++ b/tools/train_27_25.sh @@ -40,15 +40,15 @@ if [ -z $RUN_TYPE ];then # RUN_TYPE="NSYS" fi -# export ONEFLOW_ENABLE_OFCCL=1 +export ONEFLOW_ENABLE_OFCCL=1 export DISABLE_NCCL_COMPUTE_STREAM=1 # export ONEFLOW_TIME_SHAPE=1 export ONEFLOW_DEBUG_MODE=1 export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 -export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 +# export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1 # nn_graph*=1, -# export GLOG_v=1 +export GLOG_v=1 export SHOW_ALL_PREPARED_COLL=1