Implementing ViT with ColoTensor using ZeroDDP #1162
Replies: 4 comments
-
Where's the definition of
-
I also suspect that your inputs are not on CUDA.
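A minimal sketch of that suggestion, assuming a standard PyTorch-style loop (the dataloader and model names below are placeholders, not from the original post): move each batch onto the current device before the forward pass, and also match the model's dtype if it has been cast to half precision.

from colossalai.utils import get_current_device

# Sketch only: move CPU batches from a host-side dataloader onto the
# current CUDA device before calling the ZeroDDP-wrapped model.
# `dataloader` and `model` are placeholders.
for imgs, labels in dataloader:
    imgs = imgs.to(get_current_device(), non_blocking=True)
    labels = labels.to(get_current_device(), non_blocking=True)
    output = model(imgs)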
-
I replaced the DALI dataloader with random data, and the code works well.

import glob
import os

import colossalai
import torch
import torch.nn as nn
import colossalai.utils as utils
from colossalai.utils import colo_set_process_memory_fraction, get_current_device, MultiTimer
from colossalai.utils.model.colo_init_context import ColoInitContext
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn import Accuracy, CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import HybridAdam
from colossalai.nn._ops import *
from colossalai.nn.parallel.layers import init_colo_module
from colossalai.nn.parallel import ZeroDDP
from colossalai.nn.parallel.data_parallel import ColoDDP
from colossalai.zero import ZeroOptimizer
from colossalai.trainer import Trainer, hooks
from colossalai.tensor import TensorSpec, ComputePattern, ParallelAction, DistSpecManager, distspec, ChunkManager
from colossalai.gemini.gemini_mgr import GeminiManager
from titans.dataloader.imagenet import build_dali_imagenet
from timm.models.vision_transformer import _create_vision_transformer, vit_tiny_patch16_224
from tqdm import tqdm


def init_spec_func(model, tp_type):
    # Shard eligible parameters across the 1D tensor-parallel group,
    # skipping norm layers and the patch-embedding projection.
    if tp_type == 'row':
        spec = TensorSpec(
            distspec.shard(gpc.get_group(ParallelMode.PARALLEL_1D), [-1],
                           [gpc.get_world_size(ParallelMode.PARALLEL_1D)]),
            ParallelAction(ComputePattern.TP1D))
        with DistSpecManager.no_grad():
            for n, p in model.named_parameters():
                if 'weight' in n and 'norm' not in n and 'patch_embed.proj.weight' not in n:
                    p.set_spec(spec)
    elif tp_type == 'col':
        spec = TensorSpec(
            distspec.shard(gpc.get_group(ParallelMode.PARALLEL_1D), [0],
                           [gpc.get_world_size(ParallelMode.PARALLEL_1D)]),
            ParallelAction(ComputePattern.TP1D))
        with DistSpecManager.no_grad():
            for n, p in model.named_parameters():
                if (('weight' in n or 'bias' in n) and 'norm' not in n
                        and 'patch_embed.proj.weight' not in n and 'patch_embed.proj.bias' not in n):
                    p.set_spec(spec)
    else:
        raise NotImplementedError


def get_data(batch_size):
    # Random images/labels generated directly on the current CUDA device,
    # standing in for the DALI ImageNet dataloader.
    imgs = torch.rand(batch_size, 3, 224, 224, device=get_current_device())
    labels = torch.randint(0, 1000, (batch_size,), device=get_current_device())
    return imgs, labels


def train_imagenet():
    colossalai.launch_from_torch(config={'parallel': {'tensor': {'mode': '1d', 'size': 2}}})
    disable_existing_loggers()
    logger = get_dist_logger()
    logger.info('Build model', ranks=[0])

    use_chunk = True
    use_zero = True
    placement_policy = 'cuda'

    with ColoInitContext(device=get_current_device()):
        model = vit_tiny_patch16_224()
        model = model.cuda().half()
    init_spec_func(model, 'col')

    chunk_size = ChunkManager.search_chunk_size(model, 8192, 8) if use_chunk else None
    chunk_manager = ChunkManager(chunk_size,
                                 enable_distributed_storage=use_zero,
                                 init_device=GeminiManager.get_default_device(placement_policy))
    gemini_manager = GeminiManager(placement_policy, chunk_manager)
    model = ZeroDDP(model, gemini_manager)
    logger.info(chunk_manager, ranks=[0])

    logger.info('Build criterion, optimizer, lr_scheduler', ranks=[0])
    criterion = CrossEntropyLoss(label_smoothing=0.1)
    optimizer = HybridAdam(model.parameters(), lr=1e-3, weight_decay=0)
    optimizer = ZeroOptimizer(optimizer, model, initial_scale=32)
    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=10, warmup_steps=1)

    model.train()
    for i in range(10):
        imgs, labels = get_data(16)
        optimizer.zero_grad()
        output = model(imgs)
        loss = criterion(output, labels)
        optimizer.backward(loss)
        optimizer.step()
        logger.info(f'Step [{i+1}/10] loss: {loss.item():.3f}', ranks=[0])


if __name__ == '__main__':
    train_imagenet()
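For completeness: since the script uses colossalai.launch_from_torch, it should be started with the PyTorch distributed launcher, e.g. torchrun --nproc_per_node=4 train.py (the script name and the assumption of a single 4-GPU node, giving 2-way data parallelism on top of the size-2 tensor-parallel group, are mine, not from the original post).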
-
Thanks. That works for me.
-
I am trying to implement ViT with ColoTensor using ZeroDDP.
The code snippet is shown below.
I use a 2DP + 2TP setting with
parallel = dict(tensor=dict(mode="1d", size=2),)
and get the following error message.
Any suggestions?
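For reference, a minimal sketch of how that setting is passed to ColossalAI, mirroring the launch call in the working script posted above (the CONFIG name is just illustrative):

import colossalai

# 2-way 1D tensor parallelism; on 4 GPUs this yields the 2DP + 2TP layout
# described above.
CONFIG = dict(parallel=dict(tensor=dict(mode='1d', size=2)))
colossalai.launch_from_torch(config=CONFIG)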