Implementing ViT with ColoTensor using ZeroDDP #1162
Replies: 4 comments
-
Where's the definition of
-
I also suspect that your inputs are not on CUDA.
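A minimal sketch of that suggestion, assuming a standard PyTorch-style loop (the dataloader and model names below are placeholders, not from the original post): move each batch onto the current device before the forward pass, and also match the model's dtype if it has been cast to half precision.

from colossalai.utils import get_current_device

# Sketch only: move CPU batches from a host-side dataloader onto the
# current CUDA device before calling the ZeroDDP-wrapped model.
# `dataloader` and `model` are placeholders.
for imgs, labels in dataloader:
    imgs = imgs.to(get_current_device(), non_blocking=True)
    labels = labels.to(get_current_device(), non_blocking=True)
    output = model(imgs)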
-
I replaced the DALI dataloader with random data, and the code works well.

import glob
import os

import colossalai
import torch
import torch.nn as nn
import colossalai.utils as utils
from colossalai.utils import colo_set_process_memory_fraction, get_current_device, MultiTimer
from colossalai.utils.model.colo_init_context import ColoInitContext
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn import Accuracy, CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import HybridAdam
from colossalai.nn._ops import *
from colossalai.nn.parallel.layers import init_colo_module
from colossalai.nn.parallel import ZeroDDP
from colossalai.nn.parallel.data_parallel import ColoDDP
from colossalai.zero import ZeroOptimizer
from colossalai.trainer import Trainer, hooks
from colossalai.tensor import TensorSpec, ComputePattern, ParallelAction, DistSpecManager, distspec, ChunkManager
from colossalai.gemini.gemini_mgr import GeminiManager
from titans.dataloader.imagenet import build_dali_imagenet
from timm.models.vision_transformer import _create_vision_transformer, vit_tiny_patch16_224
from tqdm import tqdm


def init_spec_func(model, tp_type):
    # Shard eligible parameters across the 1D tensor-parallel group,
    # skipping norm layers and the patch-embedding projection.
    if tp_type == 'row':
        spec = TensorSpec(
            distspec.shard(gpc.get_group(ParallelMode.PARALLEL_1D), [-1],
                           [gpc.get_world_size(ParallelMode.PARALLEL_1D)]),
            ParallelAction(ComputePattern.TP1D))
        with DistSpecManager.no_grad():
            for n, p in model.named_parameters():
                if 'weight' in n and 'norm' not in n and 'patch_embed.proj.weight' not in n:
                    p.set_spec(spec)
    elif tp_type == 'col':
        spec = TensorSpec(
            distspec.shard(gpc.get_group(ParallelMode.PARALLEL_1D), [0],
                           [gpc.get_world_size(ParallelMode.PARALLEL_1D)]),
            ParallelAction(ComputePattern.TP1D))
        with DistSpecManager.no_grad():
            for n, p in model.named_parameters():
                if (('weight' in n or 'bias' in n) and 'norm' not in n
                        and 'patch_embed.proj.weight' not in n and 'patch_embed.proj.bias' not in n):
                    p.set_spec(spec)
    else:
        raise NotImplementedError


def get_data(batch_size):
    # Random images/labels generated directly on the current CUDA device,
    # standing in for the DALI ImageNet dataloader.
    imgs = torch.rand(batch_size, 3, 224, 224, device=get_current_device())
    labels = torch.randint(0, 1000, (batch_size,), device=get_current_device())
    return imgs, labels


def train_imagenet():
    colossalai.launch_from_torch(config={'parallel': {'tensor': {'mode': '1d', 'size': 2}}})
    disable_existing_loggers()
    logger = get_dist_logger()
    logger.info('Build model', ranks=[0])

    use_chunk = True
    use_zero = True
    placement_policy = 'cuda'

    with ColoInitContext(device=get_current_device()):
        model = vit_tiny_patch16_224()
        model = model.cuda().half()
    init_spec_func(model, 'col')

    chunk_size = ChunkManager.search_chunk_size(model, 8192, 8) if use_chunk else None
    chunk_manager = ChunkManager(chunk_size,
                                 enable_distributed_storage=use_zero,
                                 init_device=GeminiManager.get_default_device(placement_policy))
    gemini_manager = GeminiManager(placement_policy, chunk_manager)
    model = ZeroDDP(model, gemini_manager)
    logger.info(chunk_manager, ranks=[0])

    logger.info('Build criterion, optimizer, lr_scheduler', ranks=[0])
    criterion = CrossEntropyLoss(label_smoothing=0.1)
    optimizer = HybridAdam(model.parameters(), lr=1e-3, weight_decay=0)
    optimizer = ZeroOptimizer(optimizer, model, initial_scale=32)
    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=10, warmup_steps=1)

    model.train()
    for i in range(10):
        imgs, labels = get_data(16)
        optimizer.zero_grad()
        output = model(imgs)
        loss = criterion(output, labels)
        optimizer.backward(loss)
        optimizer.step()
        logger.info(f'Step [{i+1}/10] loss: {loss.item():.3f}', ranks=[0])


if __name__ == '__main__':
    train_imagenet()
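For completeness: since the script uses colossalai.launch_from_torch, it should be started with the PyTorch distributed launcher, e.g. torchrun --nproc_per_node=4 train.py (the script name and the assumption of a single 4-GPU node, giving 2-way data parallelism on top of the size-2 tensor-parallel group, are mine, not from the original post).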
-
Thanks. That works for me.
-
I am trying to implement ViT with ColoTensor using ZeroDDP.
The code snippet is shown below.
I use a 2DP + 2TP setting with
parallel = dict(tensor=dict(mode="1d", size=2),)
and get the following error message.
Any suggestions?
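For reference, a minimal sketch of how that setting is passed to ColossalAI, mirroring the launch call in the working script posted above (the CONFIG name is just illustrative):

import colossalai

# 2-way 1D tensor parallelism; on 4 GPUs this yields the 2DP + 2TP layout
# described above.
CONFIG = dict(parallel=dict(tensor=dict(mode='1d', size=2)))
colossalai.launch_from_torch(config=CONFIG)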