Skip to content

Commit

Permalink
Fixes errors in vision model pipelines
Browse files Browse the repository at this point in the history
  • Loading branch information
jon-barker authored and jaredcasper committed Sep 12, 2023
1 parent 4c5f77f commit b492498
Show file tree
Hide file tree
Showing 12 changed files with 225 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ build
*~
slurm*
logs
.vscode
64 changes: 64 additions & 0 deletions examples/pretrain_vision_classify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#! /bin/bash

# Pre-trains a ViT-based image classification model by launching
# pretrain_vision_classification.py through torchrun.
#
# Replace every <...> placeholder below before running; left as-is they are
# not valid shell.

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1

# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g. If using imagenet, one train image might be, train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>

# Used both as the checkpoint save/load location and as the tensorboard
# directory (see OUTPUT_ARGS below).
CHECKPOINT_PATH=<Specify path>

# Model, optimizer and schedule options.
CLASSIFIER_ARGS="
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "

# Dataset options; --data-path receives the train path followed by the
# validation path.
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"

# Logging, checkpointing and evaluation cadence. The name must match the
# $OUTPUT_ARGS expansion in the launch command, otherwise an unset variable
# expands to nothing and these options are silently dropped.
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"

# The *_ARGS variables are expanded unquoted on purpose so that the shell
# word-splits them into individual command-line arguments.
torchrun pretrain_vision_classification.py \
    $CLASSIFIER_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH

67 changes: 67 additions & 0 deletions examples/pretrain_vision_dino.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#! /bin/bash

# Pre-trains Dino V1 model by launching pretrain_vision_dino.py through
# torchrun.
# For model details: https://arxiv.org/abs/2104.14294
# For original author implementation: https://github.com/facebookresearch/dino/tree/main
#
# Replace every <...> placeholder below before running; left as-is they are
# not valid shell.

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1

# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g. If using imagenet, one train image might be, train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>

# Used both as the checkpoint save/load location and as the tensorboard
# directory (see OUTPUT_ARGS below).
CHECKPOINT_PATH=<Specify path>

# Model, optimizer and schedule options.
DINO_ARGS="
--vision-pretraining-type dino \
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "

# Dataset options; --data-path receives the train path followed by the
# validation path.
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"

# Logging, checkpointing and evaluation cadence. The name must match the
# $OUTPUT_ARGS expansion in the launch command, otherwise an unset variable
# expands to nothing and these options are silently dropped.
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"

# The *_ARGS variables are expanded unquoted on purpose so that the shell
# word-splits them into individual command-line arguments.
torchrun pretrain_vision_dino.py \
    $DINO_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH

65 changes: 65 additions & 0 deletions examples/pretrain_vision_inpaint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#! /bin/bash

# Pre-trains a ViT-based image inpainting model by launching
# pretrain_vision_inpaint.py through torchrun.
#
# Replace every <...> placeholder below before running; left as-is they are
# not valid shell.

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1

# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g. If using imagenet, one train image might be, train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>

# Used both as the checkpoint save/load location and as the tensorboard
# directory (see OUTPUT_ARGS below).
CHECKPOINT_PATH=<Specify path>

# Model, optimizer and schedule options.
INPAINT_ARGS="
--vision-pretraining-type inpaint \
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "

# Dataset options; --data-path receives the train path followed by the
# validation path.
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"

# Logging, checkpointing and evaluation cadence. The name must match the
# $OUTPUT_ARGS expansion in the launch command, otherwise an unset variable
# expands to nothing and these options are silently dropped.
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"

# The *_ARGS variables are expanded unquoted on purpose so that the shell
# word-splits them into individual command-line arguments.
torchrun pretrain_vision_inpaint.py \
    $INPAINT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH

2 changes: 1 addition & 1 deletion megatron/data/autoaugment.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def __init__(
"rotate": np.linspace(0, 30, num_levels),
"color": np.linspace(0.0, 0.9, num_levels),
"posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(
np.int
np.int32
),
"solarize": np.linspace(256, 0, num_levels), # range [0, 256]
"contrast": np.linspace(0.0, 0.9, num_levels),
Expand Down
5 changes: 3 additions & 2 deletions megatron/model/vision/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def __init__(self, config, num_classes, finetune=False,
pre_process=True, post_process=True):
super(VitClassificationModel, self).__init__()
args = get_args()
self.config = config

self.hidden_size = args.hidden_size
self.num_classes = num_classes
Expand All @@ -29,10 +30,10 @@ def __init__(self, config, num_classes, finetune=False,
post_process=self.post_process,
single_token_output=True
)

if self.post_process:
if not self.finetune:
self.head = VitMlpHead(self.hidden_size, self.num_classes)
self.head = VitMlpHead(config, self.hidden_size, self.num_classes)
else:
self.head = get_linear_layer(
self.hidden_size,
Expand Down
3 changes: 2 additions & 1 deletion megatron/model/vision/dino.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def get_student_backbone_and_num_features(config, pre_process=True, post_process
else:
raise Exception('{} vision backbone is not supported.'.format(
args.vision_backbone_type))

return student, num_features

def get_teacher_backbone_and_num_features(config, pre_process=True, post_process=True):
Expand Down Expand Up @@ -220,6 +220,7 @@ class DINOPretrainModel(MegatronModule):
def __init__(self, config, pre_process=True, post_process=True):
super(DINOPretrainModel, self).__init__()
args = get_args()
self.config = config
self.out_dim = 65536

self.dino_loss = DINOLoss(
Expand Down
15 changes: 8 additions & 7 deletions megatron/model/vision/inpainting.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
i

import math
import apex
import einops
Expand All @@ -13,7 +13,7 @@
from megatron.model.vision.vit_backbone import VitBackbone
from megatron.model.module import MegatronModule
from megatron.model.vision.mit_backbone import mit_b3
from megatron.model.vision.utils import resize_
from megatron.model.vision.utils import resize


class VitInpaintingModel(MegatronModule):
Expand All @@ -22,6 +22,7 @@ def __init__(self, config, pre_process=True, post_process=True):
super(VitInpaintingModel, self).__init__()
args = get_args()

self.config = config
self.pre_process = pre_process
self.post_process = post_process
self.hidden_size = config.hidden_size
Expand Down Expand Up @@ -108,9 +109,9 @@ def __init__(self, pre_process=True, post_process=True):
self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4, self.embedding_dim, 1, 1, bias=False)
self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim)
self.dropout = torch.nn.Dropout2d(0.1)

self.linear_pred = torch.nn.Conv2d(self.embedding_dim, self.flatten_dim, kernel_size=1)

def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
pass
Expand All @@ -121,7 +122,7 @@ def forward(self, input):
n, _, h, w = c4.shape
_c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3])
_c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False)

_c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3])
_c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False)

Expand All @@ -132,7 +133,7 @@ def forward(self, input):

_c = torch.cat([_c4, _c3, _c2, _c1], dim=1)
_c = self.conv_fuse(_c)

x = self.norm(_c)
x = F.relu(x, inplace=True)
x = self.dropout(x)
Expand Down
7 changes: 5 additions & 2 deletions megatron/model/vision/vit_backbone.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ class VitMlpHead(MegatronModule):
bias is set to zero.
"""

def __init__(self, hidden_size, num_classes):
def __init__(self, config, hidden_size, num_classes):
super(VitMlpHead, self).__init__()
self.config = config
self.dense_in = torch.nn.Linear(hidden_size, hidden_size)
self.relu = torch.nn.ReLU()
self.dense_out = torch.nn.Linear(hidden_size, num_classes)
Expand Down Expand Up @@ -139,6 +140,7 @@ def __init__(self,
drop_path_rate=0.0):
super(VitBackbone, self).__init__(share_embeddings_and_output_weights=False)
args = get_args()
self.config = config

self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy

Expand Down Expand Up @@ -172,7 +174,7 @@ def __init__(self,
)
torch.nn.init.zeros_(self.cls_token)
self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda()

# Linear encoder
self.linear_encoder = torch.nn.Linear(
self.flatten_dim, self.hidden_size
Expand All @@ -196,6 +198,7 @@ def __init__(self,
# Transformer
self.transformer = ParallelTransformer(
config,
model_type=args.model_type,
pre_process=self.pre_process,
post_process=self.post_process,
post_layer_norm=self.post_layer_norm,
Expand Down
2 changes: 1 addition & 1 deletion megatron/tokenizer/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def build_tokenizer(args):
else:
raise NotImplementedError('{} tokenizer is not '
'implemented.'.format(args.tokenizer_type))

# Add vocab size (if not already set from a checkpoint).
if getattr(args, "padded_vocab_size", None) is None:
args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size,
Expand Down
3 changes: 2 additions & 1 deletion pretrain_vision_dino.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def get_batch(data_iterator):

def loss_func(model, labels, output_tensor, collect_data=False):
args = get_args()

model = unwrap_model(model)
if model.training:
student_output, teacher_output = output_tensor
Expand Down Expand Up @@ -94,6 +94,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):


if __name__ == "__main__":

pretrain(
train_valid_test_datasets_provider,
model_provider,
Expand Down
Loading

0 comments on commit b492498

Please sign in to comment.