Training without inference #5

Open · wants to merge 6 commits into master
3 changes: 3 additions & 0 deletions .gitignore
@@ -4,3 +4,6 @@
wandb/*
aws*
*.env*

configs/summit_clipH_pythia70m_*
!configs/summit_clipH_pythia70m_template.yml
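
The two new rules ignore the per-node-count configs that launch_job.sh generates, while the `!` negation keeps the template itself under version control. A quick way to check the behaviour (a sketch; the numbered config is a hypothetical generated file):

    git check-ignore -v configs/summit_clipH_pythia70m_5.yml        # matched by configs/summit_clipH_pythia70m_*
    git check-ignore -v configs/summit_clipH_pythia70m_template.yml || echo "template stays tracked"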
9 changes: 6 additions & 3 deletions configs/summit_clipH_neox20B.yml
@@ -9,7 +9,7 @@
lm_path: "EleutherAI/gpt-neox-20b",

# train settings
batch_size: 256,
batch_size: 3,
train_steps: 150000,
lr: 8.0e-4,
min_lr: 0.0,
@@ -31,8 +31,11 @@
# vqa_dir: "/mnt/localdisk/vqa_val_converted",
# gqa_dir: "/mnt/localdisk/gqa_val_converted",

save: "checkpoints/MAGMA_20B_clipH",
load: "checkpoints/MAGMA_20B_clipH",
save_every: 2500,
save: "/gpfs/alpine/scratch/alexisroger/csc499/magma/checkpoints/MAGMA_20B_clipH",
load: "/gpfs/alpine/scratch/alexisroger/csc499/magma/checkpoints/MAGMA_20B_clipH",
#save: "checkpoints/MAGMA_19M_clipH",
#load: "checkpoints/MAGMA_19M_clipH",

eval_every: 250,
wandb_project: "MAGMA_20B_clipH",
11 changes: 8 additions & 3 deletions configs/summit_clipH_pythia70m.yml
@@ -7,9 +7,10 @@
# language model settings
lm_name: "neox",
lm_path: "EleutherAI/pythia-70m-deduped",
# lm_path: "EleutherAI/gpt-neox-20b",

# train settings
batch_size: 256,
batch_size: 3,
train_steps: 150000,
lr: 8.0e-4,
min_lr: 0.0,
@@ -26,13 +27,17 @@
# dataset / save / load settings
# dataset_type: 'new',
train_dataset_dir: ['/gpfs/alpine/csc499/proj-shared/magma/flickr8k_processed'], #'/mnt/brick/wit_converted'
#train_dataset_dir: ['/gpfs/alpine/csc499/proj-shared/LAION-400m-webdataset'],

eval_dataset_dir: null, # if this is none, train dataset will be split
# vqa_dir: "/mnt/localdisk/vqa_val_converted",
# gqa_dir: "/mnt/localdisk/gqa_val_converted",

save: "checkpoints/MAGMA_19M_clipH",
load: "checkpoints/MAGMA_19M_clipH",
save_every: 2500,
save: "/gpfs/alpine/scratch/alexisroger/csc499/magma/checkpoints/MAGMA_19M_clipH",
load: "/gpfs/alpine/scratch/alexisroger/csc499/magma/checkpoints/MAGMA_19M_clipH",
#save: "checkpoints/MAGMA_19M_clipH",
#load: "checkpoints/MAGMA_19M_clipH",

eval_every: 250,
wandb_project: "MAGMA_19M_clipH",
47 changes: 47 additions & 0 deletions configs/summit_clipH_pythia70m_template.yml
@@ -0,0 +1,47 @@
{
# image encoder settings
encoder_name: 'openclip-H',
adapter_config: {"mlp": {"adapter_type": "normal", "downsample_factor": 8}, "attention": {"adapter_type": "normal", "downsample_factor": 8}},
freeze_img_encoder: false,

# language model settings
lm_name: "neox",
lm_path: "EleutherAI/pythia-70m-deduped",
# lm_path: "EleutherAI/gpt-neox-20b",

# train settings
batch_size: 3,
train_steps: 150000,
lr: 8.0e-4,
min_lr: 0.0,
lr_decay_iters: 300000,
image_enc_lr: 2.0e-6,
use_image_embed_layernorm: true,
image_embed_dropout_prob: 0.1,
image_size: 384,

gradient_accumulation_steps: 4,
zero_stage: 2,
gradient_clipping: 1.0,

# dataset / save / load settings
# dataset_type: 'new',
train_dataset_dir: [
    #'/gpfs/alpine/csc499/proj-shared/magma/flickr8k_processed',
    '/gpfs/alpine/csc499/proj-shared/magma/LAION-400m-webdataset/',
],

eval_dataset_dir: null, # if this is none, train dataset will be split
# vqa_dir: "/mnt/localdisk/vqa_val_converted",
# gqa_dir: "/mnt/localdisk/gqa_val_converted",

save_every: 2500,
save: "/gpfs/alpine/scratch/{{USER}}/csc499/magma/checkpoints/MAGMA_19M_clipH_{{NODES}}",
load: "/gpfs/alpine/scratch/{{USER}}/csc499/magma/checkpoints/MAGMA_19M_clipH_{{NODES}}",
#save: "checkpoints/MAGMA_19M_clipH",
#load: "checkpoints/MAGMA_19M_clipH",

eval_every: 250,
wandb_project: "MAGMA_19M_clipH",
name: "MAGMA_19M_clipH"
}
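
The `{{USER}}` and `{{NODES}}` placeholders are filled in by launch_job.sh below; to generate a concrete config by hand, the same sed substitutions can be applied directly (a sketch, assuming a hypothetical node count of 5):

    NNODES=5
    sed -e "s/{{NODES}}/$NNODES/g" -e "s/{{USER}}/$(whoami)/g" \
        configs/summit_clipH_pythia70m_template.yml > configs/summit_clipH_pythia70m_$NNODES.yml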
36 changes: 36 additions & 0 deletions launch_job.sh
@@ -0,0 +1,36 @@
#!/bin/bash
#BSUB -nnodes 5
#BSUB -W 1:30
#BSUB -q batch
#BSUB -o /ccs/home/alexisroger/scratch/jobs/magma_pythia70m_out.%J
#BSUB -e /ccs/home/alexisroger/scratch/jobs/magma_pythia70m_err.%J
#BSUB -J magma_pythia70m
#BSUB -alloc_flags gpudefault
#BSUB -P CSC499

source /gpfs/alpine/csc499/proj-shared/env_setup/setup.sh

export TORCH_EXTENSIONS_DIR=/gpfs/alpine/scratch/$(whoami)/csc499/cache/torch_extensions

# Write the hostfile for this job
cat $LSB_DJOB_HOSTFILE | sort | uniq | tail -n +2 | sed -e 's/$/ slots=6/' > /ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts
export DLTS_HOSTFILE=/ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts

NNODES=$(wc -l < /ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts)

export WANDB_DIR=/gpfs/alpine/scratch/$(whoami)/csc499/wandb
export WANDB_MODE=dryrun

if [ ! -e configs/summit_clipH_pythia70m_$NNODES.yml ]; then
    cp configs/summit_clipH_pythia70m_template.yml configs/summit_clipH_pythia70m_$NNODES.yml
    sed -i "s/{{NODES}}/$NNODES/g" configs/summit_clipH_pythia70m_$NNODES.yml
    sed -i "s/{{USER}}/$(whoami)/g" configs/summit_clipH_pythia70m_$NNODES.yml
fi

if [ ${1:-1} = "-l" ]
then
    deepspeed -H /ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts train.py --config summit_clipH_pythia70m_$NNODES.yml > log_$NNODES\_$sec.log 2>&1 &
else
    deepspeed -H /ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts train.py --config summit_clipH_pythia70m_$NNODES.yml
fi
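
The job is submitted through LSF, which reads the #BSUB directives at the top of the file; the optional -l argument backgrounds deepspeed and redirects its output to log_<NNODES>_*.log instead of the job's stdout/stderr files. A typical submission might look like this (a sketch; the hostfile and job-output directories referenced above are assumed to exist, and the exact bsub form can vary with the site's LSF setup):

    mkdir -p /ccs/home/$(whoami)/scratch/hostfiles /ccs/home/$(whoami)/scratch/jobs
    bsub launch_job.sh            # deepspeed runs in the foreground of the batch job
    bsub "launch_job.sh -l"       # deepspeed is backgrounded, output goes to log_<NNODES>_*.log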

8 changes: 6 additions & 2 deletions magma/config.py
@@ -127,10 +127,14 @@ def __post_init__(self):
    },
}
self.deepspeed_config_params = {
    "train_batch_size": self.batch_size,
    #"train_batch_size": self.batch_size,
    "train_micro_batch_size_per_gpu": self.batch_size,
    "gradient_accumulation_steps": self.gradient_accumulation_steps,
    "gradient_clipping": self.gradient_clipping,
    "fp16": {"enabled": True, "loss_scale_window": 250},
    "fp16": {
        "enabled": True,
        "loss_scale_window": 250
    },
    "scheduler": self.scheduler_dict,
    "zero_optimization": {
        "stage": self.zero_stage,
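
Replacing train_batch_size with train_micro_batch_size_per_gpu lets DeepSpeed derive the global batch size from the world size at launch time, so one config scales across node counts: DeepSpeed enforces train_batch_size = micro_batch_per_gpu × gradient_accumulation_steps × data-parallel world size. A quick sanity check for the values used here (a sketch, assuming 6 GPUs per Summit node and a 5-node job):

    MICRO_BATCH=3; GRAD_ACCUM=4; WORLD_SIZE=$((6 * 5))
    echo "effective train_batch_size = $((MICRO_BATCH * GRAD_ACCUM * WORLD_SIZE))"   # 360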
53 changes: 53 additions & 0 deletions magma/datasets/convert_LAION400m.py
@@ -0,0 +1,53 @@
import os
from pathlib import Path
from tqdm import tqdm
from convert_datasets import convert_dataset

def get_ds_iterator(tmp_path):
    for image in tmp_path.glob("*.jpg"):
        try:
            with open(image.with_suffix('.txt'), "r") as f:
                yield (image, {"captions": f.readlines()[0]})
        except:  # skip images whose caption file is missing or empty
            pass

if __name__ == "__main__":
    original_datasets = '/gpfs/alpine/csc499/proj-shared/LAION-400m-webdataset/data'
    destination_datasets = '/gpfs/alpine/csc499/proj-shared/magma/LAION-400m-webdataset'

    original_datasets = Path(original_datasets)
    destination_datasets = Path(destination_datasets)

    os.makedirs(destination_datasets / 'tmp', exist_ok=True)

    seen_archives = [p.stem for p in (destination_datasets / 'images').glob('*')]

    for path in tqdm(
        original_datasets.glob("*.tar"),
        desc=f"loading dataset tars from {original_datasets}",
    ):
        archive_number = path.stem

        if archive_number in seen_archives:
            continue

        tmp_path = destination_datasets / 'tmp' / archive_number

        os.makedirs(tmp_path, exist_ok=True)
        os.system(f"tar -xf {path} -C {tmp_path}")

        ds_iterator = get_ds_iterator(tmp_path)

        convert_dataset(
            destination_datasets,
            dir_size=10000,
            mode="mv",
            ds_iterator=ds_iterator,
            dataset_number=archive_number
        )

        os.system(f'rm -rf {tmp_path}')

    os.system(f"rm -rf {destination_datasets / 'tmp'}")

9 changes: 5 additions & 4 deletions magma/datasets/convert_datasets.py
@@ -20,7 +20,7 @@ def save_to_jsons(data_list, target_dir, starting_idx=0):


def save_images(img_list, target_dir, mode="mv"):
    print(f'img_list: {img_list}')
    #print(f'img_list: {img_list}')
    for img_path in tqdm(
        img_list,
        desc=f"saving {len(img_list)} images (mode={mode}) to {str(target_dir)}",
@@ -37,6 +37,7 @@ def convert_dataset(
    hash_fn=None,
    mode="cp",
    ds_iterator=None,
    dataset_number=0
):
    """
    Builds a dataset directory in our standard format. ds_iterator should return data of the form
@@ -52,8 +53,8 @@

img_data_list = []
img_path_list = []
save_img_dir = data_dir / "images" / "0"
save_data_dir = data_dir / "image_data" / "0"
save_img_dir = data_dir / "images" / str(dataset_number)
save_data_dir = data_dir / "image_data" / str(dataset_number)
num_img_dirs = 0

# save the new locations of all img files in case some datafiles point to the same image
@@ -81,7 +82,7 @@
else:
    # if the file exists in the old location, it will get moved to a new directory
    #new_img_path = f"images/{save_img_dir}/{img_path}"
    new_img_path = f"{img_path}"
    new_img_path = f"{save_img_dir}/{str(img_path).split('/')[-1]}"
    img_cpt_data["image_path"] = new_img_path
    new_img_locations[str(img_path)] = {"new_img_path": new_img_path}
    # the original location is saved and the image is later saved to the new directory
1 change: 1 addition & 0 deletions magma/magma.py
@@ -99,6 +99,7 @@ def __init__(self, config, device=None, init_weights=True):
# freeze parameters
if config.freeze_lm:
    for name, param in self.lm.named_parameters():  # freeze lm weights
        param.requires_grad = False
        if config.adapter_config and "adapter" in name:
            param.requires_grad = True

4 changes: 2 additions & 2 deletions magma/utils.py
@@ -366,8 +366,8 @@ def build_labels(
"""
shape = input_embeddings.shape[:2] # b, s

print(f'captions: {captions.shape[1]}')
print(f'default: {shape[1]}')
#print(f'captions: {captions.shape[1]}')
#print(f'default: {shape[1]}')
assert captions.shape[1] >= shape[1]

# make sure to add masked embedding tokens in the appropriate locations in the labels
17 changes: 11 additions & 6 deletions train.py
@@ -16,6 +16,7 @@
from magma.utils import (
    is_main,
    cycle,
    get_world_info,
    parse_args,
    wandb_log,
    wandb_init,
@@ -30,6 +31,8 @@
    train_step,
)

import deepspeed.comm as dist
from deepspeed.runtime.utils import see_memory_usage

def _load_img_cpt_datasets(dataset_dir, tokenizer, transforms):
    if isinstance(dataset_dir, (list, tuple)):
@@ -75,6 +78,8 @@ def get_pretraining_datasets(config, tokenizer, transforms):
args = parse_args()
deepspeed.init_distributed()

args.local_rank, _, _ = get_world_info()

# load model + tokenizer:
model = Magma(
    args.config,
Expand Down Expand Up @@ -150,7 +155,7 @@ def get_pretraining_datasets(config, tokenizer, transforms):
global_step += 1

if global_step % config.log_every == 0:
    pbar.set_description(f"training... Step: {global_step} Loss: {loss}")
    pbar.set_description(f"training... Rank: {dist.get_rank()}, Step: {global_step}, Loss: {loss}")
    current_lr = (
        [lr for lr in lr_scheduler.get_lr()]
        if lr_scheduler is not None
@@ -173,11 +178,11 @@ def get_pretraining_datasets(config, tokenizer, transforms):
)

##### inference:
image_grid, caption = inference_step(config, eval_loader, model_engine)
wandb_log(
    {"inference/image": wandb.Image(image_grid, caption=caption)},
    step=global_step,
)
# image_grid, caption = inference_step(config, eval_loader, model_engine)
# wandb_log(
#     {"inference/image": wandb.Image(image_grid, caption=caption)},
#     step=global_step,
# )

model_engine.train()
