From 7b14a613d71023b4221177b44ce97ad8d5c11c91 Mon Sep 17 00:00:00 2001
From: floatingsnake <1227073631@qq.com>
Date: Mon, 27 Mar 2023 21:13:22 +0900
Subject: [PATCH 1/4] fix freezing of language model parameters

---
 magma/magma.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/magma/magma.py b/magma/magma.py
index a2df2a2..df39a55 100644
--- a/magma/magma.py
+++ b/magma/magma.py
@@ -99,6 +99,7 @@ def __init__(self, config, device=None, init_weights=True):
         # freeze parameters
         if config.freeze_lm:
             for name, param in self.lm.named_parameters():  # freeze lm weights
+                param.requires_grad = False
                 if config.adapter_config and "adapter" in name:
                     param.requires_grad = True
 

From 59356b8925522e435cc116d35a6986e27598b7d9 Mon Sep 17 00:00:00 2001
From: Alexis-BX
Date: Wed, 29 Mar 2023 12:49:59 -0400
Subject: [PATCH 2/4] working training

---
 configs/summit_clipH_pythia70m.yml |  2 +-
 train.py                           | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/configs/summit_clipH_pythia70m.yml b/configs/summit_clipH_pythia70m.yml
index 611a1e5..1d6ef6d 100644
--- a/configs/summit_clipH_pythia70m.yml
+++ b/configs/summit_clipH_pythia70m.yml
@@ -9,7 +9,7 @@
     lm_path: "EleutherAI/pythia-70m-deduped",
 
     # train settings
-    batch_size: 256,
+    batch_size: 48,
     train_steps: 150000,
     lr: 8.0e-4,
     min_lr: 0.0,
diff --git a/train.py b/train.py
index abc8bdc..01aebb4 100644
--- a/train.py
+++ b/train.py
@@ -173,11 +173,11 @@ def get_pretraining_datasets(config, tokenizer, transforms):
             )
 
             ##### inference:
-            image_grid, caption = inference_step(config, eval_loader, model_engine)
-            wandb_log(
-                {"inference/image": wandb.Image(image_grid, caption=caption)},
-                step=global_step,
-            )
+            # image_grid, caption = inference_step(config, eval_loader, model_engine)
+            # wandb_log(
+            #     {"inference/image": wandb.Image(image_grid, caption=caption)},
+            #     step=global_step,
+            # )
 
             model_engine.train()
 
From 7f3a3137eb497339cd49aa58289dc5e90397052f Mon Sep 17 00:00:00 2001
From: Alexis-BX
Date: Tue, 18 Apr 2023 11:04:34 -0400
Subject: [PATCH 3/4] working LAION_400M

---
 .gitignore                                  |  3 ++
 configs/summit_clipH_neox20B.yml            |  9 ++--
 configs/summit_clipH_pythia70m.yml          | 11 +++--
 configs/summit_clipH_pythia70m_template.yml | 47 ++++++++++++++++++
 launch_job.sh                               | 36 ++++++++++++++
 magma/config.py                             |  8 +++-
 magma/datasets/convert_LAION400m.py         | 53 +++++++++++++++++++++
 magma/datasets/convert_datasets.py          |  9 ++--
 magma/utils.py                              |  4 +-
 train.py                                    |  7 ++-
 10 files changed, 172 insertions(+), 15 deletions(-)
 create mode 100644 configs/summit_clipH_pythia70m_template.yml
 create mode 100644 launch_job.sh
 create mode 100644 magma/datasets/convert_LAION400m.py

diff --git a/.gitignore b/.gitignore
index bda4a34..a65f54d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,6 @@
 wandb/*
 aws*
 *.env*
+
+configs/summit_clipH_pythia70m_*
+!configs/summit_clipH_pythia70m_template.yml
\ No newline at end of file
diff --git a/configs/summit_clipH_neox20B.yml b/configs/summit_clipH_neox20B.yml
index fee9080..459b65d 100644
--- a/configs/summit_clipH_neox20B.yml
+++ b/configs/summit_clipH_neox20B.yml
@@ -9,7 +9,7 @@
     lm_path: "EleutherAI/gpt-neox-20b",
 
     # train settings
-    batch_size: 256,
+    batch_size: 3,
     train_steps: 150000,
     lr: 8.0e-4,
     min_lr: 0.0,
@@ -31,8 +31,11 @@
     # vqa_dir: "/mnt/localdisk/vqa_val_converted",
     # gqa_dir: "/mnt/localdisk/gqa_val_converted",
 
-    save: "checkpoints/MAGMA_20B_clipH",
-    load: "checkpoints/MAGMA_20B_clipH",
+    save_every: 2500,
+    save: "/gpfs/alpine/scratch/alexisroger/csc499/magma/checkpoints/MAGMA_20B_clipH",
+    load: "/gpfs/alpine/scratch/alexisroger/csc499/magma/checkpoints/MAGMA_20B_clipH",
+    #save: "checkpoints/MAGMA_20B_clipH",
+    #load: "checkpoints/MAGMA_20B_clipH",
 
     eval_every: 250,
     wandb_project: "MAGMA_20B_clipH",
diff --git a/configs/summit_clipH_pythia70m.yml b/configs/summit_clipH_pythia70m.yml
index 1d6ef6d..94b9e65 100644
--- a/configs/summit_clipH_pythia70m.yml
+++ b/configs/summit_clipH_pythia70m.yml
@@ -7,9 +7,10 @@
     # language model settings
     lm_name: "neox",
     lm_path: "EleutherAI/pythia-70m-deduped",
+    # lm_path: "EleutherAI/gpt-neox-20b",
 
     # train settings
-    batch_size: 48,
+    batch_size: 3,
     train_steps: 150000,
     lr: 8.0e-4,
     min_lr: 0.0,
@@ -26,13 +27,17 @@
     # dataset / save / load settings
     # dataset_type: 'new',
     train_dataset_dir: ['/gpfs/alpine/csc499/proj-shared/magma/flickr8k_processed'], #'/mnt/brick/wit_converted'
+    #train_dataset_dir: ['/gpfs/alpine/csc499/proj-shared/LAION-400m-webdataset'],
 
     eval_dataset_dir: null, # if this is none, train dataset will be split
     # vqa_dir: "/mnt/localdisk/vqa_val_converted",
     # gqa_dir: "/mnt/localdisk/gqa_val_converted",
 
-    save: "checkpoints/MAGMA_19M_clipH",
-    load: "checkpoints/MAGMA_19M_clipH",
+    save_every: 2500,
+    save: "/gpfs/alpine/scratch/alexisroger/csc499/magma/checkpoints/MAGMA_19M_clipH",
+    load: "/gpfs/alpine/scratch/alexisroger/csc499/magma/checkpoints/MAGMA_19M_clipH",
+    #save: "checkpoints/MAGMA_19M_clipH",
+    #load: "checkpoints/MAGMA_19M_clipH",
 
     eval_every: 250,
     wandb_project: "MAGMA_19M_clipH",
diff --git a/configs/summit_clipH_pythia70m_template.yml b/configs/summit_clipH_pythia70m_template.yml
new file mode 100644
index 0000000..831dd6c
--- /dev/null
+++ b/configs/summit_clipH_pythia70m_template.yml
@@ -0,0 +1,47 @@
+{
+    # image encoder settings
+    encoder_name: 'openclip-H',
+    adapter_config: {"mlp": {"adapter_type": "normal", "downsample_factor": 8}, "attention": {"adapter_type": "normal", "downsample_factor": 8}},
+    freeze_img_encoder: false,
+
+    # language model settings
+    lm_name: "neox",
+    lm_path: "EleutherAI/pythia-70m-deduped",
+    # lm_path: "EleutherAI/gpt-neox-20b",
+
+    # train settings
+    batch_size: 3,
+    train_steps: 150000,
+    lr: 8.0e-4,
+    min_lr: 0.0,
+    lr_decay_iters: 300000,
+    image_enc_lr: 2.0e-6,
+    use_image_embed_layernorm: true,
+    image_embed_dropout_prob: 0.1,
+    image_size: 384,
+
+    gradient_accumulation_steps: 4,
+    zero_stage: 2,
+    gradient_clipping: 1.0,
+
+    # dataset / save / load settings
+    # dataset_type: 'new',
+    train_dataset_dir: [
+        #'/gpfs/alpine/csc499/proj-shared/magma/flickr8k_processed',
+        '/gpfs/alpine/csc499/proj-shared/magma/LAION-400m-webdataset/',
+    ],
+
+    eval_dataset_dir: null, # if this is none, train dataset will be split
+    # vqa_dir: "/mnt/localdisk/vqa_val_converted",
+    # gqa_dir: "/mnt/localdisk/gqa_val_converted",
+
+    save_every: 2500,
+    save: "/gpfs/alpine/scratch/{{USER}}/csc499/magma/checkpoints/MAGMA_19M_clipH_{{NODES}}",
+    load: "/gpfs/alpine/scratch/{{USER}}/csc499/magma/checkpoints/MAGMA_19M_clipH_{{NODES}}",
+    #save: "checkpoints/MAGMA_19M_clipH",
+    #load: "checkpoints/MAGMA_19M_clipH",
+
+    eval_every: 250,
+    wandb_project: "MAGMA_19M_clipH",
+    name: "MAGMA_19M_clipH"
+}
diff --git a/launch_job.sh b/launch_job.sh
new file mode 100644
index 0000000..1b44420
--- /dev/null
+++ b/launch_job.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+#BSUB -nnodes 9
+#BSUB -W 2:00
+#BSUB -q batch
+#BSUB -o magma_pythia70m_out.%J
+#BSUB -e magma_pythia70m_err.%J
+#BSUB -J magma_pythia70m
+#BSUB -alloc_flags gpudefault
+#BSUB -P CSC499
+
+source /gpfs/alpine/csc499/proj-shared/env_setup/setup.sh
+
+export TORCH_EXTENSIONS_DIR=/gpfs/alpine/scratch/$(whoami)/csc499/cache/torch_extensions
+
+# Write the hostfile for this job
+cat $LSB_DJOB_HOSTFILE | sort | uniq | tail -n +2 | sed -e 's/$/ slots=6/' > /ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts
+export DLTS_HOSTFILE=/ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts
+
+NNODES=$(wc -l < /ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts)
+
+export WANDB_DIR=/gpfs/alpine/scratch/$(whoami)/csc499/wandb
+export WANDB_MODE=dryrun
+
+if [ ! -e configs/summit_clipH_pythia70m_$NNODES.yml ]; then
+    cp configs/summit_clipH_pythia70m_template.yml configs/summit_clipH_pythia70m_$NNODES.yml
+    sed -i "s/{{NODES}}/$NNODES/g" configs/summit_clipH_pythia70m_$NNODES.yml
+    sed -i "s/{{USER}}/$(whoami)/g" configs/summit_clipH_pythia70m_$NNODES.yml
+fi
+
+if [ ${1:-1} = "-l" ]
+  then
+    deepspeed -H /ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts train.py --config summit_clipH_pythia70m_$NNODES.yml > log_$NNODES\_$sec.log 2>&1 &
+  else
+    deepspeed -H /ccs/home/$(whoami)/scratch/hostfiles/$LSB_JOBID-hosts train.py --config summit_clipH_pythia70m_$NNODES.yml
+fi
+
diff --git a/magma/config.py b/magma/config.py
index e69ed18..cc52821 100644
--- a/magma/config.py
+++ b/magma/config.py
@@ -127,10 +127,14 @@ def __post_init__(self):
             },
         }
         self.deepspeed_config_params = {
-            "train_batch_size": self.batch_size,
+            #"train_batch_size": self.batch_size,
+            "train_micro_batch_size_per_gpu": self.batch_size,
             "gradient_accumulation_steps": self.gradient_accumulation_steps,
             "gradient_clipping": self.gradient_clipping,
-            "fp16": {"enabled": True, "loss_scale_window": 250},
+            "fp16": {
+                "enabled": True,
+                "loss_scale_window": 250
+            },
             "scheduler": self.scheduler_dict,
             "zero_optimization": {
                 "stage": self.zero_stage,
diff --git a/magma/datasets/convert_LAION400m.py b/magma/datasets/convert_LAION400m.py
new file mode 100644
index 0000000..5361512
--- /dev/null
+++ b/magma/datasets/convert_LAION400m.py
@@ -0,0 +1,53 @@
+import os
+from pathlib import Path
+from tqdm import tqdm
+from magma.datasets.convert_datasets import convert_dataset
+
+def get_ds_iterator(tmp_path):
+    for image in tmp_path.glob("*.jpg"):
+        try:
+            with open(image.with_suffix('.txt'), "r") as f:
+                yield (image, {"captions" : f.readlines()[0]})
+        except:
+            pass
+
+if __name__ == "__main__":
+    original_datasets = '/gpfs/alpine/csc499/proj-shared/LAION-400m-webdataset/data'
+    destination_datasets = '/gpfs/alpine/csc499/proj-shared/magma/LAION-400m-webdataset'
+
+    original_datasets = Path(original_datasets)
+    destination_datasets = Path(destination_datasets)
+
+    os.makedirs(destination_datasets / 'tmp', exist_ok=True)
+
+    seen_archives = [p.stem for p in (destination_datasets/'images').glob('*')]
+
+    for path in tqdm(
+        original_datasets.glob("*.tar"),
+        desc=f"loading dataset tar from {original_datasets}",
+    ):
+
+        archive_number = path.stem
+
+        if archive_number in seen_archives:
+            continue
+
+        tmp_path = destination_datasets / 'tmp' / archive_number
+
+        os.makedirs(tmp_path, exist_ok=True)
+        os.system(f"tar -xf {path} -C {tmp_path}")
+
+        ds_iterator = get_ds_iterator(tmp_path)
+
+        convert_dataset(
+            destination_datasets,
+            dir_size=10000,
+            mode="mv",
+            ds_iterator=ds_iterator,
+            dataset_number=archive_number
+        )
+
+        os.system(f'rm -rf {tmp_path}')
+
+    os.system(f"rm -rf {destination_datasets / 'tmp'}")
+
diff --git a/magma/datasets/convert_datasets.py b/magma/datasets/convert_datasets.py
index 38f2a49..0b95561 100644
--- a/magma/datasets/convert_datasets.py
+++ b/magma/datasets/convert_datasets.py
@@ -20,7 +20,7 @@ def save_to_jsons(data_list, target_dir, starting_idx=0):
 
 
 def save_images(img_list, target_dir, mode="mv"):
-    print(f'img_list: {img_list}')
+    #print(f'img_list: {img_list}')
     for img_path in tqdm(
         img_list,
         desc=f"saving {len(img_list)} images (mode={mode}) to {str(target_dir)}",
@@ -37,6 +37,7 @@ def convert_dataset(
     hash_fn=None,
     mode="cp",
     ds_iterator=None,
+    dataset_number=0
 ):
     """
     Builds a dataset directory in our standard format. ds_iterator should return data of the form
@@ -52,8 +53,8 @@ def convert_dataset(
     img_data_list = []
     img_path_list = []
 
-    save_img_dir = data_dir / "images" / "0"
-    save_data_dir = data_dir / "image_data" / "0"
+    save_img_dir = data_dir / "images" / str(dataset_number)
+    save_data_dir = data_dir / "image_data" / str(dataset_number)
     num_img_dirs = 0
 
     # save the new locations of all img files in case some datafiles point to the same image
@@ -81,7 +82,7 @@ def convert_dataset(
             else:
                 # if file exists in the old location, it will get moved to a new directory
                 #new_img_path = f"images/{save_img_dir}/{img_path}"
-                new_img_path = f"{img_path}"
+                new_img_path = f"{save_img_dir}/{str(img_path).split('/')[-1]}"
                 img_cpt_data["image_path"] = new_img_path
                 new_img_locations[str(img_path)] = {"new_img_path": new_img_path}
                 # original location is saved and later saved to the new directory
diff --git a/magma/utils.py b/magma/utils.py
index 610286c..b4035ce 100644
--- a/magma/utils.py
+++ b/magma/utils.py
@@ -366,8 +366,8 @@ def build_labels(
     """
     shape = input_embeddings.shape[:2]  # b, s
 
-    print(f'captions: {captions.shape[1]}')
-    print(f'default: {shape[1]}')
+    #print(f'captions: {captions.shape[1]}')
+    #print(f'default: {shape[1]}')
     assert captions.shape[1] >= shape[1]
 
     # make sure to add masked embedding tokens in the appropriate locations in the labels
diff --git a/train.py b/train.py
index 01aebb4..7b23f61 100644
--- a/train.py
+++ b/train.py
@@ -16,6 +16,7 @@
 from magma.utils import (
     is_main,
     cycle,
+    get_world_info,
     parse_args,
     wandb_log,
     wandb_init,
@@ -30,6 +31,8 @@
     train_step,
 )
 
+import deepspeed.comm as dist
+from deepspeed.runtime.utils import see_memory_usage
 
 def _load_img_cpt_datasets(dataset_dir, tokenizer, transforms):
     if isinstance(dataset_dir, (list, tuple)):
@@ -75,6 +78,8 @@ def get_pretraining_datasets(config, tokenizer, transforms):
 args = parse_args()
 deepspeed.init_distributed()
 
+args.local_rank, _, _ = get_world_info()
+
 # load model + tokenizer:
 model = Magma(
     args.config,
@@ -150,7 +155,7 @@ def get_pretraining_datasets(config, tokenizer, transforms):
             global_step += 1
 
             if global_step % config.log_every == 0:
-                pbar.set_description(f"training... Step: {global_step} Loss: {loss}")
+                pbar.set_description(f"training... Rank: {dist.get_rank()}, Step: {global_step}, Loss: {loss}")
                 current_lr = (
                     [lr for lr in lr_scheduler.get_lr()]
                     if lr_scheduler is not None
 
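Note on the magma/config.py hunk above: DeepSpeed treats train_batch_size as a global quantity, train_batch_size = micro batch per GPU x gradient_accumulation_steps x data-parallel world size, so hard-coding the global value breaks whenever the same yml is reused on a different allocation. Keying the config on train_micro_batch_size_per_gpu instead lets the global batch scale with the job size. A back-of-the-envelope check with the values used in this series (assuming 6 GPUs per Summit node, the slots=6 written into the hostfile):

    # effective global batch size implied by the new DeepSpeed config keys
    micro_batch_per_gpu = 3  # batch_size in the yml
    grad_accum = 4           # gradient_accumulation_steps in the yml
    gpus_per_node = 6        # slots=6 in the generated hostfile
    nodes = 5                # BSUB -nnodes after PATCH 4
    print(micro_batch_per_gpu * grad_accum * gpus_per_node * nodes)  # 360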
From dae5ded59516da1a2a9c92053abfce6d9f0ad0dc Mon Sep 17 00:00:00 2001
From: Alexis Roger
Date: Tue, 9 May 2023 13:09:20 -0400
Subject: [PATCH 4/4] minor changes

---
 launch_job.sh                       | 8 ++++----
 magma/datasets/convert_LAION400m.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 launch_job.sh

diff --git a/launch_job.sh b/launch_job.sh
old mode 100644
new mode 100755
index 1b44420..c9a7389
--- a/launch_job.sh
+++ b/launch_job.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
-#BSUB -nnodes 9
-#BSUB -W 2:00
+#BSUB -nnodes 5
+#BSUB -W 1:30
 #BSUB -q batch
-#BSUB -o magma_pythia70m_out.%J
-#BSUB -e magma_pythia70m_err.%J
+#BSUB -o /ccs/home/alexisroger/scratch/jobs/magma_pythia70m_out.%J
+#BSUB -e /ccs/home/alexisroger/scratch/jobs/magma_pythia70m_err.%J
 #BSUB -J magma_pythia70m
 #BSUB -alloc_flags gpudefault
 #BSUB -P CSC499
diff --git a/magma/datasets/convert_LAION400m.py b/magma/datasets/convert_LAION400m.py
index 5361512..dcd3b3d 100644
--- a/magma/datasets/convert_LAION400m.py
+++ b/magma/datasets/convert_LAION400m.py
@@ -1,7 +1,7 @@
 import os
 from pathlib import Path
 from tqdm import tqdm
-from magma.datasets.convert_datasets import convert_dataset
+from convert_datasets import convert_dataset
 
 def get_ds_iterator(tmp_path):
     for image in tmp_path.glob("*.jpg"):
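Note on PATCH 4: switching from magma.datasets.convert_datasets to the bare convert_datasets import works because Python prepends a directly executed script's own directory to sys.path, so running python magma/datasets/convert_LAION400m.py resolves the import from any working directory; the trade-off is that the module can no longer be imported as part of the magma package. The mode change to 100755 makes launch_job.sh directly executable; on Summit the job is presumably submitted with bsub launch_job.sh (or bsub < launch_job.sh on LSF installations that only read #BSUB directives from stdin).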