
Comparing changes

base repository: Aleph-Alpha/magma (base: master)
head repository: Quentin-Anthony/magma (compare: master)

Able to merge. These branches can be automatically merged.
  • 14 commits
  • 14 files changed
  • 4 contributors

Commits on Jan 15, 2023

  1. 5b11fda
  2. add hf accelerate
     Quentin-Anthony authored Jan 15, 2023 (9b85824)

Commits on Jan 17, 2023

  1. add cmake
     Quentin-Anthony authored Jan 17, 2023 (36f5a65)
  2. add einops
     Quentin-Anthony authored Jan 17, 2023 (513a441)

Commits on Jan 27, 2023

  1. 3bf6e6e
  2. Merge pull request #1 from Quentin-Anthony/device-fix
     Fixed magma constructor to properly update subcomponent devices
     Quentin-Anthony authored Jan 27, 2023 (b30539d)

Commits on Mar 8, 2023

  1. Fix OOM
     Quentin-Anthony committed Mar 8, 2023 (de69f6d)

Commits on Mar 20, 2023

  1. 4d8d4d7

Commits on Mar 23, 2023

  1. fix model params mismatch
     floatingsnake committed Mar 23, 2023 (5444e30)
  2. Merge pull request #3 from floatingsnake/master
     fix model params num mismatch
     Quentin-Anthony authored Mar 23, 2023 (27f81d6)
  3. Merge pull request #2 from Quentin-Anthony/shape-fix
     Fixed shape error by unsqueezing logits
     Quentin-Anthony authored Mar 23, 2023 (d793893)
  4. ed3a5b0

Commits on Mar 31, 2023

  1. 1334bad
  2. Merge pull request #6 from Quentin-Anthony/Quentin-Anthony-patch-1
     Add rank, local_rank, world_size for any launching mechanism
     Quentin-Anthony authored Mar 31, 2023 (bbe3696)
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@
*.pyc
wandb/*
aws*
*.env*
40 changes: 40 additions & 0 deletions configs/summit_clipH_neox20B.yml
@@ -0,0 +1,40 @@
{
# image encoder settings
encoder_name: 'openclip-H',
adapter_config: {"mlp": {"adapter_type": "normal", "downsample_factor": 8}, "attention": {"adapter_type": "normal", "downsample_factor": 8}},
freeze_img_encoder: false,

# language model settings
lm_name: "neox",
lm_path: "EleutherAI/gpt-neox-20b",

# train settings
batch_size: 256,
train_steps: 150000,
lr: 8.0e-4,
min_lr: 0.0,
lr_decay_iters: 300000,
image_enc_lr: 2.0e-6,
use_image_embed_layernorm: true,
image_embed_dropout_prob: 0.1,
image_size: 384,

gradient_accumulation_steps: 4,
zero_stage: 2,
gradient_clipping: 1.0,

# dataset / save / load settings
# dataset_type: 'new',
train_dataset_dir: ['/gpfs/alpine/csc499/proj-shared/magma/flickr8k_processed'], #'/mnt/brick/wit_converted'

eval_dataset_dir: null, # if this is none, train dataset will be split
# vqa_dir: "/mnt/localdisk/vqa_val_converted",
# gqa_dir: "/mnt/localdisk/gqa_val_converted",

save: "checkpoints/MAGMA_20B_clipH",
load: "checkpoints/MAGMA_20B_clipH",

eval_every: 250,
wandb_project: "MAGMA_20B_clipH",
name: "MAGMA_20B_clipH"
}
40 changes: 40 additions & 0 deletions configs/summit_clipH_pythia70m.yml
@@ -0,0 +1,40 @@
{
# image encoder settings
encoder_name: 'openclip-H',
adapter_config: {"mlp": {"adapter_type": "normal", "downsample_factor": 8}, "attention": {"adapter_type": "normal", "downsample_factor": 8}},
freeze_img_encoder: false,

# language model settings
lm_name: "neox",
lm_path: "EleutherAI/pythia-70m-deduped",

# train settings
batch_size: 256,
train_steps: 150000,
lr: 8.0e-4,
min_lr: 0.0,
lr_decay_iters: 300000,
image_enc_lr: 2.0e-6,
use_image_embed_layernorm: true,
image_embed_dropout_prob: 0.1,
image_size: 384,

gradient_accumulation_steps: 4,
zero_stage: 2,
gradient_clipping: 1.0,

# dataset / save / load settings
# dataset_type: 'new',
train_dataset_dir: ['/gpfs/alpine/csc499/proj-shared/magma/flickr8k_processed'], #'/mnt/brick/wit_converted'

eval_dataset_dir: null, # if this is none, train dataset will be split
# vqa_dir: "/mnt/localdisk/vqa_val_converted",
# gqa_dir: "/mnt/localdisk/gqa_val_converted",

save: "checkpoints/MAGMA_19M_clipH",
load: "checkpoints/MAGMA_19M_clipH",

eval_every: 250,
wandb_project: "MAGMA_19M_clipH",
name: "MAGMA_19M_clipH"
}
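
Both configs above use the repo's flow-style YAML, which parses to a plain dict with PyYAML. The snippet below is a minimal sketch of inspecting the pythia-70m config; whether magma's MultimodalConfig also exposes a dedicated loader for these files is not shown in this diff.

import yaml

with open("configs/summit_clipH_pythia70m.yml") as f:
    cfg = yaml.safe_load(f)  # flow-style mapping with inline comments loads as a dict

print(cfg["lm_name"], cfg["lm_path"], cfg["batch_size"])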
21 changes: 21 additions & 0 deletions convert_flickr8k.py
@@ -0,0 +1,21 @@
from magma.datasets.convert_datasets import convert_dataset
import csv
from pathlib import Path

def my_dataset_iterator():
    """
    Implement an iterator for your dataset that for every datapoint yields a tuple
    image_path, {"captions": [...], "metadata": {...}, }, where image_path is the path to the image as a Path object, captions is a list of caption strings and metadata is an optional field.
    """
    with open("/gpfs/alpine/csc499/proj-shared/magma/flickr8k/captions.txt") as f:
        default_iter = csv.reader(f)

        custom_iter = []
        next(default_iter)
        for row in default_iter:
            custom_iter.append((Path('/gpfs/alpine/csc499/proj-shared/magma/flickr8k/images/' + row[0]), {"captions": row[1]}))
        return iter(custom_iter)


if __name__ == "__main__":
    convert_dataset(data_dir="/gpfs/alpine/csc499/proj-shared/magma/flickr8k_processed", ds_iterator=my_dataset_iterator(), mode='cp')
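
For reference, the docstring above asks each yielded datapoint to be a (Path, dict) tuple whose "captions" value is a list of caption strings. A minimal sketch with placeholder values (note that row[1] above is a single string, so wrapping it in a list would match the documented format exactly):

from pathlib import Path

# placeholder path and caption, purely illustrative
example_datapoint = (
    Path("/path/to/images/example.jpg"),
    {"captions": ["a caption describing the image"], "metadata": {}},
)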
5 changes: 2 additions & 3 deletions example_inference.py
@@ -1,9 +1,8 @@
from magma import Magma
from magma.image_input import ImageInput

model = Magma.from_checkpoint(
    config_path = "configs/MAGMA_v1.yml",
    checkpoint_path = "./mp_rank_00_model_states.pt",
model = Magma(
    config = "configs/summit_clipH_pythia19m.yml",
    device = 'cuda:0'
)
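
The remainder of example_inference.py is unchanged and not shown in this diff. Based on the upstream MAGMA README, inference typically continues roughly as below (a sketch, not part of this pull request; the image URL is a placeholder). Note also that the config path above, configs/summit_clipH_pythia19m.yml, does not match either config file added by this pull request.

inputs = [
    ImageInput("https://example.com/some_image.jpg"),  # placeholder URL
    "Describe the painting:",
]

# embed the (image, text) prompt, then generate a completion
embeddings = model.preprocess_inputs(inputs)
output = model.generate(embeddings=embeddings, max_steps=6, temperature=0.7, top_k=0)
print(output[0])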

5 changes: 5 additions & 0 deletions magma/config.py
@@ -2,6 +2,7 @@
import yaml
from pprint import pprint
from .utils import is_main
from typing import Optional
import os
from pathlib import Path
import uuid
@@ -42,6 +43,10 @@ class MultimodalConfig:
    fine_tune: bool = False
    load_optimizer: bool = True

    # Language model:
    lm_name: str = "gptj"
    lm_path: Optional[str] = None

    # Checkpointing:
    # ------------------------------------------------------------
    save_every: int = 2500
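
The new lm_name and lm_path fields let the configs above pick the language model by Hugging Face identifier. How the rest of magma consumes them is outside this diff; the snippet below is only a plausible sketch of such a lookup (AutoModelForCausalLM is an assumption here, not necessarily what magma uses internally).

from transformers import AutoModelForCausalLM

# values taken from configs/summit_clipH_pythia70m.yml above
lm_name, lm_path = "neox", "EleutherAI/pythia-70m-deduped"
if lm_name == "neox" and lm_path is not None:
    lm = AutoModelForCausalLM.from_pretrained(lm_path)  # hypothetical consumption of the new fields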
37 changes: 29 additions & 8 deletions magma/datasets/convert_datasets.py
@@ -20,6 +20,7 @@ def save_to_jsons(data_list, target_dir, starting_idx=0):


def save_images(img_list, target_dir, mode="mv"):
print(f'img_list: {img_list}')
for img_path in tqdm(
img_list,
desc=f"saving {len(img_list)} images (mode={mode}) to {str(target_dir)}",
@@ -34,7 +35,7 @@ def convert_dataset(
data_dir,
dir_size=10000,
hash_fn=None,
mode="mv",
mode="cp",
ds_iterator=None,
):
"""
@@ -62,12 +63,13 @@ def convert_dataset(
enumerate(ds_iterator),
desc="converting dataset to standard format...",
)

final_iter = 0
for k, (img_path, data) in pbar:
img_cpt_data = {}
# get img data
img_cpt_data.update(data)

#print(f'img_path: {str(img_path)}')
if str(img_path) in new_img_locations.keys():
# if filename is in the dictionary, it already has a new location
new_img_path = new_img_locations[str(img_path)]["new_img_path"]
@@ -78,7 +80,8 @@ def convert_dataset(
]["hash"]
else:
# if file exists in the old location, it will get moved to a new directory
new_img_path = f"images/{save_img_dir.name}/{img_path.name}"
#new_img_path = f"images/{save_img_dir}/{img_path}"
new_img_path = f"{img_path}"
img_cpt_data["image_path"] = new_img_path
new_img_locations[str(img_path)] = {"new_img_path": new_img_path}
# original location is saved an later saved to the new directory
@@ -97,22 +100,40 @@ def convert_dataset(

img_data_list.append(img_cpt_data)

if k % 10000 - 1 == 0:
print(f'img_path_list len: {len(img_path_list)}')


# save images in specified images folder (maximum of dir_size images per folder)
if (len(img_path_list) % dir_size == 0 and len(img_path_list) > 0) or (
k == len(ds_iterator) - 1
):
if (len(img_path_list) % dir_size == 0 and len(img_path_list) > 0):
print(f"saving {len(img_path_list)} images...")
os.makedirs(save_img_dir, exist_ok=True)
save_images(img_path_list, save_img_dir, mode=mode)
img_path_list = []
num_img_dirs += 1
save_img_dir = data_dir / "images" / f"{num_img_dirs}/"

# save jdon data in specified image_data folder with consecutive labeling of the json files
if ((k + 1) % dir_size == 0) or (k == len(ds_iterator) - 1):
# save json data in specified image_data folder with consecutive labeling of the json files
if ((k + 1) % dir_size == 0):
os.makedirs(save_data_dir, exist_ok=True)
save_to_jsons(
img_data_list, save_data_dir, starting_idx=max(k + 1 - dir_size, 0)
)
# empty path and data lists and update save directories for next saving step
img_data_list = []
save_data_dir = data_dir / "image_data" / f"{int((k+1)/dir_size)}/"
final_iter = k

os.makedirs(save_img_dir, exist_ok=True)
save_images(img_path_list, save_img_dir, mode=mode)
img_path_list = []
num_img_dirs += 1
save_img_dir = data_dir / "images" / f"{num_img_dirs}/"

os.makedirs(save_data_dir, exist_ok=True)
save_to_jsons(
img_data_list, save_data_dir, starting_idx=max(final_iter + 1 - dir_size, 0)
)
# empty path and data lists and update save directories for next saving step
img_data_list = []
save_data_dir = data_dir / "image_data" / f"{int((final_iter+1)/dir_size)}/"
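
Net effect of the restructuring above: inside the loop, images and JSON metadata are flushed every dir_size datapoints, and the block after the loop (tracked via final_iter) flushes whatever remains. This replaces the old end-of-dataset check k == len(ds_iterator) - 1, which a plain iterator such as the one returned by my_dataset_iterator above does not support. The resulting on-disk layout is roughly as follows (folder numbering depends on dataset size and dir_size):

flickr8k_processed/
    images/<n>/        # up to dir_size image files per numbered folder
    image_data/<m>/    # the matching JSON caption/metadata files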
56 changes: 43 additions & 13 deletions magma/image_encoders.py
@@ -4,15 +4,15 @@
from torchtyping import patch_typeguard
from einops import rearrange
import timm
import clip
import open_clip
from functools import partial

# ----------------------------- Utils --------------------------------------

clip.model.LayerNorm = (
nn.LayerNorm
) # we need to patch this for clip to work with deepspeed
patch_typeguard() # needed for torchtyping typechecks to work
# clip.model.LayerNorm = (
# nn.LayerNorm
# ) # we need to patch this for clip to work with deepspeed
# patch_typeguard() # needed for torchtyping typechecks to work


class Lambda(torch.nn.Module):
@@ -54,25 +54,55 @@ def clip_encoder(
    If the variant is a resnet model, we also remove the attention pooling.
    """
    if name in ["clip", "ViT-B/32"]:
        name = "ViT-B/32"
        name, pretrained = "ViT-B-32", "openai"
    elif name in ["clip_resnet", "RN50x4"]:
        name = "RN50x4"
        name, pretrained = "RN50x4", "openai"
    elif name in ["clip_resnet_large", "RN50x16"]:
        name = "RN50x16"
        name, pretrained = "RN50x16", "openai"
    elif "openclip" in name:
        if "H" in name:
            name, pretrained = "ViT-H-14", "laion2b_s32b_b79k"
        elif "B" in name and "32" in name:
            name, pretrained = "ViT-B-32", "laion2b_s34b_b79k"
        else:
            raise NotImplementedError(f"Encoder {name} not recognized")
    else:
        raise ValueError(f"encoder {name} not recognized")
        raise NotImplementedError(f"Encoder {name} not recognized")

    encoder = clip.load(name, device=device)[0].visual

    if device is not None:
        encoder = encoder.to(device)
    # TODO better internet connection
    encoder = open_clip.create_model(name, device=device, precision="fp16" if "cuda" in str(device) else "fp32").visual  # , pretrained=pretrained).visual

    if "RN" in name:
        # remove attention pooling
        encoder.attnpool = Lambda(
            partial(rearrange, pattern="b d h w -> b (h w) d")
        )  # remove attn pooling, just use reshaped features

    if False and hasattr(encoder, "transformer"):  # TODO when do we want to disable pooling?
        def forward(self, x: torch.Tensor):
            x = self.conv1(x)  # shape = [*, width, grid, grid]
            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
            x = torch.cat(
                [self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
                 x], dim=1)  # shape = [*, grid ** 2 + 1, width]
            x = x + self.positional_embedding.to(x.dtype)

            ## a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
            # x = self.patch_dropout(x)
            x = self.ln_pre(x)

            x = x.permute(1, 0, 2)  # NLD -> LND
            x = self.transformer(x)
            x = self.ln_post(x)
            x = x.permute(1, 0, 2)  # LND -> NLD
            return x
        encoder.forward = partial(forward, encoder)


    if device is not None:
        encoder = encoder.to(device)

    return encoder
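
The visual tower now comes from open_clip instead of the original clip package. Below is a minimal sketch of the equivalent standalone call for the ViT-B-32 mapping above (downloading the laion2b weights is assumed to be acceptable here; the diff itself leaves pretrained commented out behind a TODO, and the ViT-H-14 checkpoint used for 'openclip-H' is much larger).

import torch
import open_clip

# visual tower only, mirroring clip_encoder above
encoder = open_clip.create_model("ViT-B-32", pretrained="laion2b_s34b_b79k").visual
with torch.no_grad():
    feats = encoder(torch.randn(1, 3, 224, 224))  # pooled features unless pooling is patched out
print(feats.shape)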


7 changes: 7 additions & 0 deletions magma/image_prefix.py
@@ -11,13 +11,15 @@
ENCODER_SEQ_LENS = {
    "clip_resnet": 49,
    "clip_resnet_large": 144,
    "openclip-H": 257
}

ENCODER_OUT_DIMS = {
    "nfresnet50": 2048,
    "clip": 512,
    "clip_resnet": 2560,
    "clip_resnet_large": 3072,
    "openclip-H": 1024,
}


@@ -48,6 +50,7 @@ def __init__(
        # get image encoder backbone
        self.enc = get_image_encoder(
            config.encoder_name,
            # device=self.device,
            pretrained=config.pretrained_img_encoder,
        )
        self.encoder_out_dim = ENCODER_OUT_DIMS[
@@ -106,4 +109,8 @@ def forward(
        if self.use_layernorm:
            logits = self.ln(logits)

        # Added for shape mismatch.
        if logits.ndim == 2:
            logits = logits.unsqueeze(1)

        return logits
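
The unsqueeze above is the change referenced by the "Fixed shape error by unsqueezing logits" merge commit: a pooled image encoder returns a (batch, dim) tensor, while the rest of the image prefix expects an image-token sequence of shape (batch, seq, dim). A small illustration:

import torch

logits = torch.randn(4, 1024)      # pooled encoder output: (batch, dim)
if logits.ndim == 2:
    logits = logits.unsqueeze(1)   # -> (batch, 1, dim), a length-1 image-token sequence
print(logits.shape)                # torch.Size([4, 1, 1024])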