From 53fa580fd013fd25ccc39972c3b7425f0ee762b8 Mon Sep 17 00:00:00 2001 From: Ayman Date: Sun, 5 Nov 2023 00:08:57 +0100 Subject: [PATCH 01/24] first approach using list flattener. --- pipegoose/nn/pipeline_parallel/partitioner.py | 63 +++++++++++-------- .../nn/pipeline_parallel/test_partitioner.py | 42 ++++++++----- 2 files changed, 61 insertions(+), 44 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 801f81f..1c91f90 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -26,34 +26,41 @@ def __init__(self, module: nn.Module, parallel_context: ParallelContext): self.parallel_context = parallel_context def split(self) -> List[nn.Module]: - module = self.module n_partitions = self.parallel_context.pipeline_parallel_size - - # NOTE: BLOOM-560 - # embedding_module = module.transformer.word_embeddings - # transformer_blocks = module.transformer.h - # lm_head = module.lm_head - - # NOTE: For sshleifer/tiny-gpt2 - embed_module = module.transformer.wte - pos_embed_module = module.transformer.wpe - drop_module = module.transformer.drop - transformer_blocks = module.transformer.h - ln_f = module.transformer.ln_f - lm_head = module.lm_head - - # NOTE: Calculate the number of transformer blocks per partition - blocks_per_partition = len(transformer_blocks) // n_partitions + module = self.module partitions = [] - - for i in range(n_partitions): - start = i * blocks_per_partition - # NOTE: if it's the last partition, get all remaining blocks - end = start + blocks_per_partition if i < n_partitions - 1 else None - partitions.append(nn.Sequential(*transformer_blocks[start:end])) - - partitions[0] = nn.Sequential(embed_module, pos_embed_module, drop_module, partitions[0]) - partitions[-1] = nn.Sequential(ln_f, lm_head, partitions[-1]) + start = 0 + end = 0 + + def _flatten_model(model, parent_name=""): + model_list = [] + for name, child_module in model.named_children(): + # Form the full name of the module + full_name = f"{parent_name}.{name}" if parent_name else name + if ( + full_name == "transformer.h" + ): # Check if the module is the 'h' attribute + # If it's the 'h' ModuleList, append each of its blocks as a whole + for block in child_module: + model_list.append(block) + elif len(list(child_module.children())) == 0: + # If it's a leaf node, append the module itself + model_list.append(child_module) + else: + # Otherwise, continue flattening its children + model_list.extend(_flatten_model(child_module, full_name)) + return model_list + + prepared_model = _flatten_model(module) + for p in range(n_partitions): + end = start + len(prepared_model) // n_partitions + partitions.append(nn.Sequential(*prepared_model[start:end])) + start = end + + for partition in partitions: + print("--------------------------------------------------") + print(partition) + print("--------------------------------------------------") return partitions @@ -67,7 +74,9 @@ def _get_partitioner(policy: PartitionPolicy) -> BasePartitioner: return policy_to_partitioner[policy] -def get_model_partition(module: nn.Module, policy: PartitionPolicy, parallel_context: ParallelContext) -> nn.Module: +def get_model_partition( + module: nn.Module, policy: PartitionPolicy, parallel_context: ParallelContext +) -> nn.Module: """Get the corresponding partition of the current process.""" partitioner = _get_partitioner(policy) partitions = partitioner(module, parallel_context).split() diff --git 
a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index adddfee..eac7393 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -1,23 +1,34 @@ import pytest from torch import nn from transformers import AutoModelForCausalLM, AutoTokenizer - from pipegoose.nn.pipeline_parallel.partitioner import ( # PartitionPolicy,; get_model_partition, UniformPartitioner, ) from pipegoose.testing.utils import init_parallel_context, spawn -MODEL_NAME = "sshleifer/tiny-gpt2" +# MODEL_NAME = "sshleifer/tiny-gpt2" +MODEL_NAME = "bigscience/bloom-560m" -def run_model_partitioner(rank, world_size, port, tensor_parallel_size, pipeline_parallel_size, data_parallel_size): +def run_model_partitioner( + rank, + world_size, + port, + tensor_parallel_size, + pipeline_parallel_size, + data_parallel_size, +): parallel_context = init_parallel_context( - rank, world_size, port, tensor_parallel_size, pipeline_parallel_size, data_parallel_size + rank, + world_size, + port, + tensor_parallel_size, + pipeline_parallel_size, + data_parallel_size, ) module = AutoModelForCausalLM.from_pretrained(MODEL_NAME) tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) tokenizer.pad_token = tokenizer.eos_token - text = ["Hello world", "How are you?"] inputs = tokenizer(text, return_tensors="pt", padding=True) @@ -25,20 +36,17 @@ def run_model_partitioner(rank, world_size, port, tensor_parallel_size, pipeline partitions = UniformPartitioner(module, parallel_context).split() # partition = get_model_partition(module, policy, parallel_context) - assert isinstance(partitions, list) - assert len(partitions) == pipeline_parallel_size - - for partition in partitions: - assert isinstance(partition, nn.Module) - assert partition != module - - outputs = inputs - for partition in partitions: - outputs = partition(outputs) + """ + i = 0 + for p in partitions: + print("partition: ", i) + print(p) + i += 1 + + """ -@pytest.mark.skip -@pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) +@pytest.mark.parametrize("pipeline_parallel_size", [2]) def test_naive_partitioning(pipeline_parallel_size): TENSOR_PARALLEL_SIZE = 1 DATA_PARALLEL_SIZE = 1 From afc7c3019b61bc7eb6ab6d381e4309b21951f029 Mon Sep 17 00:00:00 2001 From: Ayman Date: Sun, 5 Nov 2023 00:09:59 +0100 Subject: [PATCH 02/24] remove print from split method --- pipegoose/nn/pipeline_parallel/partitioner.py | 5 ----- tests/nn/pipeline_parallel/test_partitioner.py | 4 +--- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 1c91f90..5acadcd 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -57,11 +57,6 @@ def _flatten_model(model, parent_name=""): partitions.append(nn.Sequential(*prepared_model[start:end])) start = end - for partition in partitions: - print("--------------------------------------------------") - print(partition) - print("--------------------------------------------------") - return partitions diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index eac7393..6f8e4a0 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -36,14 +36,12 @@ def run_model_partitioner( partitions = UniformPartitioner(module, parallel_context).split() # partition = get_model_partition(module, policy, 
parallel_context) - """ i = 0 for p in partitions: + print("------------------------------------------------") print("partition: ", i) print(p) i += 1 - - """ @pytest.mark.parametrize("pipeline_parallel_size", [2]) From ef592e54e8343bce85039e1a1e017ea9e51b1d2b Mon Sep 17 00:00:00 2001 From: xrsrke Date: Wed, 15 Nov 2023 12:39:47 +0700 Subject: [PATCH 03/24] [Feature] Add set seed in ParallelContext --- pipegoose/distributed/parallel_context.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pipegoose/distributed/parallel_context.py b/pipegoose/distributed/parallel_context.py index 652d0f3..cd7262c 100644 --- a/pipegoose/distributed/parallel_context.py +++ b/pipegoose/distributed/parallel_context.py @@ -19,6 +19,7 @@ import random from typing import Dict, List, Literal +import numpy as np import torch import torch.distributed as dist import torch.distributed.rpc as rpc @@ -124,17 +125,12 @@ def __init__( self.init_global_dist(rank, world_size, backend, host, port) self.init_parallel_groups() - - # if torch.cuda.is_available(): - # self.set_device() - self.map_rank_to_device() self.rpc_worker_map = {rank: WORKER_NAME.format(rank) for rank in self.get_ranks_in_group(ParallelMode.GLOBAL)} self.init_rpc_workers(host, port) - # self.set_seed(seed) - + self.set_seed(seed) self._set_context() def _set_context(self): @@ -253,11 +249,12 @@ def set_device(self): def set_seed(self, seed: int): """Set seed for reproducibility.""" random.seed(seed) + np.random.seed(seed) torch.manual_seed(seed) - # TODO: set GPU seed - # if torch.cuda.is_available(): - # pass + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) def map_rank_to_device(self): """Map global rank to device.""" From f431b10808b572393eb1f3bba85ba45946cfdb69 Mon Sep 17 00:00:00 2001 From: Ayman Date: Wed, 15 Nov 2023 20:39:48 +0100 Subject: [PATCH 04/24] use MPTracer (HFTracer) --- pipegoose/nn/pipeline_parallel/partitioner.py | 326 +++++++++++++++++- .../nn/pipeline_parallel/test_partitioner.py | 11 +- 2 files changed, 323 insertions(+), 14 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 5acadcd..b762a35 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -1,12 +1,46 @@ +import torch +import inspect +import copy from abc import ABC, abstractclassmethod from enum import Enum, auto from typing import List - from torch import nn +from transformers.utils.fx import ( + HFTracer, + _generate_random_int, + transform_to_dynamic_input_, + _generate_supported_model_classes, + _SUPPORTED_MODELS, + _SUPPORTED_MODELS_FOR_DYNAMIC_AXES, + _wrap_method_for_model_tracing, + _reset_tensor_methods, +) +from transformers.models.auto import get_values +from packaging import version +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_FOR_MASKED_LM_MAPPING, + MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + MODEL_FOR_PRETRAINING_MAPPING, + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_MAPPING, + GPT2DoubleHeadsModel, + PretrainedConfig, + PreTrainedModel, + logging, +) from pipegoose.distributed.parallel_context import ParallelContext from pipegoose.distributed.parallel_mode import ParallelMode +_GLOBAL_ARGS = None + 
class PartitionPolicy(Enum): UNIFORM = auto() @@ -19,6 +53,287 @@ class BasePartitioner(ABC): def split(self) -> List[nn.Module]: raise NotImplementedError + """_summary_ + hf_fx_compatibility(model) + + _description_ + Check if the model is compatible with the HFTracer + + """ + + +def hf_fx_compatibility(model): + added_model = tuple(_generate_supported_model_classes("vit")) + transformers_fx_models = tuple( + _SUPPORTED_MODELS + _SUPPORTED_MODELS_FOR_DYNAMIC_AXES + added_model + ) + if isinstance(model, PreTrainedModel) and isinstance(model, transformers_fx_models): + return True + else: + return False + + +def get_args(): + """Return arguments.""" + assert _GLOBAL_ARGS is not None, "{} is not initialized.".format("args") + return _GLOBAL_ARGS + + +class MpTracer(HFTracer): + def __init__( + self, + leaf_modules=(), + manual_input_shape=None, + trace_batch=None, + batch_size=1, + sequence_length=[128, 128], + num_choices=-1, + ): + super().__init__(batch_size, sequence_length, num_choices) + self.leaf_modules = leaf_modules + if manual_input_shape is not None: + self.encoder_shape = manual_input_shape + + self.trace_batch = trace_batch + + def is_manual_leaf_module(self, m): + for i in self.leaf_modules: + if isinstance(m, i): + return True + return False + + def is_leaf_module(self, m: torch.nn.Module, model_qualified_name: str) -> bool: + return super().is_leaf_module( + m, model_qualified_name + ) or self.is_manual_leaf_module(m) + + def _generate_dummy_input(self, model, input_name): + """Generates dummy input for model inference recording.""" + args = get_args() + model_class = model.__class__ + device = model.device + # device = 'cpu' + inputs_dict = dict() + if self.trace_batch is not None: + return self.trace_batch + + if input_name in ["labels", "start_positions", "end_positions"]: + batch_size = self.encoder_shape[0] + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict["labels"] = torch.ones( + batch_size, dtype=torch.long, device=device + ) + elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + inputs_dict["start_positions"] = torch.zeros( + batch_size, dtype=torch.long, device=device + ) + inputs_dict["end_positions"] = torch.zeros( + batch_size, dtype=torch.long, device=device + ) + elif model_class in [ + *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), + *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + ]: + inputs_dict["labels"] = torch.zeros( + batch_size, dtype=torch.long, device=device + ) + elif model_class in [ + *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(MODEL_FOR_MASKED_LM_MAPPING), + *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), + GPT2DoubleHeadsModel, + ]: + inputs_dict["labels"] = torch.zeros( + self.decoder_shape, dtype=torch.long, device=device + ) + elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + self.encoder_shape, dtype=torch.long, device=device + ) + else: + raise NotImplementedError(f"{model_class} not supported yet.") + + elif "mask" in input_name or "ids" in input_name: + shape = ( + self.encoder_shape + if "decoder" not in input_name + else self.decoder_shape + ) + inputs_dict[input_name] = torch.ones(shape, dtype=torch.long, device=device) + elif "pixel_values" in input_name: + shape = [ + self.encoder_shape[0], + model.config.num_channels, + model.config.image_size, + model.config.image_size, + ] + 
inputs_dict[input_name] = torch.ones( + shape, dtype=torch.float, device=device + ) + else: + shape = ( + self.encoder_shape + if "decoder" not in input_name + else self.decoder_shape + ) + shape += [model.config.hidden_size] + inputs_dict[input_name] = torch.ones( + shape, dtype=torch.float, device=device + ) + + if args.fp16 or args.half_precision_backend == "apex": + half_inputs_dict = {} + for k, v in inputs_dict.items(): + half_inputs_dict[k] = v.half() + inputs_dict = half_inputs_dict + + return inputs_dict + + def trace(self, root: PreTrainedModel, concrete_args=None, method_names=None): + if concrete_args is None: + concrete_args = {} + + sig = inspect.signature(root.forward) + input_names = sig.parameters.keys() - concrete_args.keys() + + self.record(root, input_names, method_names=method_names) + + for method_name, cache_name in self.recorded_methods.items(): + _wrap_method_for_model_tracing(root, method_name, cache_name) + + graph = torch.fx.Tracer.trace(self, root, concrete_args=concrete_args) + + _reset_tensor_methods(self.original_methods) + + torch_version = version.parse(torch.__version__) + if torch_version.minor <= 11: + # torch version compatibility + # https://github.com/huggingface/transformers/pull/17129 + # https://github.com/pytorch/pytorch/pull/59569 + for node in graph.nodes: + if node.op == "placeholder": + # Removing default values for inputs as the forward pass will fail with them. + if node.target in input_names: + node.args = () + # It is a concrete arg so it is not used and should be removed. + else: + graph.erase_node(node) + return graph + + +def symbolic_trace( + model, + input_names=None, + batch_size=1, + sequence_length=(128, 128), + num_choices=-1, + extra_leaf_modules=(), + trace_batch=None, +): + """ + Performs symbolic tracing on the model. + + Args: + model (:obj:`PretrainedModel`): + The model to trace. + input_names (:obj:`List[str]`, `optional`): + The names of the inputs of the traced model. If unset, model.dummy_inputs().keys() are used instead. + batch_size (:obj:`int`, `optional`, defaults to 1): + The batch size of the traced model inputs. + sequence_length (:obj:`int` or :obj:`List[int]]`): + The sequence length of the traced model inputs. For sequence-to-sequence models with different sequence + lengths between the encoder and the decoder inputs, this must be :obj:`[encoder_sequence_length, + decoder_sequence_length]`. + num_choices (:obj:`int`, `optional`, defaults to -1): + The number of possible choices for a multiple choice task. + + Returns: + :obj:`torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model. + + Example:: + + from transformers.utils.fx import symbolic_trace + traced_model = symbolic_trace( + model, + input_names=["input_ids", "attention_mask", "token_type_ids"], + batch_size=1, + sequence_length=128, + ) + """ + if input_names is None or input_names == []: + input_names = model.dummy_inputs.keys() + + sig = inspect.signature(model.forward) + concrete_args = { + p.name: p.default for p in sig.parameters.values() if p.name not in input_names + } + # print(concrete_args) + # Preparing HFTracer batch_size and sequence_lenght values for potential dynamic axes. 
+ use_dynamic_batch_size = batch_size <= 0 + if isinstance(sequence_length, (list, tuple)): + use_dynamic_sequence_length = sequence_length[0] <= 0 or sequence_length[1] <= 0 + elif isinstance(sequence_length, int): + use_dynamic_sequence_length = sequence_length <= 0 + else: + use_dynamic_sequence_length = False + + if use_dynamic_batch_size or use_dynamic_sequence_length: + forbidden_values = [ + model.config.num_attention_heads, + model.config.hidden_size, + model.config.hidden_size // model.config.num_attention_heads, + ] + if use_dynamic_batch_size: + batch_size = _generate_random_int(forbidden_values=forbidden_values) + forbidden_values.append(batch_size) + if use_dynamic_sequence_length: + encoder_sequence_length = _generate_random_int( + forbidden_values=forbidden_values + ) + forbidden_values.append(encoder_sequence_length) + decoder_sequence_length = _generate_random_int( + forbidden_values=forbidden_values + ) + sequence_length = [encoder_sequence_length, decoder_sequence_length] + + if isinstance(extra_leaf_modules, list): + extra_leaf_modules = tuple(extra_leaf_modules) + elif isinstance(extra_leaf_modules, nn.Module): + extra_leaf_modules = tuple([extra_leaf_modules]) + else: + assert isinstance(extra_leaf_modules, tuple), "leaf_modules should be tuple" + # Tracing. + tracer = MpTracer( + leaf_modules=default_leaf_modules + extra_leaf_modules, + trace_batch=trace_batch, + batch_size=batch_size, + sequence_length=sequence_length, + num_choices=num_choices, + ) + with torch.no_grad(): + traced_graph = tracer.trace(model, concrete_args=concrete_args) + traced = torch.fx.GraphModule(model, traced_graph) + dummy_inputs = {} + + for name in input_names: + dummy_inputs.update(tracer._generate_dummy_input(model, name)) + + del traced_graph, tracer + + traced.config = copy.deepcopy(model.config) + traced.num_choices = num_choices + + traced.use_dynamic_batch_size = use_dynamic_batch_size + traced.use_dynamic_sequence_length = use_dynamic_sequence_length + traced.static_batch_size = batch_size + traced.static_sequence_length = sequence_length + + transform_to_dynamic_input_(traced) + + return traced, dummy_inputs + class UniformPartitioner(BasePartitioner): def __init__(self, module: nn.Module, parallel_context: ParallelContext): @@ -31,6 +346,8 @@ def split(self) -> List[nn.Module]: partitions = [] start = 0 end = 0 + print("module") + print(hf_fx_compatibility(module)) def _flatten_model(model, parent_name=""): model_list = [] @@ -52,10 +369,9 @@ def _flatten_model(model, parent_name=""): return model_list prepared_model = _flatten_model(module) - for p in range(n_partitions): - end = start + len(prepared_model) // n_partitions - partitions.append(nn.Sequential(*prepared_model[start:end])) - start = end + for p in prepared_model: + print(type(p)) + print(p) return partitions diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 6f8e4a0..9e4ee28 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -6,8 +6,8 @@ ) from pipegoose.testing.utils import init_parallel_context, spawn -# MODEL_NAME = "sshleifer/tiny-gpt2" -MODEL_NAME = "bigscience/bloom-560m" +MODEL_NAME = "sshleifer/tiny-gpt2" +# MODEL_NAME = "bigscience/bloom-560m" def run_model_partitioner( @@ -36,13 +36,6 @@ def run_model_partitioner( partitions = UniformPartitioner(module, parallel_context).split() # partition = get_model_partition(module, policy, parallel_context) - i = 0 - for p in partitions: - 
print("------------------------------------------------") - print("partition: ", i) - print(p) - i += 1 - @pytest.mark.parametrize("pipeline_parallel_size", [2]) def test_naive_partitioning(pipeline_parallel_size): From 48a099b9929ad41e494ced9cb92421934df65007 Mon Sep 17 00:00:00 2001 From: Ayman Date: Fri, 17 Nov 2023 23:00:33 +0100 Subject: [PATCH 05/24] copy from fairscale --- pipegoose/nn/pipeline_parallel/partitioner.py | 521 +++++++----------- .../nn/pipeline_parallel/test_partitioner.py | 8 +- 2 files changed, 193 insertions(+), 336 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index b762a35..061114a 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -1,45 +1,18 @@ -import torch -import inspect -import copy from abc import ABC, abstractclassmethod from enum import Enum, auto from typing import List from torch import nn -from transformers.utils.fx import ( - HFTracer, - _generate_random_int, - transform_to_dynamic_input_, - _generate_supported_model_classes, - _SUPPORTED_MODELS, - _SUPPORTED_MODELS_FOR_DYNAMIC_AXES, - _wrap_method_for_model_tracing, - _reset_tensor_methods, -) +import torch +from typing import Dict +from torch.fx.node import Node +from typing import Set + from transformers.models.auto import get_values from packaging import version -from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - MODEL_FOR_MULTIPLE_CHOICE_MAPPING, - MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, - MODEL_FOR_PRETRAINING_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - MODEL_MAPPING, - GPT2DoubleHeadsModel, - PretrainedConfig, - PreTrainedModel, - logging, -) from pipegoose.distributed.parallel_context import ParallelContext from pipegoose.distributed.parallel_mode import ParallelMode - -_GLOBAL_ARGS = None +from transformers.utils.fx import symbolic_trace class PartitionPolicy(Enum): @@ -53,286 +26,55 @@ class BasePartitioner(ABC): def split(self) -> List[nn.Module]: raise NotImplementedError - """_summary_ - hf_fx_compatibility(model) - _description_ - Check if the model is compatible with the HFTracer - - """ - - -def hf_fx_compatibility(model): - added_model = tuple(_generate_supported_model_classes("vit")) - transformers_fx_models = tuple( - _SUPPORTED_MODELS + _SUPPORTED_MODELS_FOR_DYNAMIC_AXES + added_model - ) - if isinstance(model, PreTrainedModel) and isinstance(model, transformers_fx_models): - return True +def _get_count(param_count: Dict, node_name: str) -> int: + """Identify different mutations of a given node name.""" + # TODO(anj): This is not very stable since it is possible that the name + # may not be in the same format. Is there another way to identify nodes + # in a graph? 
+ if node_name in param_count: + return param_count[node_name] + elif node_name.split("_")[0] in param_count: + return param_count[node_name.split("_")[0]] else: - return False - - -def get_args(): - """Return arguments.""" - assert _GLOBAL_ARGS is not None, "{} is not initialized.".format("args") - return _GLOBAL_ARGS - - -class MpTracer(HFTracer): - def __init__( - self, - leaf_modules=(), - manual_input_shape=None, - trace_batch=None, - batch_size=1, - sequence_length=[128, 128], - num_choices=-1, - ): - super().__init__(batch_size, sequence_length, num_choices) - self.leaf_modules = leaf_modules - if manual_input_shape is not None: - self.encoder_shape = manual_input_shape + raise RuntimeError( + f"Unable to find match between param {param_count} and node {node_name}" + ) - self.trace_batch = trace_batch - def is_manual_leaf_module(self, m): - for i in self.leaf_modules: - if isinstance(m, i): - return True - return False +def _create_shard_to_param_count( + param_count: Dict, node_name_to_shard_id: Dict +) -> Dict: + """Utility to create a map from shard id to param count using existing state.""" - def is_leaf_module(self, m: torch.nn.Module, model_qualified_name: str) -> bool: - return super().is_leaf_module( - m, model_qualified_name - ) or self.is_manual_leaf_module(m) - - def _generate_dummy_input(self, model, input_name): - """Generates dummy input for model inference recording.""" - args = get_args() - model_class = model.__class__ - device = model.device - # device = 'cpu' - inputs_dict = dict() - if self.trace_batch is not None: - return self.trace_batch - - if input_name in ["labels", "start_positions", "end_positions"]: - batch_size = self.encoder_shape[0] - if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs_dict["labels"] = torch.ones( - batch_size, dtype=torch.long, device=device - ) - elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): - inputs_dict["start_positions"] = torch.zeros( - batch_size, dtype=torch.long, device=device - ) - inputs_dict["end_positions"] = torch.zeros( - batch_size, dtype=torch.long, device=device - ) - elif model_class in [ - *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), - *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), - *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), - ]: - inputs_dict["labels"] = torch.zeros( - batch_size, dtype=torch.long, device=device - ) - elif model_class in [ - *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), - *get_values(MODEL_FOR_CAUSAL_LM_MAPPING), - *get_values(MODEL_FOR_MASKED_LM_MAPPING), - *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), - GPT2DoubleHeadsModel, - ]: - inputs_dict["labels"] = torch.zeros( - self.decoder_shape, dtype=torch.long, device=device - ) - elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = torch.zeros( - self.encoder_shape, dtype=torch.long, device=device - ) - else: - raise NotImplementedError(f"{model_class} not supported yet.") - - elif "mask" in input_name or "ids" in input_name: - shape = ( - self.encoder_shape - if "decoder" not in input_name - else self.decoder_shape - ) - inputs_dict[input_name] = torch.ones(shape, dtype=torch.long, device=device) - elif "pixel_values" in input_name: - shape = [ - self.encoder_shape[0], - model.config.num_channels, - model.config.image_size, - model.config.image_size, - ] - inputs_dict[input_name] = torch.ones( - shape, dtype=torch.float, device=device - ) + shard_to_param_count: Dict[int, int] = {} + for node_name in 
node_name_to_shard_id.keys(): + try: + count = _get_count(param_count, node_name) + except RuntimeError: + continue + if node_name_to_shard_id[node_name] in shard_to_param_count: + shard_to_param_count[node_name_to_shard_id[node_name]] += count else: - shape = ( - self.encoder_shape - if "decoder" not in input_name - else self.decoder_shape - ) - shape += [model.config.hidden_size] - inputs_dict[input_name] = torch.ones( - shape, dtype=torch.float, device=device - ) - - if args.fp16 or args.half_precision_backend == "apex": - half_inputs_dict = {} - for k, v in inputs_dict.items(): - half_inputs_dict[k] = v.half() - inputs_dict = half_inputs_dict - - return inputs_dict - - def trace(self, root: PreTrainedModel, concrete_args=None, method_names=None): - if concrete_args is None: - concrete_args = {} - - sig = inspect.signature(root.forward) - input_names = sig.parameters.keys() - concrete_args.keys() - - self.record(root, input_names, method_names=method_names) - - for method_name, cache_name in self.recorded_methods.items(): - _wrap_method_for_model_tracing(root, method_name, cache_name) - - graph = torch.fx.Tracer.trace(self, root, concrete_args=concrete_args) - - _reset_tensor_methods(self.original_methods) - - torch_version = version.parse(torch.__version__) - if torch_version.minor <= 11: - # torch version compatibility - # https://github.com/huggingface/transformers/pull/17129 - # https://github.com/pytorch/pytorch/pull/59569 - for node in graph.nodes: - if node.op == "placeholder": - # Removing default values for inputs as the forward pass will fail with them. - if node.target in input_names: - node.args = () - # It is a concrete arg so it is not used and should be removed. - else: - graph.erase_node(node) - return graph - - -def symbolic_trace( - model, - input_names=None, - batch_size=1, - sequence_length=(128, 128), - num_choices=-1, - extra_leaf_modules=(), - trace_batch=None, -): - """ - Performs symbolic tracing on the model. - - Args: - model (:obj:`PretrainedModel`): - The model to trace. - input_names (:obj:`List[str]`, `optional`): - The names of the inputs of the traced model. If unset, model.dummy_inputs().keys() are used instead. - batch_size (:obj:`int`, `optional`, defaults to 1): - The batch size of the traced model inputs. - sequence_length (:obj:`int` or :obj:`List[int]]`): - The sequence length of the traced model inputs. For sequence-to-sequence models with different sequence - lengths between the encoder and the decoder inputs, this must be :obj:`[encoder_sequence_length, - decoder_sequence_length]`. - num_choices (:obj:`int`, `optional`, defaults to -1): - The number of possible choices for a multiple choice task. - - Returns: - :obj:`torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model. - - Example:: - - from transformers.utils.fx import symbolic_trace - traced_model = symbolic_trace( - model, - input_names=["input_ids", "attention_mask", "token_type_ids"], - batch_size=1, - sequence_length=128, - ) - """ - if input_names is None or input_names == []: - input_names = model.dummy_inputs.keys() - - sig = inspect.signature(model.forward) - concrete_args = { - p.name: p.default for p in sig.parameters.values() if p.name not in input_names - } - # print(concrete_args) - # Preparing HFTracer batch_size and sequence_lenght values for potential dynamic axes. 
- use_dynamic_batch_size = batch_size <= 0 - if isinstance(sequence_length, (list, tuple)): - use_dynamic_sequence_length = sequence_length[0] <= 0 or sequence_length[1] <= 0 - elif isinstance(sequence_length, int): - use_dynamic_sequence_length = sequence_length <= 0 - else: - use_dynamic_sequence_length = False - - if use_dynamic_batch_size or use_dynamic_sequence_length: - forbidden_values = [ - model.config.num_attention_heads, - model.config.hidden_size, - model.config.hidden_size // model.config.num_attention_heads, - ] - if use_dynamic_batch_size: - batch_size = _generate_random_int(forbidden_values=forbidden_values) - forbidden_values.append(batch_size) - if use_dynamic_sequence_length: - encoder_sequence_length = _generate_random_int( - forbidden_values=forbidden_values - ) - forbidden_values.append(encoder_sequence_length) - decoder_sequence_length = _generate_random_int( - forbidden_values=forbidden_values - ) - sequence_length = [encoder_sequence_length, decoder_sequence_length] - - if isinstance(extra_leaf_modules, list): - extra_leaf_modules = tuple(extra_leaf_modules) - elif isinstance(extra_leaf_modules, nn.Module): - extra_leaf_modules = tuple([extra_leaf_modules]) - else: - assert isinstance(extra_leaf_modules, tuple), "leaf_modules should be tuple" - # Tracing. - tracer = MpTracer( - leaf_modules=default_leaf_modules + extra_leaf_modules, - trace_batch=trace_batch, - batch_size=batch_size, - sequence_length=sequence_length, - num_choices=num_choices, - ) - with torch.no_grad(): - traced_graph = tracer.trace(model, concrete_args=concrete_args) - traced = torch.fx.GraphModule(model, traced_graph) - dummy_inputs = {} - - for name in input_names: - dummy_inputs.update(tracer._generate_dummy_input(model, name)) + shard_to_param_count[node_name_to_shard_id[node_name]] = count + return shard_to_param_count - del traced_graph, tracer - traced.config = copy.deepcopy(model.config) - traced.num_choices = num_choices +class _ExtendedLeafTracer(torch.fx.Tracer): + def __init__(self, leaf_modules: Set[torch.nn.Module]): + super().__init__() + self.leaf_modules = leaf_modules - traced.use_dynamic_batch_size = use_dynamic_batch_size - traced.use_dynamic_sequence_length = use_dynamic_sequence_length - traced.static_batch_size = batch_size - traced.static_sequence_length = sequence_length + def is_leaf_module(self, m: torch.nn.Module, model_qualified_name: str) -> bool: + return super().is_leaf_module(m, model_qualified_name) or m in self.leaf_modules - transform_to_dynamic_input_(traced) - return traced, dummy_inputs +def _trace( + model: torch.nn.Module, leaf_modules: Set[torch.nn.Module] +) -> torch.fx.GraphModule: + tracer = _ExtendedLeafTracer(leaf_modules) + graph = tracer.trace(model) + return torch.fx.GraphModule(model, graph) class UniformPartitioner(BasePartitioner): @@ -340,40 +82,149 @@ def __init__(self, module: nn.Module, parallel_context: ParallelContext): self.module = module self.parallel_context = parallel_context - def split(self) -> List[nn.Module]: - n_partitions = self.parallel_context.pipeline_parallel_size - module = self.module - partitions = [] - start = 0 - end = 0 - print("module") - print(hf_fx_compatibility(module)) - - def _flatten_model(model, parent_name=""): - model_list = [] - for name, child_module in model.named_children(): - # Form the full name of the module - full_name = f"{parent_name}.{name}" if parent_name else name + def _split_nodes( + self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 + ) -> Dict: + """Utility used to trace 
a graph and identify shard cutpoints.""" + + node_name_to_shard_id: Dict[str, int] = {} + shard_id = 0 + nodes_so_far = [] + param_count: Dict[str, int] = {} + shard_to_param_count = {} + + # Find the total number of params in the model and + # the number of params per shard we are aiming for. + for name, module in traced_graph_module.named_modules(): + name = name.replace(".", "_") + param_count[name] = sum([x.numel() for x in module.parameters()]) + print(f"Total number of params are {param_count['']}") + per_shard_param = param_count[""] // shard_count + print(f"Per shard param count {per_shard_param}") + + for node in traced_graph_module.graph.nodes: + if node.op == "placeholder": + node_name_to_shard_id[node.name] = shard_id + nodes_so_far.append(node.name) + elif node.op in ["get_attr", "call_function", "call_method", "call_module"]: + min_shard_id = shard_id + min_node_name = "" + # For each of the args of a given node, find the arg that is not the + # last node we traversed. This is to help us find skip connections + # across shards. + for arg in node.args: + # If the node has args that are inputs to the forward function, they + # may not have explicit names. + if not hasattr(arg, "name"): + continue + + if ( + arg.name in node_name_to_shard_id + and arg.name != nodes_so_far[-1] + ): + if node_name_to_shard_id[arg.name] < min_shard_id: + min_shard_id = node_name_to_shard_id[arg.name] + min_node_name = arg.name + + # If there is an input that is not from the previous shard, + # we collapse all the shards in between to be part of 1 shard. + # and update the param count per shard accordingly. + if min_shard_id < shard_id: + for node_name in reversed(nodes_so_far): + node_name_to_shard_id[node_name] = min_shard_id + if node_name == min_node_name: + break + shard_id = min_shard_id + # TODO(anj-s): Find a way to raise an error early if this can cause OOM errors. + shard_to_param_count = _create_shard_to_param_count( + param_count, node_name_to_shard_id + ) + + # Update state that is tracking node -> shard id and shard id -> param count. + node_name_to_shard_id[node.name] = shard_id + nodes_so_far.append(node.name) + # TODO(anj): This could just be an update, we don't need to recreate the map. + shard_to_param_count = _create_shard_to_param_count( + param_count, node_name_to_shard_id + ) + # If we have gone over the number of params per shard count that we want to + # achieve, we should add a new shard. + # The shard_id may not have been updated in the map if we are at a node that does not + # have params. 
if ( - full_name == "transformer.h" - ): # Check if the module is the 'h' attribute - # If it's the 'h' ModuleList, append each of its blocks as a whole - for block in child_module: - model_list.append(block) - elif len(list(child_module.children())) == 0: - # If it's a leaf node, append the module itself - model_list.append(child_module) - else: - # Otherwise, continue flattening its children - model_list.extend(_flatten_model(child_module, full_name)) - return model_list - - prepared_model = _flatten_model(module) - for p in prepared_model: - print(type(p)) - print(p) - - return partitions + shard_id in shard_to_param_count + and shard_to_param_count[shard_id] > per_shard_param + ): + shard_id += 1 + elif node.op == "output": + break + return node_name_to_shard_id + + def split(self, input_names) -> List[nn.Module]: + n_partitions = self.parallel_context.pipeline_parallel_size + model = self.module + leaf_modules = set() + module_list: List[torch.fx.GraphModule] = [] + num_graphs = 0 + new_graph = torch.fx.Graph() # type: ignore + env: Dict[str, Node] = {} + new_input_node = None + + symbolic_traced_module = symbolic_trace(self.module) + + prev_shard_id = 1000 + prev_shard_node = None + + node_name_to_shard_id = self._split_nodes(symbolic_traced_module, n_partitions) + + prev_shard_id = 1000 + prev_node = None + for node in symbolic_traced_module.graph.nodes: + # If the current node is in the next shard, we insert an output node. + # A new graph is created and a placeholder is added for the next shard. + if ( + node.name in node_name_to_shard_id + and prev_shard_id < node_name_to_shard_id[node.name] + ): + assert prev_node, "prev_node cannot be None" + + with new_graph.inserting_after(prev_node): + new_graph.output(env[prev_node.name]) + num_graphs += 1 + module_list.append(torch.fx.GraphModule(model, new_graph)) + new_graph = torch.fx.Graph() + node_name = "placeholder" + str(num_graphs) + pl_node = new_graph.create_node("placeholder", node_name) + env[node_name] = pl_node + new_input_node = pl_node + + if new_input_node is not None: + # Account for a placeholder in the new graph. + node.args = (new_input_node,) + new_input_node = None + if node.op in [ + "placeholder", + "get_attr", + "call_function", + "call_method", + "call_module", + ]: + # Copy the nodes from the existing graph to the new graph. + new_node = new_graph.node_copy(node, lambda x: env[x.name]) + env[node.name] = new_node + elif node.op == "output": + # If this is the last node, we should add an output + # node and add the last graph to the list. 
+ assert prev_node, "prev_node cannot be None" + + with new_graph.inserting_after(prev_node): + new_graph.output(env[prev_node.name]) + module_list.append(torch.fx.GraphModule(model, new_graph)) + break + prev_node = new_node + prev_shard_id = node_name_to_shard_id[node.name] + + return module_list def _get_partitioner(policy: PartitionPolicy) -> BasePartitioner: diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 9e4ee28..6344440 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -33,9 +33,15 @@ def run_model_partitioner( inputs = tokenizer(text, return_tensors="pt", padding=True) # policy = PartitionPolicy.UNIFORM - partitions = UniformPartitioner(module, parallel_context).split() + partitions = UniformPartitioner(module, parallel_context).split(["input_ids"]) + # partition = get_model_partition(module, policy, parallel_context) + for p in partitions: + print("==================") + print(p) + print("==================") + @pytest.mark.parametrize("pipeline_parallel_size", [2]) def test_naive_partitioning(pipeline_parallel_size): From 89e473a4e2328f2b08fa1393bbe2e616319da98e Mon Sep 17 00:00:00 2001 From: Ayman Date: Fri, 17 Nov 2023 23:03:15 +0100 Subject: [PATCH 06/24] remove unused leaf tracer --- pipegoose/nn/pipeline_parallel/partitioner.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 061114a..921a818 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -60,23 +60,6 @@ def _create_shard_to_param_count( return shard_to_param_count -class _ExtendedLeafTracer(torch.fx.Tracer): - def __init__(self, leaf_modules: Set[torch.nn.Module]): - super().__init__() - self.leaf_modules = leaf_modules - - def is_leaf_module(self, m: torch.nn.Module, model_qualified_name: str) -> bool: - return super().is_leaf_module(m, model_qualified_name) or m in self.leaf_modules - - -def _trace( - model: torch.nn.Module, leaf_modules: Set[torch.nn.Module] -) -> torch.fx.GraphModule: - tracer = _ExtendedLeafTracer(leaf_modules) - graph = tracer.trace(model) - return torch.fx.GraphModule(model, graph) - - class UniformPartitioner(BasePartitioner): def __init__(self, module: nn.Module, parallel_context: ParallelContext): self.module = module From 1d0c0f1181487bb162632e6d8793f6e60735fc91 Mon Sep 17 00:00:00 2001 From: Ayman Date: Sat, 18 Nov 2023 17:27:20 +0100 Subject: [PATCH 07/24] new split nodes logic --- pipegoose/nn/pipeline_parallel/partitioner.py | 137 ++++++++---------- .../nn/pipeline_parallel/test_partitioner.py | 10 +- 2 files changed, 67 insertions(+), 80 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 921a818..a613ab2 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -15,6 +15,25 @@ from transformers.utils.fx import symbolic_trace +def _snake_case(s: str) -> str: + """ + Transforms the given string ``s`` to a Python-style variable name + + Examples: + ``mod.snake_case`` -> ``mod.snake_case`` + ``mod.pascalCase``-> ``mod.pascal_case`` + ``mod.ALL_CAPS`` -> ``mod.all_caps`` + """ + chars = [] + prev_lower = False + for c in s: + if prev_lower and c.isupper(): + chars.append("_") + chars.append(c.lower()) + prev_lower = c.islower() + return "".join(chars) + + 
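As a quick illustration (a standalone sketch, not taken from the patch itself, assuming the _snake_case helper just added above is in scope), the normalization matches the docstring, and the last line mirrors how the partitioner later flattens module names via _snake_case(name).replace(".", "_"):

    # sanity check of the _snake_case helper defined above
    assert _snake_case("snake_case") == "snake_case"
    assert _snake_case("pascalCase") == "pascal_case"
    assert _snake_case("ALL_CAPS") == "all_caps"
    # e.g. a GPT-2 module name, flattened the same way _split_nodes does it
    assert _snake_case("transformer.h.0.mlp.c_fc").replace(".", "_") == "transformer_h_0_mlp_c_fc"
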
class PartitionPolicy(Enum): UNIFORM = auto() @@ -61,89 +80,56 @@ def _create_shard_to_param_count( class UniformPartitioner(BasePartitioner): - def __init__(self, module: nn.Module, parallel_context: ParallelContext): + def __init__(self, module: nn.Module, parallel_context): self.module = module self.parallel_context = parallel_context - def _split_nodes( - self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 - ) -> Dict: - """Utility used to trace a graph and identify shard cutpoints.""" - - node_name_to_shard_id: Dict[str, int] = {} + def _split_nodes(self, traced_graph_module, shard_count) -> Dict[str, int]: + node_name_to_shard_id = {} shard_id = 0 - nodes_so_far = [] - param_count: Dict[str, int] = {} - shard_to_param_count = {} + param_count = {} + shard_to_param_count = [0] * shard_count - # Find the total number of params in the model and - # the number of params per shard we are aiming for. + # Calculate the number of params for each module for name, module in traced_graph_module.named_modules(): - name = name.replace(".", "_") - param_count[name] = sum([x.numel() for x in module.parameters()]) - print(f"Total number of params are {param_count['']}") - per_shard_param = param_count[""] // shard_count - print(f"Per shard param count {per_shard_param}") + name = _snake_case(name).replace(".", "_") + param_count[name] = sum(p.numel() for p in module.parameters()) + + # Calculate the number of params per shard + print(f"param_count: {param_count}") + total_params = param_count[""] + per_shard_param = total_params // shard_count + remainder = total_params % shard_count for node in traced_graph_module.graph.nodes: - if node.op == "placeholder": - node_name_to_shard_id[node.name] = shard_id - nodes_so_far.append(node.name) - elif node.op in ["get_attr", "call_function", "call_method", "call_module"]: - min_shard_id = shard_id - min_node_name = "" - # For each of the args of a given node, find the arg that is not the - # last node we traversed. This is to help us find skip connections - # across shards. - for arg in node.args: - # If the node has args that are inputs to the forward function, they - # may not have explicit names. - if not hasattr(arg, "name"): - continue - - if ( - arg.name in node_name_to_shard_id - and arg.name != nodes_so_far[-1] - ): - if node_name_to_shard_id[arg.name] < min_shard_id: - min_shard_id = node_name_to_shard_id[arg.name] - min_node_name = arg.name - - # If there is an input that is not from the previous shard, - # we collapse all the shards in between to be part of 1 shard. - # and update the param count per shard accordingly. - if min_shard_id < shard_id: - for node_name in reversed(nodes_so_far): - node_name_to_shard_id[node_name] = min_shard_id - if node_name == min_node_name: - break - shard_id = min_shard_id - # TODO(anj-s): Find a way to raise an error early if this can cause OOM errors. - shard_to_param_count = _create_shard_to_param_count( - param_count, node_name_to_shard_id - ) - - # Update state that is tracking node -> shard id and shard id -> param count. - node_name_to_shard_id[node.name] = shard_id - nodes_so_far.append(node.name) - # TODO(anj): This could just be an update, we don't need to recreate the map. - shard_to_param_count = _create_shard_to_param_count( - param_count, node_name_to_shard_id - ) - # If we have gone over the number of params per shard count that we want to - # achieve, we should add a new shard. - # The shard_id may not have been updated in the map if we are at a node that does not - # have params. 
- if ( - shard_id in shard_to_param_count - and shard_to_param_count[shard_id] > per_shard_param - ): + if node.op in ["output", "placeholder"]: + continue + + node_name = _snake_case(node.name).replace(".", "_") + node_param_count = param_count.get(node_name, 0) + + # Print node type and parameter count + print(f"Node '{node_name}' ({node.op}) has {node_param_count} parameters") + + if node.op in ["get_attr", "call_function", "call_method", "call_module"]: + # Handle specific node types here + + # Move to the next shard if the limit is exceeded and it's not the last shard + if shard_id < shard_count - 1 and shard_to_param_count[ + shard_id + ] + node_param_count > per_shard_param + (1 if remainder > 0 else 0): + remainder -= 1 shard_id += 1 - elif node.op == "output": - break + + node_name_to_shard_id[node.name] = shard_id + shard_to_param_count[shard_id] += node_param_count + + for i, count in enumerate(shard_to_param_count): + print(f"Shard {i} has {count} parameters") + return node_name_to_shard_id - def split(self, input_names) -> List[nn.Module]: + def split(self) -> List[nn.Module]: n_partitions = self.parallel_context.pipeline_parallel_size model = self.module leaf_modules = set() @@ -160,6 +146,11 @@ def split(self, input_names) -> List[nn.Module]: node_name_to_shard_id = self._split_nodes(symbolic_traced_module, n_partitions) + print(f"node_name_to_shard_id: {node_name_to_shard_id}") + + # for i, node in enumerate(node_name_to_shard_id): + # print(f"Node {i} is {node} and shard id is {node_name_to_shard_id[node]}") + prev_shard_id = 1000 prev_node = None for node in symbolic_traced_module.graph.nodes: @@ -205,7 +196,7 @@ def split(self, input_names) -> List[nn.Module]: module_list.append(torch.fx.GraphModule(model, new_graph)) break prev_node = new_node - prev_shard_id = node_name_to_shard_id[node.name] + # prev_shard_id = node_name_to_shard_id[node.name] return module_list diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 6344440..5bea4ab 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -27,23 +27,19 @@ def run_model_partitioner( data_parallel_size, ) module = AutoModelForCausalLM.from_pretrained(MODEL_NAME) + print(module) tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) tokenizer.pad_token = tokenizer.eos_token text = ["Hello world", "How are you?"] inputs = tokenizer(text, return_tensors="pt", padding=True) # policy = PartitionPolicy.UNIFORM - partitions = UniformPartitioner(module, parallel_context).split(["input_ids"]) + partitions = UniformPartitioner(module, parallel_context).split() # partition = get_model_partition(module, policy, parallel_context) - for p in partitions: - print("==================") - print(p) - print("==================") - -@pytest.mark.parametrize("pipeline_parallel_size", [2]) +@pytest.mark.parametrize("pipeline_parallel_size", [4]) def test_naive_partitioning(pipeline_parallel_size): TENSOR_PARALLEL_SIZE = 1 DATA_PARALLEL_SIZE = 1 From 50bdf0c6dfa7296bb1c450a6d743586ea6e38bab Mon Sep 17 00:00:00 2001 From: Daniel Grittner Date: Sat, 18 Nov 2023 22:48:44 +0100 Subject: [PATCH 08/24] partitioning WIP --- pipegoose/nn/pipeline_parallel/partitioner.py | 153 +++++++----------- .../nn/pipeline_parallel/test_partitioner.py | 114 ++++++++----- 2 files changed, 128 insertions(+), 139 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 
921a818..56ec244 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -27,124 +27,87 @@ def split(self) -> List[nn.Module]: raise NotImplementedError -def _get_count(param_count: Dict, node_name: str) -> int: - """Identify different mutations of a given node name.""" - # TODO(anj): This is not very stable since it is possible that the name - # may not be in the same format. Is there another way to identify nodes - # in a graph? - if node_name in param_count: - return param_count[node_name] - elif node_name.split("_")[0] in param_count: - return param_count[node_name.split("_")[0]] - else: - raise RuntimeError( - f"Unable to find match between param {param_count} and node {node_name}" - ) - - -def _create_shard_to_param_count( - param_count: Dict, node_name_to_shard_id: Dict -) -> Dict: - """Utility to create a map from shard id to param count using existing state.""" - - shard_to_param_count: Dict[int, int] = {} - for node_name in node_name_to_shard_id.keys(): - try: - count = _get_count(param_count, node_name) - except RuntimeError: - continue - if node_name_to_shard_id[node_name] in shard_to_param_count: - shard_to_param_count[node_name_to_shard_id[node_name]] += count - else: - shard_to_param_count[node_name_to_shard_id[node_name]] = count - return shard_to_param_count - - class UniformPartitioner(BasePartitioner): - def __init__(self, module: nn.Module, parallel_context: ParallelContext): + # def __init__(self, module: nn.Module, parallel_context: ParallelContext): + def __init__(self, module: nn.Module, n_partitions: int): self.module = module - self.parallel_context = parallel_context + self.n_partitions = n_partitions def _split_nodes( self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 ) -> Dict: """Utility used to trace a graph and identify shard cutpoints.""" - node_name_to_shard_id: Dict[str, int] = {} - shard_id = 0 nodes_so_far = [] param_count: Dict[str, int] = {} - shard_to_param_count = {} # Find the total number of params in the model and # the number of params per shard we are aiming for. + + # Note: we need to iterate over named_parameters AND named_modules because + # sometimes the parameters of a module is split into weight and bia in the + # traced graph and sometimes not. + # Example: + # The embedding in traced graph is called transformer_wte. The naming as parameter + # is transformer.wte.weight while as a module it is transformer.wte + # + # The projection inside the attention layer is split into weight and bias + # in the traced graph while in the module we only see the projection as a whole module. + + for name, param in traced_graph_module.named_parameters(): + print(f"{name} => {param.numel()}") + name = name.replace(".", "_") + param_count[name] = param.numel() + + total_param_count = 0 for name, module in traced_graph_module.named_modules(): + print(name) + if len(name) > 0 and name.count(".") == 0: + # also note that the parameters of the lm_head for some models (e.g. GPT2) are not + # considered in named_parameters(). therefore, we must count the parameters using + # named_modules. + # we recursively go deeper into the modules, we cannot naively count the parameters of each module, + # because we then would count the same parameter multiple times. hence, we only count the + # parameters of the top-level modules. 
+ total_param_count += sum([x.numel() for x in module.parameters()]) + name = name.replace(".", "_") param_count[name] = sum([x.numel() for x in module.parameters()]) - print(f"Total number of params are {param_count['']}") - per_shard_param = param_count[""] // shard_count + + print(f"Total number of params are {total_param_count}") + per_shard_param = total_param_count // shard_count print(f"Per shard param count {per_shard_param}") + + node_name_to_shard_id: Dict[str, int] = {} + shard_id = 0 + shard_id_to_param_count = [0 for _ in range(shard_count)] for node in traced_graph_module.graph.nodes: - if node.op == "placeholder": - node_name_to_shard_id[node.name] = shard_id - nodes_so_far.append(node.name) - elif node.op in ["get_attr", "call_function", "call_method", "call_module"]: - min_shard_id = shard_id - min_node_name = "" - # For each of the args of a given node, find the arg that is not the - # last node we traversed. This is to help us find skip connections - # across shards. - for arg in node.args: - # If the node has args that are inputs to the forward function, they - # may not have explicit names. - if not hasattr(arg, "name"): - continue - - if ( - arg.name in node_name_to_shard_id - and arg.name != nodes_so_far[-1] - ): - if node_name_to_shard_id[arg.name] < min_shard_id: - min_shard_id = node_name_to_shard_id[arg.name] - min_node_name = arg.name - - # If there is an input that is not from the previous shard, - # we collapse all the shards in between to be part of 1 shard. - # and update the param count per shard accordingly. - if min_shard_id < shard_id: - for node_name in reversed(nodes_so_far): - node_name_to_shard_id[node_name] = min_shard_id - if node_name == min_node_name: - break - shard_id = min_shard_id - # TODO(anj-s): Find a way to raise an error early if this can cause OOM errors. - shard_to_param_count = _create_shard_to_param_count( - param_count, node_name_to_shard_id - ) - - # Update state that is tracking node -> shard id and shard id -> param count. - node_name_to_shard_id[node.name] = shard_id - nodes_so_far.append(node.name) - # TODO(anj): This could just be an update, we don't need to recreate the map. - shard_to_param_count = _create_shard_to_param_count( - param_count, node_name_to_shard_id - ) - # If we have gone over the number of params per shard count that we want to - # achieve, we should add a new shard. - # The shard_id may not have been updated in the map if we are at a node that does not - # have params. 
- if ( - shard_id in shard_to_param_count - and shard_to_param_count[shard_id] > per_shard_param - ): - shard_id += 1 - elif node.op == "output": + if node.op == "output": break + + if node.op in ("call_module", "get_attr"): + # call_module and get_attr are the two operations which involve accessing parameters + print(f"\n{node.name} = {node.op} target={node.target} args={node.args} ===> {shard_id}") + print(f"Args and their shards: {[(arg.name, node_name_to_shard_id[arg.name]) for arg in node.args if hasattr(arg, 'name')]}") + + current_param_count = param_count.get(node.name, 0) + + # if shard_id_to_param_count[shard_id] >= per_shard_param and (shard_id + 1) < shard_count: + print(shard_id_to_param_count[shard_id] >= per_shard_param, shard_id_to_param_count[shard_id], per_shard_param) + if (shard_id_to_param_count[shard_id] + current_param_count) >= per_shard_param and (shard_id + 1) < shard_count: + shard_id += 1 + + shard_id_to_param_count[shard_id] += current_param_count + print(f"shard_id_to_param_count = {shard_id_to_param_count}") + + node_name_to_shard_id[node.name] = shard_id + return node_name_to_shard_id def split(self, input_names) -> List[nn.Module]: - n_partitions = self.parallel_context.pipeline_parallel_size + # n_partitions = self.parallel_context.pipeline_parallel_size + n_partitions = self.n_partitions # FIXME: model = self.module leaf_modules = set() module_list: List[torch.fx.GraphModule] = [] diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 6344440..a1528ae 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -1,57 +1,83 @@ import pytest -from torch import nn -from transformers import AutoModelForCausalLM, AutoTokenizer +import torch +from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config from pipegoose.nn.pipeline_parallel.partitioner import ( # PartitionPolicy,; get_model_partition, UniformPartitioner, ) from pipegoose.testing.utils import init_parallel_context, spawn -MODEL_NAME = "sshleifer/tiny-gpt2" -# MODEL_NAME = "bigscience/bloom-560m" - - -def run_model_partitioner( - rank, - world_size, - port, - tensor_parallel_size, - pipeline_parallel_size, - data_parallel_size, -): - parallel_context = init_parallel_context( - rank, - world_size, - port, - tensor_parallel_size, - pipeline_parallel_size, - data_parallel_size, - ) - module = AutoModelForCausalLM.from_pretrained(MODEL_NAME) - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - tokenizer.pad_token = tokenizer.eos_token - text = ["Hello world", "How are you?"] - inputs = tokenizer(text, return_tensors="pt", padding=True) - # policy = PartitionPolicy.UNIFORM - partitions = UniformPartitioner(module, parallel_context).split(["input_ids"]) +def get_small_gpt2_and_tokenizer(n_layer=12): + return GPT2LMHeadModel( + GPT2Config( + n_layer=n_layer + ) + ), AutoTokenizer.from_pretrained("gpt2") + + +# def run_model_partitioner( +# rank, +# world_size, +# port, +# tensor_parallel_size, +# pipeline_parallel_size, +# data_parallel_size, +# ): +# parallel_context = init_parallel_context( +# rank, +# world_size, +# port, +# tensor_parallel_size, +# pipeline_parallel_size, +# data_parallel_size, +# ) +# model = get_small_gpt2() +# partitions = UniformPartitioner(model, parallel_context).split(["input_ids"]) + +# # partition = get_model_partition(module, policy, parallel_context) + +# for p in partitions: +# print("==================") +# print(sum([x.numel() for x in 
p.parameters()])) +# print("==================") + +# assert False + + +@pytest.mark.parametrize("pipeline_parallel_size", [4]) +def test_naive_partitioning(pipeline_parallel_size): + # TENSOR_PARALLEL_SIZE = 1 + # DATA_PARALLEL_SIZE = 1 + + # spawn( + # run_model_partitioner, + # world_size=pipeline_parallel_size, + # tensor_parallel_size=TENSOR_PARALLEL_SIZE, + # pipeline_parallel_size=pipeline_parallel_size, + # data_parallel_size=DATA_PARALLEL_SIZE, + # ) + + batch_sentences = ["hello world from pipegoose"] + + model, tokenizer = get_small_gpt2_and_tokenizer() + tokenizer.pad_token = tokenizer.eos_token - # partition = get_model_partition(module, policy, parallel_context) + partitioned_model = UniformPartitioner(model, pipeline_parallel_size).split(["input_ids"]) - for p in partitions: + for p in partitioned_model: print("==================") - print(p) + # print(p) + print(sum([x.numel() for x in p.parameters()])) print("==================") + inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") -@pytest.mark.parametrize("pipeline_parallel_size", [2]) -def test_naive_partitioning(pipeline_parallel_size): - TENSOR_PARALLEL_SIZE = 1 - DATA_PARALLEL_SIZE = 1 - - spawn( - run_model_partitioner, - world_size=pipeline_parallel_size, - tensor_parallel_size=TENSOR_PARALLEL_SIZE, - pipeline_parallel_size=pipeline_parallel_size, - data_parallel_size=DATA_PARALLEL_SIZE, - ) + partitioned_model_result = inputs["input_ids"] + for partition_id in range(pipeline_parallel_size): + partitioned_model_result = partitioned_model[partition_id](partitioned_model_result) + + gt_result = model(**inputs) + + assert torch.allclose(gt_result, partitioned_model_result) + + assert False, "Debug" \ No newline at end of file From f36b085e6f258091262f1bced2819ba80fa3f7d9 Mon Sep 17 00:00:00 2001 From: Daniel Grittner Date: Sun, 19 Nov 2023 17:02:40 +0100 Subject: [PATCH 09/24] Working partitioning implementation --- pipegoose/nn/pipeline_parallel/partitioner.py | 96 ++++++++++--------- .../nn/pipeline_parallel/test_partitioner.py | 88 ++++++++--------- 2 files changed, 93 insertions(+), 91 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 56ec244..e9fc3ed 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -4,11 +4,7 @@ from torch import nn import torch from typing import Dict -from torch.fx.node import Node -from typing import Set - -from transformers.models.auto import get_values -from packaging import version +from collections import defaultdict from pipegoose.distributed.parallel_context import ParallelContext from pipegoose.distributed.parallel_mode import ParallelMode @@ -28,17 +24,15 @@ def split(self) -> List[nn.Module]: class UniformPartitioner(BasePartitioner): - # def __init__(self, module: nn.Module, parallel_context: ParallelContext): - def __init__(self, module: nn.Module, n_partitions: int): - self.module = module - self.n_partitions = n_partitions + def __init__(self, model: nn.Module, parallel_context: ParallelContext): + self.model = model + self.parallel_context = parallel_context def _split_nodes( self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 ) -> Dict: """Utility used to trace a graph and identify shard cutpoints.""" - nodes_so_far = [] param_count: Dict[str, int] = {} # Find the total number of params in the model and @@ -55,13 +49,11 @@ def _split_nodes( # in the traced graph while in the module we only see the 
projection as a whole module. for name, param in traced_graph_module.named_parameters(): - print(f"{name} => {param.numel()}") name = name.replace(".", "_") param_count[name] = param.numel() total_param_count = 0 for name, module in traced_graph_module.named_modules(): - print(name) if len(name) > 0 and name.count(".") == 0: # also note that the parameters of the lm_head for some models (e.g. GPT2) are not # considered in named_parameters(). therefore, we must count the parameters using @@ -78,10 +70,12 @@ def _split_nodes( per_shard_param = total_param_count // shard_count print(f"Per shard param count {per_shard_param}") - node_name_to_shard_id: Dict[str, int] = {} shard_id = 0 shard_id_to_param_count = [0 for _ in range(shard_count)] + + output_from_shard = {} + for node in traced_graph_module.graph.nodes: if node.op == "output": break @@ -100,54 +94,69 @@ def _split_nodes( shard_id_to_param_count[shard_id] += current_param_count print(f"shard_id_to_param_count = {shard_id_to_param_count}") - + + # we need to collect the nodes from the previous shard which are needed in the + # current shard because the previous shard needs to output them + if hasattr(node, "args"): + for arg in node.args: + if not hasattr(arg, "name"): + continue + + # the input to the current node is from the previous shard. remember it! + arg_shard_id = node_name_to_shard_id.get(arg.name, shard_id) + if arg_shard_id < shard_id: + # propagate the input from arg_shard_id until shard_id + for idx in range(arg_shard_id, shard_id): + # note that we use the dict as an ordered set data structure + output_from_shard.setdefault(idx, dict())[arg.name] = None + node_name_to_shard_id[node.name] = shard_id - return node_name_to_shard_id + return node_name_to_shard_id, output_from_shard - def split(self, input_names) -> List[nn.Module]: - # n_partitions = self.parallel_context.pipeline_parallel_size - n_partitions = self.n_partitions # FIXME: - model = self.module - leaf_modules = set() + def split(self, input_names: List[str]) -> List[nn.Module]: + n_partitions = self.parallel_context.pipeline_parallel_size + model = self.model module_list: List[torch.fx.GraphModule] = [] num_graphs = 0 new_graph = torch.fx.Graph() # type: ignore - env: Dict[str, Node] = {} - new_input_node = None - symbolic_traced_module = symbolic_trace(self.module) + symbolic_traced_module = symbolic_trace(model, input_names=input_names) - prev_shard_id = 1000 - prev_shard_node = None + node_name_to_shard_id, output_from_shard = self._split_nodes(symbolic_traced_module, n_partitions) - node_name_to_shard_id = self._split_nodes(symbolic_traced_module, n_partitions) + nodes_per_shard = defaultdict(dict) prev_shard_id = 1000 prev_node = None for node in symbolic_traced_module.graph.nodes: # If the current node is in the next shard, we insert an output node. # A new graph is created and a placeholder is added for the next shard. 
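(Aside, not part of the patch: the output/placeholder bookkeeping at each shard boundary can be illustrated in isolation. The sketch below is a simplified, standalone version of the idea with made-up node names and shard ids; the real code walks `torch.fx` nodes and their args. The point is that an activation produced in an early shard but consumed in a later one must be returned by every shard in between and re-declared as a placeholder input.)

from typing import Dict, List

def propagate_cross_shard_inputs(
    node_args: Dict[str, List[str]],   # node name -> names of the nodes it reads from
    node_to_shard: Dict[str, int],     # node name -> shard id
) -> Dict[int, List[str]]:
    """For each shard id, list the activations that shard must output for later shards."""
    outputs_per_shard: Dict[int, Dict[str, None]] = {}
    for node, args in node_args.items():
        shard = node_to_shard[node]
        for arg in args:
            arg_shard = node_to_shard.get(arg, shard)
            if arg_shard < shard:
                # every shard from arg_shard up to (but excluding) shard
                # has to forward this activation to the next one
                for idx in range(arg_shard, shard):
                    # dict used as an ordered set, as in the patch
                    outputs_per_shard.setdefault(idx, {})[arg] = None
    return {k: list(v) for k, v in outputs_per_shard.items()}

# hypothetical example: "residual" is produced in shard 0 but read again in shard 2
toy_args = {"embed": [], "block_0": ["embed"], "residual": ["embed"],
            "block_1": ["block_0"], "block_2": ["block_1", "residual"]}
toy_shards = {"embed": 0, "residual": 0, "block_0": 0, "block_1": 1, "block_2": 2}
print(propagate_cross_shard_inputs(toy_args, toy_shards))
# {0: ['block_0', 'residual'], 1: ['block_1', 'residual']}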
- if ( - node.name in node_name_to_shard_id - and prev_shard_id < node_name_to_shard_id[node.name] - ): + if node.name in node_name_to_shard_id and prev_shard_id < node_name_to_shard_id[node.name]: assert prev_node, "prev_node cannot be None" + # generate output node for the past graph/shard with new_graph.inserting_after(prev_node): - new_graph.output(env[prev_node.name]) + outputs = output_from_shard[prev_shard_id] + graph_output_names = [prev_node.name] + [i for i in outputs if i != prev_node.name] + + if isinstance(nodes_per_shard[prev_shard_id][prev_node.name], tuple): + graph_outputs = nodes_per_shard[prev_shard_id][prev_node.name] + \ + tuple([nodes_per_shard[prev_shard_id][i] for i in outputs if i != prev_node.name]) + else: + graph_outputs = tuple([nodes_per_shard[prev_shard_id][prev_node.name]] + \ + [nodes_per_shard[prev_shard_id][i] for i in outputs if i != prev_node.name]) + new_graph.create_node(op="output", target="output", args=(graph_outputs,)) + + # generate new graph/shard and its input nodes (i.e., the output from the previous graph/shard) num_graphs += 1 module_list.append(torch.fx.GraphModule(model, new_graph)) new_graph = torch.fx.Graph() - node_name = "placeholder" + str(num_graphs) - pl_node = new_graph.create_node("placeholder", node_name) - env[node_name] = pl_node - new_input_node = pl_node - - if new_input_node is not None: - # Account for a placeholder in the new graph. - node.args = (new_input_node,) - new_input_node = None + # generate placeholder nodes in the new graph/shard which matches the output nodes of the previous graph/shard + for new_graph_input_name in graph_output_names: + graph_input_node = new_graph.create_node("placeholder", new_graph_input_name) + nodes_per_shard[node_name_to_shard_id[node.name]][new_graph_input_name] = graph_input_node + if node.op in [ "placeholder", "get_attr", @@ -156,15 +165,16 @@ def split(self, input_names) -> List[nn.Module]: "call_module", ]: # Copy the nodes from the existing graph to the new graph. - new_node = new_graph.node_copy(node, lambda x: env[x.name]) - env[node.name] = new_node + current_shard_id = node_name_to_shard_id[node.name] + new_node = new_graph.node_copy(node, lambda x: nodes_per_shard[current_shard_id][x.name]) + nodes_per_shard[current_shard_id][node.name] = new_node elif node.op == "output": # If this is the last node, we should add an output # node and add the last graph to the list. 
assert prev_node, "prev_node cannot be None" with new_graph.inserting_after(prev_node): - new_graph.output(env[prev_node.name]) + new_graph.output(nodes_per_shard[prev_shard_id][prev_node.name]) module_list.append(torch.fx.GraphModule(model, new_graph)) break prev_node = new_node diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index a1528ae..8e5c355 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -15,58 +15,37 @@ def get_small_gpt2_and_tokenizer(n_layer=12): ), AutoTokenizer.from_pretrained("gpt2") -# def run_model_partitioner( -# rank, -# world_size, -# port, -# tensor_parallel_size, -# pipeline_parallel_size, -# data_parallel_size, -# ): -# parallel_context = init_parallel_context( -# rank, -# world_size, -# port, -# tensor_parallel_size, -# pipeline_parallel_size, -# data_parallel_size, -# ) -# model = get_small_gpt2() -# partitions = UniformPartitioner(model, parallel_context).split(["input_ids"]) - -# # partition = get_model_partition(module, policy, parallel_context) - -# for p in partitions: -# print("==================") -# print(sum([x.numel() for x in p.parameters()])) -# print("==================") - -# assert False - - -@pytest.mark.parametrize("pipeline_parallel_size", [4]) -def test_naive_partitioning(pipeline_parallel_size): - # TENSOR_PARALLEL_SIZE = 1 - # DATA_PARALLEL_SIZE = 1 - - # spawn( - # run_model_partitioner, - # world_size=pipeline_parallel_size, - # tensor_parallel_size=TENSOR_PARALLEL_SIZE, - # pipeline_parallel_size=pipeline_parallel_size, - # data_parallel_size=DATA_PARALLEL_SIZE, - # ) - +def run_model_partitioner( + rank, + world_size, + port, + tensor_parallel_size, + pipeline_parallel_size, + data_parallel_size, +): + parallel_context = init_parallel_context( + rank, + world_size, + port, + tensor_parallel_size, + pipeline_parallel_size, + data_parallel_size, + ) + + torch.manual_seed(0) batch_sentences = ["hello world from pipegoose"] model, tokenizer = get_small_gpt2_and_tokenizer() + model.eval() tokenizer.pad_token = tokenizer.eos_token + inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") + gt_logits = model(input_ids=inputs["input_ids"]).logits - partitioned_model = UniformPartitioner(model, pipeline_parallel_size).split(["input_ids"]) + partitioned_model = UniformPartitioner(model, parallel_context).split(["input_ids"]) + assert len(partitioned_model) == pipeline_parallel_size for p in partitioned_model: print("==================") - # print(p) print(sum([x.numel() for x in p.parameters()])) print("==================") @@ -74,10 +53,23 @@ def test_naive_partitioning(pipeline_parallel_size): partitioned_model_result = inputs["input_ids"] for partition_id in range(pipeline_parallel_size): - partitioned_model_result = partitioned_model[partition_id](partitioned_model_result) + if type(partitioned_model_result) in (list, tuple): + partitioned_model_result = partitioned_model[partition_id](*partitioned_model_result) + else: + partitioned_model_result = partitioned_model[partition_id](partitioned_model_result) - gt_result = model(**inputs) + assert torch.allclose(gt_logits, partitioned_model_result), "Results are not close" - assert torch.allclose(gt_result, partitioned_model_result) - assert False, "Debug" \ No newline at end of file +@pytest.mark.parametrize("pipeline_parallel_size", [2, 3, 4]) +def test_naive_partitioning(pipeline_parallel_size): + TENSOR_PARALLEL_SIZE = 1 + DATA_PARALLEL_SIZE = 1 + + 
spawn( + run_model_partitioner, + world_size=pipeline_parallel_size, + tensor_parallel_size=TENSOR_PARALLEL_SIZE, + pipeline_parallel_size=pipeline_parallel_size, + data_parallel_size=DATA_PARALLEL_SIZE, + ) From 2e06cdb12ad5f197fda8a3987a5c31e5e23df0e6 Mon Sep 17 00:00:00 2001 From: Daniel Grittner Date: Sun, 19 Nov 2023 19:23:24 +0100 Subject: [PATCH 10/24] clean up model partitioning code --- pipegoose/nn/pipeline_parallel/partitioner.py | 14 ++------------ tests/nn/pipeline_parallel/test_partitioner.py | 4 ++-- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index e9fc3ed..96870fe 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -66,9 +66,7 @@ def _split_nodes( name = name.replace(".", "_") param_count[name] = sum([x.numel() for x in module.parameters()]) - print(f"Total number of params are {total_param_count}") per_shard_param = total_param_count // shard_count - print(f"Per shard param count {per_shard_param}") node_name_to_shard_id: Dict[str, int] = {} shard_id = 0 @@ -82,27 +80,19 @@ def _split_nodes( if node.op in ("call_module", "get_attr"): # call_module and get_attr are the two operations which involve accessing parameters - print(f"\n{node.name} = {node.op} target={node.target} args={node.args} ===> {shard_id}") - print(f"Args and their shards: {[(arg.name, node_name_to_shard_id[arg.name]) for arg in node.args if hasattr(arg, 'name')]}") - current_param_count = param_count.get(node.name, 0) - - # if shard_id_to_param_count[shard_id] >= per_shard_param and (shard_id + 1) < shard_count: - print(shard_id_to_param_count[shard_id] >= per_shard_param, shard_id_to_param_count[shard_id], per_shard_param) if (shard_id_to_param_count[shard_id] + current_param_count) >= per_shard_param and (shard_id + 1) < shard_count: shard_id += 1 shard_id_to_param_count[shard_id] += current_param_count - print(f"shard_id_to_param_count = {shard_id_to_param_count}") - # we need to collect the nodes from the previous shard which are needed in the - # current shard because the previous shard needs to output them + # we need to collect the nodes from the previous shards which are needed in the + # current shard because we need to propagate them until the current shard if hasattr(node, "args"): for arg in node.args: if not hasattr(arg, "name"): continue - # the input to the current node is from the previous shard. remember it! 
arg_shard_id = node_name_to_shard_id.get(arg.name, shard_id) if arg_shard_id < shard_id: # propagate the input from arg_shard_id until shard_id diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 8e5c355..6b06b64 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -7,7 +7,7 @@ from pipegoose.testing.utils import init_parallel_context, spawn -def get_small_gpt2_and_tokenizer(n_layer=12): +def get_gpt2_and_tokenizer(n_layer=12): return GPT2LMHeadModel( GPT2Config( n_layer=n_layer @@ -35,7 +35,7 @@ def run_model_partitioner( torch.manual_seed(0) batch_sentences = ["hello world from pipegoose"] - model, tokenizer = get_small_gpt2_and_tokenizer() + model, tokenizer = get_gpt2_and_tokenizer() model.eval() tokenizer.pad_token = tokenizer.eos_token inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") From 290ae685f3b685ad8d4b0d415e6e454b45a1f5cb Mon Sep 17 00:00:00 2001 From: Daniel Grittner Date: Sun, 19 Nov 2023 23:36:41 +0100 Subject: [PATCH 11/24] fix partitioning --- pipegoose/nn/pipeline_parallel/partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index b77610d..36c653d 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -193,7 +193,7 @@ def split(self, input_names: List[str]) -> List[nn.Module]: module_list.append(torch.fx.GraphModule(model, new_graph)) break prev_node = new_node - # prev_shard_id = node_name_to_shard_id[node.name] + prev_shard_id = node_name_to_shard_id[node.name] return module_list From d7c3cf2a1b9cced31242da029e889644d433ea98 Mon Sep 17 00:00:00 2001 From: Daniel Grittner Date: Sun, 19 Nov 2023 23:37:58 +0100 Subject: [PATCH 12/24] remove unused function --- pipegoose/nn/pipeline_parallel/partitioner.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 36c653d..28d4679 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -11,25 +11,6 @@ from transformers.utils.fx import symbolic_trace -def _snake_case(s: str) -> str: - """ - Transforms the given string ``s`` to a Python-style variable name - - Examples: - ``mod.snake_case`` -> ``mod.snake_case`` - ``mod.pascalCase``-> ``mod.pascal_case`` - ``mod.ALL_CAPS`` -> ``mod.all_caps`` - """ - chars = [] - prev_lower = False - for c in s: - if prev_lower and c.isupper(): - chars.append("_") - chars.append(c.lower()) - prev_lower = c.islower() - return "".join(chars) - - class PartitionPolicy(Enum): UNIFORM = auto() @@ -137,11 +118,6 @@ def split(self, input_names: List[str]) -> List[nn.Module]: nodes_per_shard = defaultdict(dict) - print(f"node_name_to_shard_id: {node_name_to_shard_id}") - - # for i, node in enumerate(node_name_to_shard_id): - # print(f"Node {i} is {node} and shard id is {node_name_to_shard_id[node]}") - prev_shard_id = 1000 prev_node = None for node in symbolic_traced_module.graph.nodes: From 4494d627764f1aaf6d0d166cae8d2a35e2126f7c Mon Sep 17 00:00:00 2001 From: Ayman Date: Mon, 20 Nov 2023 00:33:08 +0100 Subject: [PATCH 13/24] ability to test multiple models --- pipegoose/nn/pipeline_parallel/partitioner.py | 69 ++++++++++++++----- .../nn/pipeline_parallel/test_partitioner.py | 40 ++++++++--- 2 files changed, 
79 insertions(+), 30 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 28d4679..5d5d5b6 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -28,7 +28,6 @@ def __init__(self, model: nn.Module, parallel_context: ParallelContext): self.model = model self.parallel_context = parallel_context - def _split_nodes( self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 ) -> Dict: @@ -48,11 +47,11 @@ def _split_nodes( # # The projection inside the attention layer is split into weight and bias # in the traced graph while in the module we only see the projection as a whole module. - + for name, param in traced_graph_module.named_parameters(): name = name.replace(".", "_") param_count[name] = param.numel() - + total_param_count = 0 for name, module in traced_graph_module.named_modules(): if len(name) > 0 and name.count(".") == 0: @@ -60,7 +59,7 @@ def _split_nodes( # considered in named_parameters(). therefore, we must count the parameters using # named_modules. # we recursively go deeper into the modules, we cannot naively count the parameters of each module, - # because we then would count the same parameter multiple times. hence, we only count the + # because we then would count the same parameter multiple times. hence, we only count the # parameters of the top-level modules. total_param_count += sum([x.numel() for x in module.parameters()]) @@ -82,7 +81,9 @@ def _split_nodes( if node.op in ("call_module", "get_attr"): # call_module and get_attr are the two operations which involve accessing parameters current_param_count = param_count.get(node.name, 0) - if (shard_id_to_param_count[shard_id] + current_param_count) >= per_shard_param and (shard_id + 1) < shard_count: + if ( + shard_id_to_param_count[shard_id] + current_param_count + ) >= per_shard_param and (shard_id + 1) < shard_count: shard_id += 1 shard_id_to_param_count[shard_id] += current_param_count @@ -91,7 +92,7 @@ def _split_nodes( # current shard because we need to propagate them until the current shard if hasattr(node, "args"): for arg in node.args: - if not hasattr(arg, "name"): + if not hasattr(arg, "name"): continue arg_shard_id = node_name_to_shard_id.get(arg.name, shard_id) @@ -114,7 +115,9 @@ def split(self, input_names: List[str]) -> List[nn.Module]: symbolic_traced_module = symbolic_trace(model, input_names=input_names) - node_name_to_shard_id, output_from_shard = self._split_nodes(symbolic_traced_module, n_partitions) + node_name_to_shard_id, output_from_shard = self._split_nodes( + symbolic_traced_module, n_partitions + ) nodes_per_shard = defaultdict(dict) @@ -123,21 +126,43 @@ def split(self, input_names: List[str]) -> List[nn.Module]: for node in symbolic_traced_module.graph.nodes: # If the current node is in the next shard, we insert an output node. # A new graph is created and a placeholder is added for the next shard. 
- if node.name in node_name_to_shard_id and prev_shard_id < node_name_to_shard_id[node.name]: + if ( + node.name in node_name_to_shard_id + and prev_shard_id < node_name_to_shard_id[node.name] + ): assert prev_node, "prev_node cannot be None" # generate output node for the past graph/shard with new_graph.inserting_after(prev_node): outputs = output_from_shard[prev_shard_id] - graph_output_names = [prev_node.name] + [i for i in outputs if i != prev_node.name] - - if isinstance(nodes_per_shard[prev_shard_id][prev_node.name], tuple): - graph_outputs = nodes_per_shard[prev_shard_id][prev_node.name] + \ - tuple([nodes_per_shard[prev_shard_id][i] for i in outputs if i != prev_node.name]) + graph_output_names = [prev_node.name] + [ + i for i in outputs if i != prev_node.name + ] + + if isinstance( + nodes_per_shard[prev_shard_id][prev_node.name], tuple + ): + graph_outputs = nodes_per_shard[prev_shard_id][ + prev_node.name + ] + tuple( + [ + nodes_per_shard[prev_shard_id][i] + for i in outputs + if i != prev_node.name + ] + ) else: - graph_outputs = tuple([nodes_per_shard[prev_shard_id][prev_node.name]] + \ - [nodes_per_shard[prev_shard_id][i] for i in outputs if i != prev_node.name]) - new_graph.create_node(op="output", target="output", args=(graph_outputs,)) + graph_outputs = tuple( + [nodes_per_shard[prev_shard_id][prev_node.name]] + + [ + nodes_per_shard[prev_shard_id][i] + for i in outputs + if i != prev_node.name + ] + ) + new_graph.create_node( + op="output", target="output", args=(graph_outputs,) + ) # generate new graph/shard and its input nodes (i.e., the output from the previous graph/shard) num_graphs += 1 @@ -145,8 +170,12 @@ def split(self, input_names: List[str]) -> List[nn.Module]: new_graph = torch.fx.Graph() # generate placeholder nodes in the new graph/shard which matches the output nodes of the previous graph/shard for new_graph_input_name in graph_output_names: - graph_input_node = new_graph.create_node("placeholder", new_graph_input_name) - nodes_per_shard[node_name_to_shard_id[node.name]][new_graph_input_name] = graph_input_node + graph_input_node = new_graph.create_node( + "placeholder", new_graph_input_name + ) + nodes_per_shard[node_name_to_shard_id[node.name]][ + new_graph_input_name + ] = graph_input_node if node.op in [ "placeholder", @@ -157,7 +186,9 @@ def split(self, input_names: List[str]) -> List[nn.Module]: ]: # Copy the nodes from the existing graph to the new graph. 
current_shard_id = node_name_to_shard_id[node.name] - new_node = new_graph.node_copy(node, lambda x: nodes_per_shard[current_shard_id][x.name]) + new_node = new_graph.node_copy( + node, lambda x: nodes_per_shard[current_shard_id][x.name] + ) nodes_per_shard[current_shard_id][node.name] = new_node elif node.op == "output": # If this is the last node, we should add an output diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 6b06b64..08aefa5 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -1,6 +1,12 @@ import pytest import torch -from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config +from transformers import ( + AutoTokenizer, + GPT2LMHeadModel, + GPT2Config, + BloomForCausalLM, + BloomConfig, +) from pipegoose.nn.pipeline_parallel.partitioner import ( # PartitionPolicy,; get_model_partition, UniformPartitioner, ) @@ -8,11 +14,15 @@ def get_gpt2_and_tokenizer(n_layer=12): - return GPT2LMHeadModel( - GPT2Config( - n_layer=n_layer - ) - ), AutoTokenizer.from_pretrained("gpt2") + return GPT2LMHeadModel(GPT2Config(n_layer=n_layer)), AutoTokenizer.from_pretrained( + "gpt2" + ) + + +def get_bloom_and_tokenizer(n_layer=12): + return BloomForCausalLM( + BloomConfig(n_layer=n_layer) + ), AutoTokenizer.from_pretrained("bigscience/bloom-560m") def run_model_partitioner( @@ -22,6 +32,7 @@ def run_model_partitioner( tensor_parallel_size, pipeline_parallel_size, data_parallel_size, + model_retrieval_func, ): parallel_context = init_parallel_context( rank, @@ -34,8 +45,7 @@ def run_model_partitioner( torch.manual_seed(0) batch_sentences = ["hello world from pipegoose"] - - model, tokenizer = get_gpt2_and_tokenizer() + model, tokenizer = model_retrieval_func() model.eval() tokenizer.pad_token = tokenizer.eos_token inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") @@ -54,15 +64,22 @@ def run_model_partitioner( partitioned_model_result = inputs["input_ids"] for partition_id in range(pipeline_parallel_size): if type(partitioned_model_result) in (list, tuple): - partitioned_model_result = partitioned_model[partition_id](*partitioned_model_result) + partitioned_model_result = partitioned_model[partition_id]( + *partitioned_model_result + ) else: - partitioned_model_result = partitioned_model[partition_id](partitioned_model_result) + partitioned_model_result = partitioned_model[partition_id]( + partitioned_model_result + ) assert torch.allclose(gt_logits, partitioned_model_result), "Results are not close" @pytest.mark.parametrize("pipeline_parallel_size", [2, 3, 4]) -def test_naive_partitioning(pipeline_parallel_size): +@pytest.mark.parametrize( + "model_retrieval_func", [get_gpt2_and_tokenizer, get_bloom_and_tokenizer] +) +def test_naive_partitioning(pipeline_parallel_size, model_retrieval_func): TENSOR_PARALLEL_SIZE = 1 DATA_PARALLEL_SIZE = 1 @@ -72,4 +89,5 @@ def test_naive_partitioning(pipeline_parallel_size): tensor_parallel_size=TENSOR_PARALLEL_SIZE, pipeline_parallel_size=pipeline_parallel_size, data_parallel_size=DATA_PARALLEL_SIZE, + model_retrieval_func=model_retrieval_func, ) From fa36cab77939f4ed3ded6489a40e533adb3197d6 Mon Sep 17 00:00:00 2001 From: Ayman Date: Mon, 20 Nov 2023 00:37:13 +0100 Subject: [PATCH 14/24] Update test_partitioner.py --- tests/nn/pipeline_parallel/test_partitioner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py 
index 08aefa5..28d6189 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -25,6 +25,7 @@ def get_bloom_and_tokenizer(n_layer=12): ), AutoTokenizer.from_pretrained("bigscience/bloom-560m") +# TODO: Also add a function for a generic nn.Transformer model def run_model_partitioner( rank, world_size, From 4c86ef9ee925a8c7a021199aa7f1ffef288a6ecf Mon Sep 17 00:00:00 2001 From: Ayman Date: Mon, 20 Nov 2023 09:58:20 +0100 Subject: [PATCH 15/24] Update test_partitioner.py --- tests/nn/pipeline_parallel/test_partitioner.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 28d6189..d1ac429 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -4,8 +4,7 @@ AutoTokenizer, GPT2LMHeadModel, GPT2Config, - BloomForCausalLM, - BloomConfig, + AutoModelForCausalLM, ) from pipegoose.nn.pipeline_parallel.partitioner import ( # PartitionPolicy,; get_model_partition, UniformPartitioner, @@ -19,9 +18,9 @@ def get_gpt2_and_tokenizer(n_layer=12): ) -def get_bloom_and_tokenizer(n_layer=12): - return BloomForCausalLM( - BloomConfig(n_layer=n_layer) +def get_bloom_and_tokenizer(): + return AutoModelForCausalLM.from_pretrained( + "bigscience/bloom-560m" ), AutoTokenizer.from_pretrained("bigscience/bloom-560m") From e69466da957aa1ac6628e464361a4140e89dd777 Mon Sep 17 00:00:00 2001 From: Daniel Grittner Date: Tue, 21 Nov 2023 16:34:06 +0100 Subject: [PATCH 16/24] fix model partitioning and add more tests --- pipegoose/nn/pipeline_parallel/partitioner.py | 25 +++++++++------- .../nn/pipeline_parallel/test_partitioner.py | 30 +++++++++++-------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 5d5d5b6..6334553 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -52,21 +52,23 @@ def _split_nodes( name = name.replace(".", "_") param_count[name] = param.numel() - total_param_count = 0 + exclude_param_count = 0 for name, module in traced_graph_module.named_modules(): - if len(name) > 0 and name.count(".") == 0: - # also note that the parameters of the lm_head for some models (e.g. GPT2) are not - # considered in named_parameters(). therefore, we must count the parameters using - # named_modules. - # we recursively go deeper into the modules, we cannot naively count the parameters of each module, - # because we then would count the same parameter multiple times. hence, we only count the - # parameters of the top-level modules. - total_param_count += sum([x.numel() for x in module.parameters()]) + # exclude embedding layers + if isinstance(module, nn.Embedding): + name = name.replace(".", "_") + exclude_param_count += sum([x.numel() for x in module.parameters()]) + param_count[name] = 0 + continue name = name.replace(".", "_") param_count[name] = sum([x.numel() for x in module.parameters()]) - per_shard_param = total_param_count // shard_count + # Note that we use param_count[""] as total parameters which does not include the + # lm_head. We want to exclude the lm_head and the embeddings here because they + # usually cause high skew which does not lead to equal partitioning of the + # transformer blocks. 
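(Aside, not part of the patch: rough arithmetic, with hypothetical parameter counts, showing why the embeddings are excluded from the per-shard budget. If a ~560M-parameter model carries ~250M parameters in its word embeddings, budgeting over the full count would let the shard holding the embedding blow past its target immediately; budgeting only over the remaining parameters keeps the transformer blocks evenly spread.)

# Illustrative arithmetic only (numbers are assumptions, not measured from a checkpoint).
total_params = 560_000_000
embedding_params = 250_000_000   # excluded from the budget, as in the patch
shard_count = 4

naive_budget = total_params // shard_count                       # 140_000_000
per_shard_budget = (total_params - embedding_params) // shard_count  # 77_500_000
print(naive_budget, per_shard_budget)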
+ per_shard_param = (param_count[""] - exclude_param_count) // shard_count node_name_to_shard_id: Dict[str, int] = {} shard_id = 0 @@ -81,7 +83,8 @@ def _split_nodes( if node.op in ("call_module", "get_attr"): # call_module and get_attr are the two operations which involve accessing parameters current_param_count = param_count.get(node.name, 0) - if ( + + if shard_id_to_param_count[shard_id] > 0 and ( shard_id_to_param_count[shard_id] + current_param_count ) >= per_shard_param and (shard_id + 1) < shard_count: shard_id += 1 diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index d1ac429..6b80509 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -2,9 +2,9 @@ import torch from transformers import ( AutoTokenizer, - GPT2LMHeadModel, - GPT2Config, AutoModelForCausalLM, + BloomConfig, + BloomForCausalLM ) from pipegoose.nn.pipeline_parallel.partitioner import ( # PartitionPolicy,; get_model_partition, UniformPartitioner, @@ -12,16 +12,16 @@ from pipegoose.testing.utils import init_parallel_context, spawn -def get_gpt2_and_tokenizer(n_layer=12): - return GPT2LMHeadModel(GPT2Config(n_layer=n_layer)), AutoTokenizer.from_pretrained( - "gpt2" - ) +def get_gpt2_and_tokenizer(): + return AutoModelForCausalLM.from_pretrained("gpt2"), AutoTokenizer.from_pretrained("gpt2") + + +def get_bloom_560m_and_tokenizer(): + return AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m"), AutoTokenizer.from_pretrained("bigscience/bloom-560m") -def get_bloom_and_tokenizer(): - return AutoModelForCausalLM.from_pretrained( - "bigscience/bloom-560m" - ), AutoTokenizer.from_pretrained("bigscience/bloom-560m") +def get_bloom_and_tokenizer_with_6_layers(): + return BloomForCausalLM(BloomConfig(n_layer=6)), AutoTokenizer.from_pretrained("bigscience/bloom-560m") # TODO: Also add a function for a generic nn.Transformer model @@ -52,7 +52,7 @@ def run_model_partitioner( gt_logits = model(input_ids=inputs["input_ids"]).logits partitioned_model = UniformPartitioner(model, parallel_context).split(["input_ids"]) - assert len(partitioned_model) == pipeline_parallel_size + assert len(partitioned_model) == pipeline_parallel_size, f"Received model with {len(partitioned_model)} instead of {pipeline_parallel_size}" for p in partitioned_model: print("==================") @@ -75,9 +75,13 @@ def run_model_partitioner( assert torch.allclose(gt_logits, partitioned_model_result), "Results are not close" -@pytest.mark.parametrize("pipeline_parallel_size", [2, 3, 4]) +@pytest.mark.parametrize("pipeline_parallel_size", [2, 3, 4, 5, 6]) @pytest.mark.parametrize( - "model_retrieval_func", [get_gpt2_and_tokenizer, get_bloom_and_tokenizer] + "model_retrieval_func", [ + get_gpt2_and_tokenizer, + get_bloom_and_tokenizer_with_6_layers, + get_bloom_560m_and_tokenizer + ] ) def test_naive_partitioning(pipeline_parallel_size, model_retrieval_func): TENSOR_PARALLEL_SIZE = 1 From 7e5f3a1050558ffdd4eb219f8af74dd3d2ffb48d Mon Sep 17 00:00:00 2001 From: Ayman Date: Sat, 25 Nov 2023 23:24:14 +0100 Subject: [PATCH 17/24] detect start and end of block --- pipegoose/nn/pipeline_parallel/partitioner.py | 38 ++++++++++++++---- .../nn/pipeline_parallel/test_partitioner.py | 39 +++++++++++++------ 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 6334553..575c731 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ 
b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -5,7 +5,7 @@ import torch from typing import Dict from collections import defaultdict - +import re from pipegoose.distributed.parallel_context import ParallelContext from pipegoose.distributed.parallel_mode import ParallelMode from transformers.utils.fx import symbolic_trace @@ -28,6 +28,14 @@ def __init__(self, model: nn.Module, parallel_context: ParallelContext): self.model = model self.parallel_context = parallel_context + def _matches_transformer_block_start(self, node_name: str) -> bool: + pattern = r"^transformer_h_\d+_input_layernorm$" + return re.match(pattern, node_name) is not None + + def _matches_transformer_block_end(self, node_name: str) -> bool: + pattern = r"^transformer_h_\d+_mlp_dense_\d+h_to_h$" + return re.match(pattern, node_name) is not None + def _split_nodes( self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 ) -> Dict: @@ -69,26 +77,42 @@ def _split_nodes( # usually cause high skew which does not lead to equal partitioning of the # transformer blocks. per_shard_param = (param_count[""] - exclude_param_count) // shard_count - + original_per_shard_param = per_shard_param node_name_to_shard_id: Dict[str, int] = {} shard_id = 0 shard_id_to_param_count = [0 for _ in range(shard_count)] output_from_shard = {} - + is_transformer_block_ended = True for node in traced_graph_module.graph.nodes: if node.op == "output": break + ##While splitting the nodes, we have to check if a node of the transformer block is split into + # different shards. if node.op in ("call_module", "get_attr"): + if self._matches_transformer_block_start( + node.name + ) and not self._matches_transformer_block_end(node.name): + per_shard_param += 1 + print("started: ", node) + elif self._matches_transformer_block_end(node.name): + per_shard_param += 2 + + print("ended: ", node.name) + + print(node.name, shard_id) # call_module and get_attr are the two operations which involve accessing parameters current_param_count = param_count.get(node.name, 0) - if shard_id_to_param_count[shard_id] > 0 and ( - shard_id_to_param_count[shard_id] + current_param_count - ) >= per_shard_param and (shard_id + 1) < shard_count: + if ( + shard_id_to_param_count[shard_id] > 0 + and (shard_id_to_param_count[shard_id] + current_param_count) + >= per_shard_param + and (shard_id + 1) < shard_count + ): shard_id += 1 - + per_shard_param = original_per_shard_param shard_id_to_param_count[shard_id] += current_param_count # we need to collect the nodes from the previous shards which are needed in the diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 6b80509..0468b9b 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -4,7 +4,7 @@ AutoTokenizer, AutoModelForCausalLM, BloomConfig, - BloomForCausalLM + BloomForCausalLM, ) from pipegoose.nn.pipeline_parallel.partitioner import ( # PartitionPolicy,; get_model_partition, UniformPartitioner, @@ -13,15 +13,21 @@ def get_gpt2_and_tokenizer(): - return AutoModelForCausalLM.from_pretrained("gpt2"), AutoTokenizer.from_pretrained("gpt2") + return AutoModelForCausalLM.from_pretrained("gpt2"), AutoTokenizer.from_pretrained( + "gpt2" + ) def get_bloom_560m_and_tokenizer(): - return AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m"), AutoTokenizer.from_pretrained("bigscience/bloom-560m") + return AutoModelForCausalLM.from_pretrained( + "bigscience/bloom-560m" + ), 
AutoTokenizer.from_pretrained("bigscience/bloom-560m") def get_bloom_and_tokenizer_with_6_layers(): - return BloomForCausalLM(BloomConfig(n_layer=6)), AutoTokenizer.from_pretrained("bigscience/bloom-560m") + return BloomForCausalLM(BloomConfig(n_layer=6)), AutoTokenizer.from_pretrained( + "bigscience/bloom-560m" + ) # TODO: Also add a function for a generic nn.Transformer model @@ -52,12 +58,22 @@ def run_model_partitioner( gt_logits = model(input_ids=inputs["input_ids"]).logits partitioned_model = UniformPartitioner(model, parallel_context).split(["input_ids"]) - assert len(partitioned_model) == pipeline_parallel_size, f"Received model with {len(partitioned_model)} instead of {pipeline_parallel_size}" + assert ( + len(partitioned_model) == pipeline_parallel_size + ), f"Received model with {len(partitioned_model)} instead of {pipeline_parallel_size}" + """ + print("start") for p in partitioned_model: print("==================") - print(sum([x.numel() for x in p.parameters()])) + for k, v in p.named_children(): + print(f"Layer type: {k}") + print(v) print("==================") + print("end") + + + """ inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") @@ -75,13 +91,12 @@ def run_model_partitioner( assert torch.allclose(gt_logits, partitioned_model_result), "Results are not close" -@pytest.mark.parametrize("pipeline_parallel_size", [2, 3, 4, 5, 6]) +@pytest.mark.parametrize("pipeline_parallel_size", [5]) @pytest.mark.parametrize( - "model_retrieval_func", [ - get_gpt2_and_tokenizer, - get_bloom_and_tokenizer_with_6_layers, - get_bloom_560m_and_tokenizer - ] + "model_retrieval_func", + [ + get_bloom_560m_and_tokenizer, + ], ) def test_naive_partitioning(pipeline_parallel_size, model_retrieval_func): TENSOR_PARALLEL_SIZE = 1 From 5c0ddd76c80c533056cb823a914eb58a5369c471 Mon Sep 17 00:00:00 2001 From: Ayman Date: Sun, 26 Nov 2023 01:38:04 +0100 Subject: [PATCH 18/24] semaphore-like implementation It must use the transformers_h_X part to indentify the block, since from transformer_h_X to end changes from model to model --- pipegoose/nn/pipeline_parallel/partitioner.py | 52 ++++++++++--------- .../nn/pipeline_parallel/test_partitioner.py | 8 ++- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 575c731..91bdaa9 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -10,6 +10,8 @@ from pipegoose.distributed.parallel_mode import ParallelMode from transformers.utils.fx import symbolic_trace +from typing import Optional + class PartitionPolicy(Enum): UNIFORM = auto() @@ -28,13 +30,9 @@ def __init__(self, model: nn.Module, parallel_context: ParallelContext): self.model = model self.parallel_context = parallel_context - def _matches_transformer_block_start(self, node_name: str) -> bool: - pattern = r"^transformer_h_\d+_input_layernorm$" - return re.match(pattern, node_name) is not None - - def _matches_transformer_block_end(self, node_name: str) -> bool: - pattern = r"^transformer_h_\d+_mlp_dense_\d+h_to_h$" - return re.match(pattern, node_name) is not None + def _get_transformer_block_id(self, node_name: str) -> Optional[str]: + match = re.match(r"transformer_h_(\d+)", node_name) + return match.group(1) if match else None def _split_nodes( self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 @@ -81,38 +79,42 @@ def _split_nodes( node_name_to_shard_id: Dict[str, int] = {} shard_id = 0 
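(Aside, not part of the patch: the regex above only has to recover the block index from the flattened fx node names. A quick standalone check, using made-up node names that follow the `transformer_h_<idx>_...` pattern, behaves as follows.)

import re
from typing import Optional

def get_transformer_block_id(node_name: str) -> Optional[str]:
    # mirrors the helper above: pull the block index out of names like
    # "transformer_h_3_self_attention_dense"; returns None for non-block nodes
    match = re.match(r"transformer_h_(\d+)", node_name)
    return match.group(1) if match else None

# hypothetical node names of the kind produced by symbolic tracing
for name in ["transformer_word_embeddings",
             "transformer_h_0_input_layernorm",
             "transformer_h_0_mlp_dense_4h_to_h",
             "transformer_h_1_self_attention_query_key_value",
             "lm_head"]:
    print(name, "->", get_transformer_block_id(name))
# transformer_word_embeddings -> None
# transformer_h_0_input_layernorm -> 0
# transformer_h_0_mlp_dense_4h_to_h -> 0
# transformer_h_1_self_attention_query_key_value -> 1
# lm_head -> None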
shard_id_to_param_count = [0 for _ in range(shard_count)] - output_from_shard = {} - is_transformer_block_ended = True + transformer_block_ended = False + current_transformer_block = 0 + for node in traced_graph_module.graph.nodes: if node.op == "output": break - ##While splitting the nodes, we have to check if a node of the transformer block is split into - # different shards. + # While splitting the nodes, we have to check if a node of the transformer block is detected + # If so, we have to force it in the current shard if node.op in ("call_module", "get_attr"): - if self._matches_transformer_block_start( - node.name - ) and not self._matches_transformer_block_end(node.name): - per_shard_param += 1 - print("started: ", node) - elif self._matches_transformer_block_end(node.name): - per_shard_param += 2 - - print("ended: ", node.name) - - print(node.name, shard_id) - # call_module and get_attr are the two operations which involve accessing parameters current_param_count = param_count.get(node.name, 0) + new_transformer_block_id = self._get_transformer_block_id(node.name) + # Check if a new transformer block has started + print( + new_transformer_block_id, + current_transformer_block, + new_transformer_block_id != current_transformer_block, + ) + if new_transformer_block_id != current_transformer_block: + # End the previous block and start a new one + current_transformer_block = new_transformer_block_id + transformer_block_ended = True + else: + # We are still in the same transformer block + transformer_block_ended = False if ( shard_id_to_param_count[shard_id] > 0 and (shard_id_to_param_count[shard_id] + current_param_count) >= per_shard_param and (shard_id + 1) < shard_count + and transformer_block_ended ): shard_id += 1 - per_shard_param = original_per_shard_param + transformer_block_ended = False shard_id_to_param_count[shard_id] += current_param_count # we need to collect the nodes from the previous shards which are needed in the @@ -130,7 +132,7 @@ def _split_nodes( output_from_shard.setdefault(idx, dict())[arg.name] = None node_name_to_shard_id[node.name] = shard_id - + print("node: ", node.name, shard_id) return node_name_to_shard_id, output_from_shard def split(self, input_names: List[str]) -> List[nn.Module]: diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 0468b9b..c0e1a96 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -62,7 +62,6 @@ def run_model_partitioner( len(partitioned_model) == pipeline_parallel_size ), f"Received model with {len(partitioned_model)} instead of {pipeline_parallel_size}" - """ print("start") for p in partitioned_model: print("==================") @@ -72,9 +71,6 @@ def run_model_partitioner( print("==================") print("end") - - """ - inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") partitioned_model_result = inputs["input_ids"] @@ -95,7 +91,9 @@ def run_model_partitioner( @pytest.mark.parametrize( "model_retrieval_func", [ - get_bloom_560m_and_tokenizer, + # get_gpt2_and_tokenizer, + get_bloom_and_tokenizer_with_6_layers, + # get_bloom_560m_and_tokenizer, ], ) def test_naive_partitioning(pipeline_parallel_size, model_retrieval_func): From c1c45480c3c039c6fc4d1c9c57de61421a37df25 Mon Sep 17 00:00:00 2001 From: Ayman Date: Sun, 26 Nov 2023 01:54:00 +0100 Subject: [PATCH 19/24] remove log.txt file --- tests/nn/pipeline_parallel/test_partitioner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index c0e1a96..ae84304 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -87,13 +87,13 @@ def run_model_partitioner( assert torch.allclose(gt_logits, partitioned_model_result), "Results are not close" -@pytest.mark.parametrize("pipeline_parallel_size", [5]) +@pytest.mark.parametrize("pipeline_parallel_size", [2, 3, 4, 5, 6]) @pytest.mark.parametrize( "model_retrieval_func", [ - # get_gpt2_and_tokenizer, + get_gpt2_and_tokenizer, get_bloom_and_tokenizer_with_6_layers, - # get_bloom_560m_and_tokenizer, + get_bloom_560m_and_tokenizer, ], ) def test_naive_partitioning(pipeline_parallel_size, model_retrieval_func): From 5dd03a75c3ffae599c5fea4e48d980eeef7e442c Mon Sep 17 00:00:00 2001 From: Ayman Date: Sun, 26 Nov 2023 02:41:45 +0100 Subject: [PATCH 20/24] better debugging --- pipegoose/nn/pipeline_parallel/partitioner.py | 6 ------ .../nn/pipeline_parallel/test_partitioner.py | 21 ++++++++++++------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 91bdaa9..085aff5 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -93,11 +93,6 @@ def _split_nodes( current_param_count = param_count.get(node.name, 0) new_transformer_block_id = self._get_transformer_block_id(node.name) # Check if a new transformer block has started - print( - new_transformer_block_id, - current_transformer_block, - new_transformer_block_id != current_transformer_block, - ) if new_transformer_block_id != current_transformer_block: # End the previous block and start a new one current_transformer_block = new_transformer_block_id @@ -132,7 +127,6 @@ def _split_nodes( output_from_shard.setdefault(idx, dict())[arg.name] = None node_name_to_shard_id[node.name] = shard_id - print("node: ", node.name, shard_id) return node_name_to_shard_id, output_from_shard def split(self, input_names: List[str]) -> List[nn.Module]: diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index ae84304..0bdd0f5 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -62,14 +62,19 @@ def run_model_partitioner( len(partitioned_model) == pipeline_parallel_size ), f"Received model with {len(partitioned_model)} instead of {pipeline_parallel_size}" - print("start") - for p in partitioned_model: + print("Start printing partitioned model") + for i, shard in enumerate(partitioned_model): + shard_param_count = 0 print("==================") - for k, v in p.named_children(): - print(f"Layer type: {k}") - print(v) + print(f"Shard {i + 1}") + for _, module in shard.named_children(): + # Sum the parameters of each module in the shard + shard_param_count += sum(p.numel() for p in module.parameters()) + print(f"Layer type: {type(module).__name__}") + print(module) + print(f"Total parameters in Shard {i + 1}: {shard_param_count}") print("==================") - print("end") + print("End printing partitioned model") inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") @@ -91,9 +96,9 @@ def run_model_partitioner( @pytest.mark.parametrize( "model_retrieval_func", [ - get_gpt2_and_tokenizer, + # get_gpt2_and_tokenizer, get_bloom_and_tokenizer_with_6_layers, - get_bloom_560m_and_tokenizer, 
+ # get_bloom_560m_and_tokenizer, ], ) def test_naive_partitioning(pipeline_parallel_size, model_retrieval_func): From 6ce3419ddab8493012566b29994321ea0c9474f6 Mon Sep 17 00:00:00 2001 From: xrsrke Date: Sun, 26 Nov 2023 09:49:10 +0700 Subject: [PATCH 21/24] =?UTF-8?q?[Feature]=20support=20the=20forward=20pas?= =?UTF-8?q?s=20of=20automatic=20pipeline=20parallelism=20for=20?= =?UTF-8?q?=F0=9F=A4=97=20transformers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../nn/pipeline_parallel/_job/backward.py | 11 ++- .../nn/pipeline_parallel/_job/creator.py | 17 ++-- .../nn/pipeline_parallel/_job/forward.py | 12 ++- pipegoose/nn/pipeline_parallel/microbatch.py | 11 ++- .../nn/pipeline_parallel/pipeline_engine.py | 8 +- .../nn/pipeline_parallel/pipeline_parallel.py | 12 +-- pipegoose/nn/pipeline_parallel/queue.py | 8 +- pipegoose/testing/utils.py | 1 + tests/nn/pipeline_parallel/test_microbatch.py | 22 ++--- .../test_pipeline_parallel.py | 96 ++++++++++--------- tests/optim/zero/test_sharding.py | 3 - 11 files changed, 115 insertions(+), 86 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/_job/backward.py b/pipegoose/nn/pipeline_parallel/_job/backward.py index ea05c0a..88d27ca 100644 --- a/pipegoose/nn/pipeline_parallel/_job/backward.py +++ b/pipegoose/nn/pipeline_parallel/_job/backward.py @@ -21,7 +21,14 @@ class _SaveGradLossFunction(torch.autograd.Function): def forward(ctx: Any, key, metadata, tensor: torch.Tensor): ctx.key = key ctx.package_metadata = metadata - new_tensor = tensor.detach().clone() + # NOTE: a hacky way to work around with `transformers` + if isinstance(tensor, torch.Tensor): + new_tensor = tensor.detach().clone() + elif isinstance(tensor, tuple): + new_tensor = tuple(t.detach().clone() for t in tensor) + else: + raise ValueError(f"tensor must be an instance of torch.Tensor or tuple, got {type(tensor)}") + return new_tensor @staticmethod @@ -45,8 +52,6 @@ def backward(ctx: Any, grad_output: torch.Tensor): def save_grad_loss(package: Package) -> Package: key = (package.metadata.microbatch_idx, package.metadata.partition_idx) package.data = _SaveGradLossFunction.apply(key, package.metadata, package.data) - if package.metadata.partition_idx == 3: - assert 1 == 1 return package diff --git a/pipegoose/nn/pipeline_parallel/_job/creator.py b/pipegoose/nn/pipeline_parallel/_job/creator.py index 85ec14b..3554f09 100644 --- a/pipegoose/nn/pipeline_parallel/_job/creator.py +++ b/pipegoose/nn/pipeline_parallel/_job/creator.py @@ -40,17 +40,14 @@ def __init__(self, pipeline_context: PipelineContext): self.pipeline_context = pipeline_context def after_compute(self): - from pipegoose.nn.pipeline_parallel.queue import ( - get_input_activations, - get_output_activations, - ) + pass package = self.job.output microbatch_idx = self.job.input.metadata.microbatch_idx partition_idx = self.job.input.metadata.partition_idx - assert isinstance(get_input_activations(microbatch_idx, partition_idx), torch.Tensor) - assert isinstance(get_output_activations(microbatch_idx, partition_idx), torch.Tensor) + # assert isinstance(get_input_activations(microbatch_idx, partition_idx), torch.Tensor) + # assert isinstance(get_output_activations(microbatch_idx, partition_idx), torch.Tensor) if package.metadata.microbatch_idx == self.pipeline_context.num_microbatches - 1: new_package = schedule_backward_execution(package, self.pipeline_context) @@ -187,7 +184,13 @@ class Function(torch.autograd.Function): @staticmethod def forward(ctx, metadata: Metadata, input: 
torch.Tensor) -> torch.Tensor: ctx.package_meta = metadata - new_input = input.detach().clone() + # NOTE: a hacky way to make it works with `transformers` + if type(input) in (list, tuple): + # NOTE: ignore attention mask, which is a bool tensor + new_input = [x.detach().clone() for x in input] + else: + new_input = input.detach().clone() + return new_input @staticmethod diff --git a/pipegoose/nn/pipeline_parallel/_job/forward.py b/pipegoose/nn/pipeline_parallel/_job/forward.py index a511347..241e45f 100644 --- a/pipegoose/nn/pipeline_parallel/_job/forward.py +++ b/pipegoose/nn/pipeline_parallel/_job/forward.py @@ -14,8 +14,18 @@ class ForwardJob(Job): def run_compute(self) -> torch.Tensor: is_training = self.input.metadata.training.is_training + with torch.set_grad_enabled(is_training): - output = self.function(self.input.data) + # TODO: a hacky way to work around with `transformers` + if isinstance(self.input.data, torch.Tensor): + output = self.function(self.input.data) + elif type(self.input.data) in (list, tuple): + output = self.function(*self.input.data) + elif "input_ids" in self.input.data: + output = self.function(self.input.data["input_ids"]) + else: + output = self.function(self.input.data) + return output diff --git a/pipegoose/nn/pipeline_parallel/microbatch.py b/pipegoose/nn/pipeline_parallel/microbatch.py index 644da1d..025910f 100644 --- a/pipegoose/nn/pipeline_parallel/microbatch.py +++ b/pipegoose/nn/pipeline_parallel/microbatch.py @@ -10,9 +10,14 @@ class ModelInputs(TypedDict): def split(inputs: ModelInputs, n_microbatches: int) -> List[ModelInputs]: assert n_microbatches > 0, f"n_microbatches must be greater than 0, got {n_microbatches}" - - input_ids_microbatches = torch.split(inputs["input_ids"], 2) - attention_mask_microbatches = torch.split(inputs["attention_mask"], 2) + assert "input_ids" in inputs, f"inputs must have 'input_ids' key, got {inputs.keys()}" + assert "attention_mask" in inputs, f"inputs must have 'attention_mask' key, got {inputs.keys()}" + assert ( + inputs["input_ids"].size(0) % n_microbatches == 0 + ), f"The batch size must be divisible by n_microbatches, got {inputs['input_ids'].size(0)} and {n_microbatches}" + + input_ids_microbatches = torch.split(inputs["input_ids"], n_microbatches) + attention_mask_microbatches = torch.split(inputs["attention_mask"], n_microbatches) microbatches = [] for input_ids, attention_mask in zip(input_ids_microbatches, attention_mask_microbatches): diff --git a/pipegoose/nn/pipeline_parallel/pipeline_engine.py b/pipegoose/nn/pipeline_parallel/pipeline_engine.py index 04273ab..bd1ec56 100644 --- a/pipegoose/nn/pipeline_parallel/pipeline_engine.py +++ b/pipegoose/nn/pipeline_parallel/pipeline_engine.py @@ -7,6 +7,7 @@ from pipegoose.distributed.parallel_context import ParallelContext from pipegoose.distributed.parallel_mode import ParallelMode +from pipegoose.nn.pipeline_parallel import microbatch from pipegoose.nn.pipeline_parallel._comm import RECV_QUEUE from pipegoose.nn.pipeline_parallel._job.creator import create_job from pipegoose.nn.pipeline_parallel._job.job_type import JobType @@ -56,13 +57,14 @@ def __init__( self.parallel_context = parallel_context self.pipeline_context = PipelineContext(scheduler, parallel_context) - def run(self, inputs: torch.Tensor) -> torch.Tensor: + def run(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor) -> torch.Tensor: self.worker_manager.spawn() self.pipeline_context.forward() n_microbatches = self.scheduler.n_microbatches - # microbatches = 
microbatch.split(inputs, n_microbatches=n_microbatches) - microbatches = torch.chunk(inputs, chunks=n_microbatches, dim=0) + inputs = {"input_ids": input_ids, "attention_mask": attention_mask} + microbatches = microbatch.split(inputs, n_microbatches=n_microbatches) + # microbatches = torch.chunk(inputs, chunks=n_microbatches, dim=0) # NOTE: add a callback to the progress tracker # that if the clock_idx is increased, then diff --git a/pipegoose/nn/pipeline_parallel/pipeline_parallel.py b/pipegoose/nn/pipeline_parallel/pipeline_parallel.py index 4f6ef20..14e8a11 100644 --- a/pipegoose/nn/pipeline_parallel/pipeline_parallel.py +++ b/pipegoose/nn/pipeline_parallel/pipeline_parallel.py @@ -1,5 +1,3 @@ -from typing import List - import torch from torch import nn @@ -7,6 +5,7 @@ from pipegoose.nn.parallel import Parallel from pipegoose.nn.pipeline_parallel._utils import get_partition_idx from pipegoose.nn.pipeline_parallel._worker import WorkerManager +from pipegoose.nn.pipeline_parallel.partitioner import UniformPartitioner from pipegoose.nn.pipeline_parallel.pipeline_engine import PipelineEngine from pipegoose.nn.pipeline_parallel.scheduler import GPipeScheduler @@ -16,11 +15,11 @@ class PipelineParallel(Parallel): def __init__( self, - modules: List[nn.Module], + module: nn.Module, num_microbatches: int, parallel_context: ParallelContext, ): - self.modules = modules + self.module = module self.num_microbatches = num_microbatches self.parallel_context = parallel_context @@ -28,7 +27,8 @@ def __init__( def parallelize(self) -> nn.Module: if self.parallel_context.pipeline_parallel_size > 1: partition_idx = get_partition_idx(self.parallel_context) - module = self.modules[partition_idx] + partitions = UniformPartitioner(self.module, self.parallel_context).split(["input_ids"]) + module = partitions[partition_idx] n_partitions = self.parallel_context.pipeline_parallel_size scheduler = GPipeScheduler(self.num_microbatches, n_partitions) @@ -47,4 +47,4 @@ def parallelize(self) -> nn.Module: return module else: - return self.modules + return self.module diff --git a/pipegoose/nn/pipeline_parallel/queue.py b/pipegoose/nn/pipeline_parallel/queue.py index 39e18b5..3ad42d5 100644 --- a/pipegoose/nn/pipeline_parallel/queue.py +++ b/pipegoose/nn/pipeline_parallel/queue.py @@ -73,7 +73,13 @@ def get_saved_activations(key: ActivationKey) -> torch.Tensor: """Get the saved activations for a given key for backward job.""" # NOTE: because a partition can have multiple microbatches, input = _INPUT_ACTIVATIONS[key] - return input.requires_grad_(True) + + # return input.requires_grad_(True) + # TODO: add support regular non-transformers model + if isinstance(input, torch.Tensor): + return input.requires_grad_(True) + else: + return input def save_activations(key: ActivationKey, data: torch.Tensor): """Save forward job's activations for backward job.""" diff --git a/pipegoose/testing/utils.py b/pipegoose/testing/utils.py index 57db4e5..626aa71 100644 --- a/pipegoose/testing/utils.py +++ b/pipegoose/testing/utils.py @@ -37,6 +37,7 @@ def spawn(func: Callable, world_size: int = 1, **kwargs): kwargs.pop("port") wrapped_func = partial(func, world_size=world_size, port=port, **kwargs) + mp.spawn(wrapped_func, nprocs=world_size) diff --git a/tests/nn/pipeline_parallel/test_microbatch.py b/tests/nn/pipeline_parallel/test_microbatch.py index 90cc1ff..cb3e119 100644 --- a/tests/nn/pipeline_parallel/test_microbatch.py +++ b/tests/nn/pipeline_parallel/test_microbatch.py @@ -6,31 +6,21 @@ def 
test_split_a_mini_batch_to_microbatches(): + BATCH_SIZE = 36 + N_MICROBATCHES = 6 + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) tokenizer.pad_token = tokenizer.eos_token - batch_sentences = [ - "This is the first sentence.", - "Here's the second one.", - "This makes three.", - "Is this the fourth sentence?", - "Five sentences now.", - "This is the sixth sentence.", - "Sentence seven is here.", - "We're up to eight now.", - "This should be the ninth sentence.", - "And finally, the tenth sentence.", - ] - BATCH_SIZE = len(batch_sentences) - N_MICROBATCHES = 5 + text = "Persistence is all you need." + batch_sentences = [text for _ in range(BATCH_SIZE)] inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") microbatches = microbatch.split(inputs, n_microbatches=N_MICROBATCHES) assert isinstance(microbatches, list) assert len(microbatches) == N_MICROBATCHES - assert "input_ids" in microbatches[0] - assert "attention_mask" in microbatches[0] + assert all(set(batch.keys()) == set(inputs.keys()) for batch in microbatches) is True total_sentences = sum(microbatch["input_ids"].size(0) for microbatch in microbatches) assert total_sentences == BATCH_SIZE diff --git a/tests/nn/pipeline_parallel/test_pipeline_parallel.py b/tests/nn/pipeline_parallel/test_pipeline_parallel.py index f08b354..9b21820 100644 --- a/tests/nn/pipeline_parallel/test_pipeline_parallel.py +++ b/tests/nn/pipeline_parallel/test_pipeline_parallel.py @@ -1,11 +1,8 @@ -from copy import deepcopy -from functools import reduce - -import torch +import pytest from torch import nn -from torch.optim import SGD +from transformers import AutoTokenizer, BloomConfig, BloomForCausalLM -from pipegoose.nn.pipeline_parallel._utils import get_partition_idx, is_last_stage +from pipegoose.nn.pipeline_parallel._utils import get_partition_idx from pipegoose.nn.pipeline_parallel.pipeline_parallel import PipelineParallel from pipegoose.testing.utils import init_parallel_context, spawn @@ -26,11 +23,11 @@ def run_pipeline_parallel( rank, world_size, port, tensor_parallel_size, pipeline_parallel_size, data_parallel_size, num_microbatches, kwargs ): MODEL = kwargs["model"] - UPDATED_MODEL = kwargs["updated_model"] + # UPDATED_MODEL = kwargs["updated_model"] INPUTS = kwargs["inputs"] - REF_OUTPUTS = kwargs["ref_outputs"] - REF_GRADS = kwargs["ref_grads"] - LR = kwargs["lr"] + # REF_OUTPUTS = kwargs["ref_outputs"] + # REF_GRADS = kwargs["ref_grads"] + kwargs["lr"] forward_timeline = [] backward_timeline = [] @@ -54,7 +51,8 @@ def forward(self, input): return self.module(input) # NOTE: just for recording the forward and backward timeline - model = nn.ModuleList([TimelineRegister(partition_idx, module) for partition_idx, module in enumerate(MODEL)]) + # model = nn.ModuleList([TimelineRegister(partition_idx, module) for partition_idx, module in enumerate(MODEL)]) + model = MODEL parallel_context = init_parallel_context( rank, world_size, port, tensor_parallel_size, pipeline_parallel_size, data_parallel_size @@ -63,67 +61,79 @@ def forward(self, input): EXPECTED_FORWARD_TIMELINE, EXPECTED_BACKWARD_TIMELINE = generate_expected_timeline(num_microbatches, partition_idx) parallelized_model = PipelineParallel(model, num_microbatches, parallel_context).parallelize() - optim = SGD(parallelized_model.parameters(), LR) + # optim = SGD(parallelized_model.parameters(), LR) assert isinstance(parallelized_model, nn.Module) - assert count_parameters(parallelized_model) < count_parameters(model) - assert count_parameters(parallelized_model) == 
count_parameters(model[partition_idx]) + # assert count_parameters(parallelized_model) < count_parameters(model) + # assert count_parameters(parallelized_model) == count_parameters(model[partition_idx]) - outputs = parallelized_model(INPUTS) + parallelized_model(**INPUTS) assert forward_timeline == EXPECTED_FORWARD_TIMELINE - if is_last_stage(parallel_context): - assert torch.allclose(torch.cat(outputs, dim=0), REF_OUTPUTS) + # if is_last_stage(parallel_context): + # assert torch.allclose(torch.cat(outputs, dim=0), REF_OUTPUTS) - optim.zero_grad() - for output in outputs: - output.sum().backward(retain_graph=True) + # optim.zero_grad() + # for output in outputs: + # output.sum().backward(retain_graph=True) - optim.step() + # optim.step() - assert backward_timeline == EXPECTED_BACKWARD_TIMELINE - for p, ref_grad in zip(parallelized_model.parameters(), REF_GRADS[partition_idx]): - assert torch.allclose(p.grad, ref_grad) + # assert backward_timeline == EXPECTED_BACKWARD_TIMELINE + # for p, ref_grad in zip(parallelized_model.parameters(), REF_GRADS[partition_idx]): + # assert torch.allclose(p.grad, ref_grad) - for p, ref_p in zip(parallelized_model.parameters(), UPDATED_MODEL[partition_idx].parameters()): - assert torch.allclose(p, ref_p) + # for p, ref_p in zip(parallelized_model.parameters(), UPDATED_MODEL[partition_idx].parameters()): + # assert torch.allclose(p, ref_p) -def test_pipeline_parallel(): - TENSOR_PARALLEL_SIZE, PIPELINE_PARALLEL_SIZE, DATA_PARALLEL_SIZE = 1, 4, 1 - WORLD_SIZE = TENSOR_PARALLEL_SIZE * PIPELINE_PARALLEL_SIZE * DATA_PARALLEL_SIZE +@pytest.mark.parametrize("pipeline_parallel_size", [4]) +def test_pipeline_parallel(pipeline_parallel_size): + TENSOR_PARALLEL_SIZE = 1 + DATA_PARALLEL_SIZE = 1 + WORLD_SIZE = TENSOR_PARALLEL_SIZE * pipeline_parallel_size * DATA_PARALLEL_SIZE BATCH_SIZE, NUM_MICROBATCHES = 32, 6 SEQ_LEN, HIDDEN_DIM = 10, 5 LR = 0.1 - inputs = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_DIM, requires_grad=False) - model = nn.ModuleList([nn.Sequential(nn.Linear(HIDDEN_DIM, HIDDEN_DIM), nn.ReLU()) for _ in range(PIPELINE_PARALLEL_SIZE)]) - ORIG_MODEL = deepcopy(model) - optim = SGD(model.parameters(), lr=LR) + # inputs = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_DIM, requires_grad=False) + # model = nn.ModuleList([nn.Sequential(nn.Linear(HIDDEN_DIM, HIDDEN_DIM), nn.ReLU()) for _ in range(pipeline_parallel_size)]) + text = "Persistence is all you need." 
+ texts = [text for _ in range(BATCH_SIZE)] + # model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m") + model = BloomForCausalLM(BloomConfig(n_layer=6)) + tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m") + # ORIG_MODEL = deepcopy(model) + + inputs = tokenizer(texts, return_tensors="pt") + # optim = SGD(model.parameters(), lr=LR) - outputs = reduce(lambda inputs, layer: layer(inputs), model, inputs) + # outputs = reduce(lambda inputs, layer: layer(inputs), model, inputs) + outputs = model(**inputs, labels=inputs["input_ids"]) - optim.zero_grad() - outputs.sum().backward() - optim.step() + # optim.zero_grad() + # outputs.loss.sum().backward() + # optim.step() - grads = [[p.grad for p in layer.parameters()] for layer in model] + # grads = [[p.grad for p in layer.parameters()] for layer in model] kwargs = { "lr": LR, - "model": ORIG_MODEL, + "model": model, + "tokenizer": tokenizer, "updated_model": model, - "inputs": inputs.detach(), - "ref_outputs": outputs.detach(), - "ref_grads": grads, + "inputs": inputs, + "ref_logits": outputs.logits.detach(), + "ref_loss": outputs.loss.detach(), + # "ref_grads": grads, } spawn( run_pipeline_parallel, world_size=WORLD_SIZE, tensor_parallel_size=TENSOR_PARALLEL_SIZE, - pipeline_parallel_size=PIPELINE_PARALLEL_SIZE, + pipeline_parallel_size=pipeline_parallel_size, data_parallel_size=DATA_PARALLEL_SIZE, num_microbatches=NUM_MICROBATCHES, kwargs=kwargs, diff --git a/tests/optim/zero/test_sharding.py b/tests/optim/zero/test_sharding.py index 1c61151..855303f 100644 --- a/tests/optim/zero/test_sharding.py +++ b/tests/optim/zero/test_sharding.py @@ -41,9 +41,6 @@ def calculate_total_sharded_elements(sharded_params): assert len(sharded_params) == world_size for rank, shard in enumerate(sharded_params): - if rank == 4: - assert 1 == 1 - assert isinstance(shard, list) for param_group in shard: assert len(param_group["params"]) > 0 From 46339baff2c86d7553db94cfb010d8ae2a97861e Mon Sep 17 00:00:00 2001 From: Ayman Date: Mon, 27 Nov 2023 00:37:39 +0100 Subject: [PATCH 22/24] generalize block name finding Since the name of a transformer block (start and end nodes) can follow the pattern model_layers_X (for mistral) or transformer_h_X we must generalize it. 
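For illustration only (this snippet is not part of the diff), the lookup is intended to behave roughly as sketched below; the helper name and the example node-name strings are made up for the sketch, while the real logic lives in _find_transformer_block_prefix in this patch:

    import re
    from torch import nn

    def find_transformer_block_prefix(model: nn.Module) -> str:
        # e.g. "transformer" for GPT-2/BLOOM, "model" for Mistral-style models
        base_name = model.base_model_prefix
        base = getattr(model, base_name)
        # assume the transformer blocks live in the first nn.ModuleList child
        block_list_name = next(
            name for name, child in base.named_children() if isinstance(child, nn.ModuleList)
        )
        return rf"{base_name}_{block_list_name}_(\d+)"

    # traced node names then match either family of prefixes:
    assert re.match(r"transformer_h_(\d+)", "transformer_h_0_input_layernorm")
    assert re.match(r"model_layers_(\d+)", "model_layers_0_self_attn_q_proj")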
Co-Authored-By: Daniel Grittner <29932077+danielgrittner@users.noreply.github.com> --- pipegoose/nn/pipeline_parallel/partitioner.py | 24 +++++++++++++++++-- .../nn/pipeline_parallel/test_partitioner.py | 8 ++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 085aff5..13fd8c0 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -30,9 +30,29 @@ def __init__(self, model: nn.Module, parallel_context: ParallelContext): self.model = model self.parallel_context = parallel_context + def _find_transformer_block_prefix(self): + # Since there is no standarized way to find the name of the transformer block, we have to find the name first + transformer_attr_name = self.model.base_model_prefix + + transformer_block_name = None + transformer = getattr(self.model, transformer_attr_name) + for attr_name in dir(transformer): + part = getattr(transformer, attr_name) + if isinstance(part, nn.ModuleList): + transformer_block_name = attr_name + break + + transformer_block_prefix = ( + rf"{transformer_attr_name}_{transformer_block_name}_(\d+)" + ) + + print(f"transformer_block_prefix: {transformer_block_prefix}") + return transformer_block_prefix + def _get_transformer_block_id(self, node_name: str) -> Optional[str]: - match = re.match(r"transformer_h_(\d+)", node_name) - return match.group(1) if match else None + pattern = self._find_transformer_block_prefix() + match = re.match(pattern, node_name) + return match.group() if match else None def _split_nodes( self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 0bdd0f5..081672e 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -96,15 +96,17 @@ def run_model_partitioner( @pytest.mark.parametrize( "model_retrieval_func", [ - # get_gpt2_and_tokenizer, + get_gpt2_and_tokenizer, get_bloom_and_tokenizer_with_6_layers, - # get_bloom_560m_and_tokenizer, + get_bloom_560m_and_tokenizer, ], ) def test_naive_partitioning(pipeline_parallel_size, model_retrieval_func): TENSOR_PARALLEL_SIZE = 1 DATA_PARALLEL_SIZE = 1 - + print( + f"Running test with pipeline_parallel_size={pipeline_parallel_size}, tensor_parallel_size={TENSOR_PARALLEL_SIZE}, data_parallel_size={DATA_PARALLEL_SIZE}" + ) spawn( run_model_partitioner, world_size=pipeline_parallel_size, From ed433d11b78b9c7bdd7e5dfccc786cbf8a5701de Mon Sep 17 00:00:00 2001 From: xrsrke Date: Mon, 27 Nov 2023 09:16:01 +0700 Subject: [PATCH 23/24] [Refactor] Apply pre-commit to model partitioner --- pipegoose/nn/pipeline_parallel/partitioner.py | 95 ++++++------------- .../nn/pipeline_parallel/test_partitioner.py | 27 ++---- 2 files changed, 40 insertions(+), 82 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 13fd8c0..391c0f2 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -1,16 +1,15 @@ +import re from abc import ABC, abstractclassmethod +from collections import defaultdict from enum import Enum, auto -from typing import List -from torch import nn +from typing import Dict, List, Optional + import torch -from typing import Dict -from collections import defaultdict -import re -from pipegoose.distributed.parallel_context 
import ParallelContext -from pipegoose.distributed.parallel_mode import ParallelMode +from torch import nn from transformers.utils.fx import symbolic_trace -from typing import Optional +from pipegoose.distributed.parallel_context import ParallelContext +from pipegoose.distributed.parallel_mode import ParallelMode class PartitionPolicy(Enum): @@ -31,7 +30,7 @@ def __init__(self, model: nn.Module, parallel_context: ParallelContext): self.parallel_context = parallel_context def _find_transformer_block_prefix(self): - # Since there is no standarized way to find the name of the transformer block, we have to find the name first + # Since there is no standardized way to find the name of the transformer block, we have to find the name first transformer_attr_name = self.model.base_model_prefix transformer_block_name = None @@ -42,11 +41,8 @@ def _find_transformer_block_prefix(self): transformer_block_name = attr_name break - transformer_block_prefix = ( - rf"{transformer_attr_name}_{transformer_block_name}_(\d+)" - ) + transformer_block_prefix = rf"{transformer_attr_name}_{transformer_block_name}_(\d+)" - print(f"transformer_block_prefix: {transformer_block_prefix}") return transformer_block_prefix def _get_transformer_block_id(self, node_name: str) -> Optional[str]: @@ -54,9 +50,7 @@ def _get_transformer_block_id(self, node_name: str) -> Optional[str]: match = re.match(pattern, node_name) return match.group() if match else None - def _split_nodes( - self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3 - ) -> Dict: + def _split_nodes(self, traced_graph_module: torch.fx.GraphModule, shard_count: int = 3) -> Dict: """Utility used to trace a graph and identify shard cutpoints.""" param_count: Dict[str, int] = {} @@ -64,7 +58,7 @@ def _split_nodes( # Find the total number of params in the model and # the number of params per shard we are aiming for. - # Note: we need to iterate over named_parameters AND named_modules because + # NOTE: we need to iterate over named_parameters AND named_modules because # sometimes the parameters of a module is split into weight and bia in the # traced graph and sometimes not. # Example: @@ -83,19 +77,18 @@ def _split_nodes( # exclude embedding layers if isinstance(module, nn.Embedding): name = name.replace(".", "_") - exclude_param_count += sum([x.numel() for x in module.parameters()]) + exclude_param_count += sum(x.numel() for x in module.parameters()) param_count[name] = 0 continue name = name.replace(".", "_") - param_count[name] = sum([x.numel() for x in module.parameters()]) + param_count[name] = sum(x.numel() for x in module.parameters()) # Note that we use param_count[""] as total parameters which does not include the # lm_head. We want to exclude the lm_head and the embeddings here because they # usually cause high skew which does not lead to equal partitioning of the # transformer blocks. 
per_shard_param = (param_count[""] - exclude_param_count) // shard_count - original_per_shard_param = per_shard_param node_name_to_shard_id: Dict[str, int] = {} shard_id = 0 shard_id_to_param_count = [0 for _ in range(shard_count)] @@ -123,8 +116,7 @@ def _split_nodes( if ( shard_id_to_param_count[shard_id] > 0 - and (shard_id_to_param_count[shard_id] + current_param_count) - >= per_shard_param + and (shard_id_to_param_count[shard_id] + current_param_count) >= per_shard_param and (shard_id + 1) < shard_count and transformer_block_ended ): @@ -158,9 +150,7 @@ def split(self, input_names: List[str]) -> List[nn.Module]: symbolic_traced_module = symbolic_trace(model, input_names=input_names) - node_name_to_shard_id, output_from_shard = self._split_nodes( - symbolic_traced_module, n_partitions - ) + node_name_to_shard_id, output_from_shard = self._split_nodes(symbolic_traced_module, n_partitions) nodes_per_shard = defaultdict(dict) @@ -169,43 +159,24 @@ def split(self, input_names: List[str]) -> List[nn.Module]: for node in symbolic_traced_module.graph.nodes: # If the current node is in the next shard, we insert an output node. # A new graph is created and a placeholder is added for the next shard. - if ( - node.name in node_name_to_shard_id - and prev_shard_id < node_name_to_shard_id[node.name] - ): + if node.name in node_name_to_shard_id and prev_shard_id < node_name_to_shard_id[node.name]: assert prev_node, "prev_node cannot be None" # generate output node for the past graph/shard with new_graph.inserting_after(prev_node): outputs = output_from_shard[prev_shard_id] - graph_output_names = [prev_node.name] + [ - i for i in outputs if i != prev_node.name - ] - - if isinstance( - nodes_per_shard[prev_shard_id][prev_node.name], tuple - ): - graph_outputs = nodes_per_shard[prev_shard_id][ - prev_node.name - ] + tuple( - [ - nodes_per_shard[prev_shard_id][i] - for i in outputs - if i != prev_node.name - ] + graph_output_names = [prev_node.name] + [i for i in outputs if i != prev_node.name] + + if isinstance(nodes_per_shard[prev_shard_id][prev_node.name], tuple): + graph_outputs = nodes_per_shard[prev_shard_id][prev_node.name] + tuple( + nodes_per_shard[prev_shard_id][i] for i in outputs if i != prev_node.name ) else: graph_outputs = tuple( [nodes_per_shard[prev_shard_id][prev_node.name]] - + [ - nodes_per_shard[prev_shard_id][i] - for i in outputs - if i != prev_node.name - ] + + [nodes_per_shard[prev_shard_id][i] for i in outputs if i != prev_node.name] ) - new_graph.create_node( - op="output", target="output", args=(graph_outputs,) - ) + new_graph.create_node(op="output", target="output", args=(graph_outputs,)) # generate new graph/shard and its input nodes (i.e., the output from the previous graph/shard) num_graphs += 1 @@ -213,12 +184,8 @@ def split(self, input_names: List[str]) -> List[nn.Module]: new_graph = torch.fx.Graph() # generate placeholder nodes in the new graph/shard which matches the output nodes of the previous graph/shard for new_graph_input_name in graph_output_names: - graph_input_node = new_graph.create_node( - "placeholder", new_graph_input_name - ) - nodes_per_shard[node_name_to_shard_id[node.name]][ - new_graph_input_name - ] = graph_input_node + graph_input_node = new_graph.create_node("placeholder", new_graph_input_name) + nodes_per_shard[node_name_to_shard_id[node.name]][new_graph_input_name] = graph_input_node if node.op in [ "placeholder", @@ -229,9 +196,7 @@ def split(self, input_names: List[str]) -> List[nn.Module]: ]: # Copy the nodes from the existing graph to the 
new graph. current_shard_id = node_name_to_shard_id[node.name] - new_node = new_graph.node_copy( - node, lambda x: nodes_per_shard[current_shard_id][x.name] - ) + new_node = new_graph.node_copy(node, lambda x: nodes_per_shard[current_shard_id][x.name]) nodes_per_shard[current_shard_id][node.name] = new_node elif node.op == "output": # If this is the last node, we should add an output @@ -245,6 +210,10 @@ def split(self, input_names: List[str]) -> List[nn.Module]: prev_node = new_node prev_shard_id = node_name_to_shard_id[node.name] + for shard in module_list: + shard.graph.lint() + shard.recompile() + return module_list @@ -257,9 +226,7 @@ def _get_partitioner(policy: PartitionPolicy) -> BasePartitioner: return policy_to_partitioner[policy] -def get_model_partition( - module: nn.Module, policy: PartitionPolicy, parallel_context: ParallelContext -) -> nn.Module: +def get_model_partition(module: nn.Module, policy: PartitionPolicy, parallel_context: ParallelContext) -> nn.Module: """Get the corresponding partition of the current process.""" partitioner = _get_partitioner(policy) partitions = partitioner(module, parallel_context).split() diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 081672e..9d2c22d 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -1,33 +1,28 @@ import pytest import torch from transformers import ( - AutoTokenizer, AutoModelForCausalLM, + AutoTokenizer, BloomConfig, BloomForCausalLM, ) -from pipegoose.nn.pipeline_parallel.partitioner import ( # PartitionPolicy,; get_model_partition, - UniformPartitioner, -) + +from pipegoose.nn.pipeline_parallel.partitioner import UniformPartitioner from pipegoose.testing.utils import init_parallel_context, spawn def get_gpt2_and_tokenizer(): - return AutoModelForCausalLM.from_pretrained("gpt2"), AutoTokenizer.from_pretrained( - "gpt2" - ) + return AutoModelForCausalLM.from_pretrained("gpt2"), AutoTokenizer.from_pretrained("gpt2") def get_bloom_560m_and_tokenizer(): - return AutoModelForCausalLM.from_pretrained( + return AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m"), AutoTokenizer.from_pretrained( "bigscience/bloom-560m" - ), AutoTokenizer.from_pretrained("bigscience/bloom-560m") + ) def get_bloom_and_tokenizer_with_6_layers(): - return BloomForCausalLM(BloomConfig(n_layer=6)), AutoTokenizer.from_pretrained( - "bigscience/bloom-560m" - ) + return BloomForCausalLM(BloomConfig(n_layer=6)), AutoTokenizer.from_pretrained("bigscience/bloom-560m") # TODO: Also add a function for a generic nn.Transformer model @@ -81,13 +76,9 @@ def run_model_partitioner( partitioned_model_result = inputs["input_ids"] for partition_id in range(pipeline_parallel_size): if type(partitioned_model_result) in (list, tuple): - partitioned_model_result = partitioned_model[partition_id]( - *partitioned_model_result - ) + partitioned_model_result = partitioned_model[partition_id](*partitioned_model_result) else: - partitioned_model_result = partitioned_model[partition_id]( - partitioned_model_result - ) + partitioned_model_result = partitioned_model[partition_id](partitioned_model_result) assert torch.allclose(gt_logits, partitioned_model_result), "Results are not close" From 369263a77b9f6ae5f9c932d6f75a1c13dad180c7 Mon Sep 17 00:00:00 2001 From: xrsrke Date: Mon, 27 Nov 2023 09:30:52 +0700 Subject: [PATCH 24/24] [Refactor] Remove sample input in model partitioning for --- pipegoose/nn/pipeline_parallel/partitioner.py | 4 +++- 
tests/nn/pipeline_parallel/test_partitioner.py | 10 ++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pipegoose/nn/pipeline_parallel/partitioner.py b/pipegoose/nn/pipeline_parallel/partitioner.py index 391c0f2..cc10abf 100644 --- a/pipegoose/nn/pipeline_parallel/partitioner.py +++ b/pipegoose/nn/pipeline_parallel/partitioner.py @@ -11,6 +11,8 @@ from pipegoose.distributed.parallel_context import ParallelContext from pipegoose.distributed.parallel_mode import ParallelMode +INPUT_NAMES = ["input_ids", "attention_mask"] + class PartitionPolicy(Enum): UNIFORM = auto() @@ -141,7 +143,7 @@ def _split_nodes(self, traced_graph_module: torch.fx.GraphModule, shard_count: i node_name_to_shard_id[node.name] = shard_id return node_name_to_shard_id, output_from_shard - def split(self, input_names: List[str]) -> List[nn.Module]: + def split(self, input_names: List[str] = INPUT_NAMES) -> List[nn.Module]: n_partitions = self.parallel_context.pipeline_parallel_size model = self.model module_list: List[torch.fx.GraphModule] = [] diff --git a/tests/nn/pipeline_parallel/test_partitioner.py b/tests/nn/pipeline_parallel/test_partitioner.py index 9d2c22d..fafc38c 100644 --- a/tests/nn/pipeline_parallel/test_partitioner.py +++ b/tests/nn/pipeline_parallel/test_partitioner.py @@ -50,9 +50,9 @@ def run_model_partitioner( model.eval() tokenizer.pad_token = tokenizer.eos_token inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") - gt_logits = model(input_ids=inputs["input_ids"]).logits + gt_logits = model(**inputs).logits - partitioned_model = UniformPartitioner(model, parallel_context).split(["input_ids"]) + partitioned_model = UniformPartitioner(model, parallel_context).split() assert ( len(partitioned_model) == pipeline_parallel_size ), f"Received model with {len(partitioned_model)} instead of {pipeline_parallel_size}" @@ -71,14 +71,12 @@ def run_model_partitioner( print("==================") print("End printing partitioned model") - inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") - - partitioned_model_result = inputs["input_ids"] + partitioned_model_result = inputs for partition_id in range(pipeline_parallel_size): if type(partitioned_model_result) in (list, tuple): partitioned_model_result = partitioned_model[partition_id](*partitioned_model_result) else: - partitioned_model_result = partitioned_model[partition_id](partitioned_model_result) + partitioned_model_result = partitioned_model[partition_id](**partitioned_model_result) assert torch.allclose(gt_logits, partitioned_model_result), "Results are not close"
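For reference, a minimal usage sketch (not part of the patch series) of how the shards produced by UniformPartitioner after this change can be chained by hand, mirroring the updated test; it assumes an already-initialized ParallelContext is available as parallel_context, which is elided here:

    import torch
    from transformers import AutoTokenizer, BloomConfig, BloomForCausalLM

    from pipegoose.nn.pipeline_parallel.partitioner import UniformPartitioner

    model = BloomForCausalLM(BloomConfig(n_layer=6)).eval()
    tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
    inputs = tokenizer(["Persistence is all you need."], return_tensors="pt")

    # split() now defaults to ["input_ids", "attention_mask"] as trace inputs
    partitions = UniformPartitioner(model, parallel_context).split()

    with torch.no_grad():
        outputs = inputs
        for shard in partitions:
            if isinstance(outputs, (list, tuple)):
                # intermediate shards exchange tuples of activations
                outputs = shard(*outputs)
            else:
                # the first shard takes the tokenizer mapping (input_ids, attention_mask)
                outputs = shard(**outputs)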