diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 2e651c1ac..a8143cdc8 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -43,11 +43,14 @@ jobs:
     - name: Set up python
       uses: actions/setup-python@v5
       with:
-        python-version: '3.8'
+        python-version: '3.11'
         cache: 'pip'
+    # flash-attn requires torch to be installed
     - name: Install dependencies
       run: |
-        pip install -r requirements.txt
+        pip install --upgrade pip
+        pip install -e '.[dev]'
+        pip install -e '.[flash-attn]'
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
       uses: github/codeql-action/init@v3
@@ -57,11 +60,11 @@ jobs:
         # If you wish to specify custom queries, you can do so here or in a config file.
         # By default, queries listed here will override any specified in a config file.
         # Prefix the list here with "+" to use these queries and those in the config file.
-        
+
         # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
         # queries: security-extended,security-and-quality
 
-        
+
     # Autobuild attempts to build any compiled languages  (C/C++, C#, Go, or Java).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
@@ -70,7 +73,7 @@ jobs:
     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
 
-    #   If the Autobuild fails above, remove it and uncomment the following three lines. 
+    #   If the Autobuild fails above, remove it and uncomment the following three lines.
     #   modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
 
     # - run: |
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c02252c68..57fad93ec 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -26,9 +26,12 @@ jobs:
         with:
           python-version: '3.11'
           cache: 'pip'
+      # flash-attn requires torch to be installed
       - name: Install dependencies
         run: |
+          pip install --upgrade pip
           pip install -e '.[dev]'
+          pip install -e '.[flash-attn]'
       # - name: Lint with flake8
       #   run: |
       #     # stop the build if there are Python syntax errors or undefined names
diff --git a/mttl/config.py b/mttl/config.py
index a4a88b488..9a0347d7c 100644
--- a/mttl/config.py
+++ b/mttl/config.py
@@ -55,7 +55,16 @@ def __init__(self, filenames=None, kwargs=None, raise_error=True, silent=False):
         self.post_init(silent=silent)
 
     def post_init(self, silent=False):
-        pass
+        if self.attn_implementation == "eager" and self.pack_sequences:
+            logger.warning(
+                "Eager attention is not compatible with packed sequences"
+                + ", tokens across examples will not be masked"
+            )
+        elif self.attn_implementation == "flash_attention_2" and self.pack_sequences:
+            logger.warning(
+                "The wrapper we provide for flash attention 2 may not behave as expected for"
+                + " some models. Please make sure you test the model with packed sequences"
+            )
 
     @classmethod
     def fromdict(cls, data):
@@ -181,6 +190,14 @@ def _set_defaults(self):
         # Data config
         self.dataset = None
         self.custom_tasks_splits = None
+        self.subsample_train = None
+        self.subsample_dev = None
+        self.subsample_test = None
+        self.subsample_per_task = False
+        self.pack_sequences = False
+        self.pad_to_multiple_of = 8
+        self.padding_side = "right"
+        self.max_seq_per_pack = 4
 
         self.data_dir = os.getenv("TRAIN_DIR", "/tmp/")
         self.output_dir = os.getenv("OUTPUT_DIR", "./output")
@@ -253,11 +270,6 @@ def _set_defaults(self):
         self.seed = 42
         self.eval_before_training = True
 
-        self.subsample_train = None
-        self.subsample_dev = None
-        self.subsample_test = None
-        self.subsample_per_task = False
-
         self.ni_online_eval = False  # zero-shot online eval for ni
         self.t0_online_eval = False  # zero-shot eval for t0
         self.early_stop_on_zero_shot = False  # zero-shot early stopping
@@ -281,6 +293,7 @@ def _set_defaults(self):
 
         self.model = None
         self.model_family = None  # model family, either "gpt" or "encdec"
+        self.attn_implementation = None
 
         self.precision = "32"
         self.monitor_grad_alignment_on = None
diff --git a/mttl/datamodule/base.py b/mttl/datamodule/base.py
index 8cbb8ee3d..b4f321622 100644
--- a/mttl/datamodule/base.py
+++ b/mttl/datamodule/base.py
@@ -1,11 +1,14 @@
 import itertools
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import Any, Dict, Optional, Union
 
 import torch
+import torch.nn.functional as F
 from datasets import Dataset as ArrowDataset
 from datasets import concatenate_datasets
 from pytorch_lightning import LightningDataModule
+from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from transformers import AutoTokenizer
 from transformers.tokenization_utils_base import PaddingStrategy
@@ -35,6 +38,9 @@ class DatasetConfig:
     subsample_test: int = None
     subsample_per_task: bool = False  # Changing default to False
     subsample: int = -1
+    pack_sequences: bool = False  # True
+    pad_to_multiple_of: int = 8
+    max_seq_per_pack: int = 4
 
 
 @dataclass
@@ -261,7 +267,90 @@ def prepare_inputs_for_gpt_family(self, sources, labels):
         output_batch["labels"] = targets
         return output_batch
 
+    def _get_nested_type(self, item):
+        while isinstance(item, (list, tuple)):
+            item = item[0]
+        return type(item)
+
+    def _tensor_dtype(self, item):
+        dtype = self._get_nested_type(item)
+        return {"int": torch.int64, "float": torch.float32, "bool": torch.bool}.get(
+            dtype.__name__, None
+        )
+
     def __call__(self, batch: Dict):
+        # is our input already tokenized ?
+        # trim according to the attention mask and return
+
+        def pad_sequence_wrapper(tensor_list, batch_first, padding_value, side="right"):
+            """Padding Sequence Fn that supports left padding"""
+            if side == "left":
+                tensor_list = [x.flip(0) for x in tensor_list]
+
+            padded = pad_sequence(
+                tensor_list, batch_first=batch_first, padding_value=padding_value
+            )
+
+            if side == "left":
+                padded = padded.flip(1)
+
+            return padded
+
+        if "input_ids" in batch[0]:
+            output_batch = defaultdict(list)
+            for batch_item in batch:
+                for key, value in batch_item.items():
+                    dtype = self._tensor_dtype(value)
+
+                    if dtype:
+                        output_batch[key].append(torch.Tensor(value).to(dtype))
+                    else:
+                        output_batch[key].append(value)
+
+            # create proper containers
+            for key, value in output_batch.items():
+                if isinstance(value[0], torch.Tensor):
+                    pad_token = {
+                        "input_ids": self.tokenizer.pad_token_id,
+                        "labels": self.label_pad_token_id,
+                    }.get(key, 0)
+                    value = pad_sequence_wrapper(
+                        value,
+                        batch_first=True,
+                        padding_value=pad_token,
+                        side=self.tokenizer.padding_side,
+                    )
+                    output_batch[key] = value
+
+            packed_seq_lens = output_batch["seq_lens"].flatten().cumsum(0)
+            output_batch["packed_seq_lens"] = F.pad(packed_seq_lens, (1, 0)).to(
+                torch.int32
+            )
+
+            # build the appropriate "block"-lower triangular mask for sdpa attention
+            bs, seq_len = output_batch["input_ids"].shape
+            packed_attn_mask = torch.zeros(bs, 1, seq_len, seq_len, dtype=torch.bool)
+            for i in range(bs):
+                start_idx = 0
+                for seq_len in output_batch["seq_lens"][i]:
+                    packed_attn_mask[
+                        i,
+                        :,
+                        start_idx : start_idx + seq_len,
+                        start_idx : start_idx + seq_len,
+                    ] = True
+                    start_idx += seq_len
+
+                # For whatever reason, we need to let padding tokens attend the previous context ¯\_(ツ)_/¯
+                # Otherwise SDPA has nans
+                packed_attn_mask[i, :, start_idx:, :start_idx] = True
+
+            packed_attn_mask = packed_attn_mask.tril()
+            output_batch["packed_attn_mask"] = packed_attn_mask
+
+            return dict(output_batch)
+
+        # Otherwise process as expected
         sources = [b["source"] for b in batch]
         labels = [b["target"] for b in batch]
         task_ids = [b.get("task_id", None) for b in batch]
@@ -432,7 +521,7 @@ def collate_fn(self):
             padding="longest",
             max_input_length=self.config.max_input_length,
             max_output_length=self.config.max_output_length,
-            pad_to_multiple_of=8,
+            pad_to_multiple_of=self.config.pad_to_multiple_of,
             return_tensors="pt",
             model_family=self.config.model_family,
             for_generation=self.for_generation,
@@ -563,20 +652,137 @@ def setup(self, stage=None):
     def setup_dataset(self):
         pass
 
+    def tokenize_dataset(self, dataset):
+
+        # NOTE: padding is hardcoded to `longest` already.
+        # return tensors is harcoded to `pt`, but tokenizer in dataset.map overwrites this
+        # TODO: write a test for this
+        pad_to_multiple = self.config.pad_to_multiple_of
+        self.config.pad_to_multiple_of = 1
+
+        # remove `rng` before mapping, as it's not pickleable
+        rng = self.rng
+        self.rng = None
+
+        def collate_fn_wrapper(batch):
+            out = self.collate_fn([batch])
+            return {k: v[0] for k, v in out.items()}
+
+        dataset = dataset.map(collate_fn_wrapper, batched=False, num_proc=20)
+        self.rng = rng
+        self.collate_fn.pad_to_multiple_of = pad_to_multiple
+
+        return dataset
+
+    def pack_sequences(self, dataset, max_sequences=4, shuffle=True):
+        """
+        Combine sequences together in larger chunks closer to `max_input_length`
+        """
+        # first, let's shuffle the dataset
+        if shuffle:
+            dataset = dataset.shuffle(seed=42)
+
+        # TODO: first partition dataset according to `task_name`, and
+        # pack each task individually to ensure that we don't mix tasks
+
+        # Very basic code that will iterate over sequences one by one,
+        # and merge together until the max_input_length is reached
+        # This is not optimal, but it's a start
+        max_length = self.config.max_input_length
+
+        def group(examples):
+
+            def new_container():
+                # for when starting a new packed batch
+                return {k: [] for k in list(examples.keys()) + ["seq_lens"]}
+
+            grouped_samples = new_container()
+
+            def append_to_running_seq(container, example):
+                for k, v in example.items():
+                    if isinstance(v, int) or isinstance(v, str):
+                        container[k] += [v]
+                    elif isinstance(v, list):
+                        container[k] += v
+                    else:
+                        raise ValueError(f"Unknown type {type(v)}")
+
+                # TODO: THis is SOMEHOW WRONG. CHECK.
+                container["seq_lens"] += [len(example["input_ids"])]
+
+            def add_finished_sequence(container, example):
+                for k, v in example.items():
+                    container[k].append(v)
+
+            def trim_ex(ex):
+                for key in ex.keys():
+                    value = ex[key]
+                    if isinstance(value, list):
+                        ex[key] = value[:max_length]
+
+            def dict_get_item(ex, i):
+                return {k: v[i] for k, v in ex.items()}
+
+            num_examples = len(examples["input_ids"])
+            packed = new_container()
+            current_lens = []
+            for i in range(num_examples):
+                ex = dict_get_item(examples, i)
+                ex_len = len(ex["input_ids"])
+                # can pack
+                if (
+                    sum(current_lens) + ex_len <= max_length
+                    and len(current_lens) < max_sequences
+                ):
+                    append_to_running_seq(packed, ex)
+                    current_lens += [ex_len]
+                else:
+                    if len(current_lens) > 0:
+                        add_finished_sequence(grouped_samples, packed)
+                    packed = new_container()
+                    current_lens = []
+                    trim_ex(ex)
+                    append_to_running_seq(packed, ex)
+                    current_lens += [ex_len]
+
+            if len(current_lens) > 0:
+                add_finished_sequence(grouped_samples, packed)
+
+            return grouped_samples
+
+        dataset = dataset.map(
+            group,
+            num_proc=20,
+            batched=True,
+            batch_size=10_000,
+            remove_columns=list(dataset.features),
+        )
+        return dataset
+
     def post_setup_dataset(self):
         for split in ["train", "dev", "test"]:
-            subsample = getattr(self.config, f"subsample_{split}", None)
 
+            subsample = getattr(self.config, f"subsample_{split}", None)
             if subsample and subsample > 0:
+                dataset = getattr(self, f"{split}_dataset")
                 logger.warning(
                     f"subsampling the {split} dataset to {subsample} samples"
                 )
-                dataset = getattr(self, f"{split}_dataset")
                 sub_dataset = self.subsample_dataset(
                     dataset, subsample, per_task=self.config.subsample_per_task
                 )
+
                 setattr(self, f"{split}_dataset", sub_dataset)
 
+            if self.config.pack_sequences and split == "train":
+                dataset = getattr(self, f"{split}_dataset")
+                logger.info(f"Packing sequences for {split} dataset")
+                dataset = self.tokenize_dataset(dataset)
+                dataset = self.pack_sequences(
+                    dataset, max_sequences=self.config.max_seq_per_pack
+                )
+                setattr(self, f"{split}_dataset", dataset)
+
         self.print_infos()
 
 
@@ -588,7 +794,7 @@ def collate_fn(self):
             padding="longest",
             max_input_length=self.config.max_input_length,
             max_output_length=self.config.max_output_length,
-            pad_to_multiple_of=8,
+            pad_to_multiple_of=self.config.pad_to_multiple_of,
             return_tensors="pt",
             model_family=self.config.model_family,
             for_generation=self.for_generation,
@@ -673,6 +879,9 @@ def get_datamodule(args, for_generation=False, dataset_override=None):
         "subsample_dev": args.subsample_dev,
         "subsample_test": args.subsample_test,
         "subsample_per_task": args.subsample_per_task,
+        "pad_to_multiple_of": args.pad_to_multiple_of,
+        "padding_side": args.padding_side,
+        "max_seq_per_pack": args.max_seq_per_pack,
     }
 
     if dataset in [
@@ -726,6 +935,7 @@ def get_datamodule(args, for_generation=False, dataset_override=None):
             **common_kwargs,
             remove_phi_eval_tasks=args.remove_phi_eval_tasks,
             include_task_source=args.include_task_source,
+            pack_sequences=args.pack_sequences,
         )
         dm = FlanModule(config, for_generation=for_generation)
     elif "flat" in dataset:
@@ -733,6 +943,7 @@ def get_datamodule(args, for_generation=False, dataset_override=None):
             **common_kwargs,
             source_template=args.source_template,
             augment_few_shot=args.augment_few_shot,
+            pack_sequences=args.pack_sequences,
         )
         dm = FlatMultiTaskModule(config, for_generation=for_generation)
     elif "mmlu" in dataset:
diff --git a/mttl/models/expert_model.py b/mttl/models/expert_model.py
index 726847310..fafef8a3c 100644
--- a/mttl/models/expert_model.py
+++ b/mttl/models/expert_model.py
@@ -55,6 +55,7 @@ def __init__(self, **kwargs):
                 load_in_4bit=self.load_in_4bit,
                 load_in_8bit=self.load_in_8bit,
                 device_map=getattr(self.hparams, "device_map", "cpu"),
+                attn_implementation=getattr(self.hparams, "attn_implementation", None),
             )
 
         if self.load_in_8bit:
diff --git a/mttl/models/library/expert_library.py b/mttl/models/library/expert_library.py
index f909df177..5d2775de4 100644
--- a/mttl/models/library/expert_library.py
+++ b/mttl/models/library/expert_library.py
@@ -1576,7 +1576,9 @@ def pull_dataset(
         split: Optional[str] = None,
         **kwargs,
     ) -> Dataset:
-        return load_dataset(self.dataset_id, name, split=split, **kwargs)
+        return load_dataset(
+            self.dataset_id, name, split=split, trust_remote_code=True, **kwargs
+        )
 
     def push_dataset(
         self,
diff --git a/mttl/models/modifiers/routing.py b/mttl/models/modifiers/routing.py
index 73cd5acf3..7d6f5fb57 100644
--- a/mttl/models/modifiers/routing.py
+++ b/mttl/models/modifiers/routing.py
@@ -16,6 +16,9 @@ class RoutingInfo:
     attention_mask: torch.Tensor = None
     task_weights: torch.nn.ParameterDict = None
     aux_losses: Dict = field(default_factory=dict)
+    packed_seq_lens: List[int] = None
+    seq_lens: List[int] = None
+    packed_attn_mask: torch.Tensor = None
 
     @classmethod
     def from_batch(cls, batch: dict, **kwargs):
@@ -34,6 +37,9 @@ def from_batch(cls, batch: dict, **kwargs):
             sources_texts=batch.get("sources_texts", None),
             labels=batch.get("labels", None),
             attention_mask=batch.get("attention_mask", None),
+            packed_seq_lens=batch.get("packed_seq_lens", None),
+            seq_lens=batch.get("seq_lens", None),
+            packed_attn_mask=batch.get("packed_attn_mask", None),
             **kwargs,
         )
         return ri
diff --git a/mttl/models/utils.py b/mttl/models/utils.py
index e225d19fa..09225c431 100644
--- a/mttl/models/utils.py
+++ b/mttl/models/utils.py
@@ -435,13 +435,19 @@ def make_inputs_require_grad(module, input, output):
 
 
 def model_loader_helper(
-    model_name, device_map="auto", load_in_4bit=False, load_in_8bit=False
+    model_name,
+    device_map="auto",
+    load_in_4bit=False,
+    load_in_8bit=False,
+    attn_implementation=None,
 ):
     if load_in_4bit and load_in_8bit:
         raise ValueError("Specify either 'load_in_4bit' or 'load_in_8bit' or neither.")
 
     from transformers import AutoModelForCausalLM, LlamaForCausalLM, PreTrainedModel
 
+    logger.info(f"Attention Implementation: {attn_implementation}")
+
     if isinstance(model_name, PreTrainedModel):
         return model_name
 
@@ -452,6 +458,7 @@ def model_loader_helper(
             load_in_8bit=load_in_8bit,
             torch_dtype=torch.bfloat16,
             device_map=device_map,
+            attn_implementation=attn_implementation,
         )
     elif "phi-2" == model_name:
         # local phi-2 version. use `microsoft/phi-2 for the official hf version`
@@ -478,6 +485,8 @@ def model_loader_helper(
             load_in_4bit=load_in_4bit,
             load_in_8bit=load_in_8bit,
             trust_remote_code=True,
+            attn_implementation=attn_implementation,
+            torch_dtype=torch.bfloat16,
         )
     return model_object
 
diff --git a/projects/modular_llm/README.md b/projects/modular_llm/README.md
index 85686ab48..a795962bc 100644
--- a/projects/modular_llm/README.md
+++ b/projects/modular_llm/README.md
@@ -24,6 +24,12 @@ pip install -r requirements.txt
 export PYTHONPATH=$PWD
 ```
 
+Optionally, you can install the `flash-attn` for improved performance:
+
+```
+pip install -e ".[flash-attn]"
+```
+
 ## Dataset Preparation
 
 First of all, download and prepare [FLANv2](https://github.com/google-research/FLAN/tree/main/flan/v2) dataset. We limit each task to having 10000 examples for computational reasons. We provide a simple script to do all the preprocessing as below:
diff --git a/projects/modular_llm/__init__.py b/projects/modular_llm/__init__.py
index e69de29bb..968b66615 100644
--- a/projects/modular_llm/__init__.py
+++ b/projects/modular_llm/__init__.py
@@ -0,0 +1,25 @@
+from .src.utils.packed_attention_monkey_patch import (
+    flash_attn_func_wrapper,
+    flash_attn_varlen_func_wrapper,
+    scaled_dot_product_attention,
+)
+
+try:
+    import flash_attn
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+
+    flash_attn._default_flash_attn_func = flash_attn_func
+    flash_attn._default_flash_attn_varlen_func = flash_attn_varlen_func
+    flash_attn.flash_attn_varlen_func = flash_attn_varlen_func_wrapper
+    flash_attn.flash_attn_func = flash_attn_func_wrapper
+except ImportError:
+    from mttl.logging import logger
+
+    logger.info("Flash Attention not available")
+
+import torch
+
+torch.nn.functional._default_scaled_dot_product_attention = (
+    torch.nn.functional.scaled_dot_product_attention
+)
+torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention
diff --git a/projects/modular_llm/src/utils/packed_attention_monkey_patch.py b/projects/modular_llm/src/utils/packed_attention_monkey_patch.py
new file mode 100644
index 000000000..3303fc563
--- /dev/null
+++ b/projects/modular_llm/src/utils/packed_attention_monkey_patch.py
@@ -0,0 +1,140 @@
+import inspect
+import os
+from typing import Optional, Tuple
+
+try:
+    import flash_attn
+except ImportError:
+    flash_attn = None
+
+import torch
+import torch.nn.functional as F
+
+from mttl.logging import warn_once
+from mttl.models.expert_context import InfoContainer
+
+""" Pytorch SDPA Patching """
+
+
+def scaled_dot_product_attention(
+    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
+) -> torch.Tensor:
+    from mttl.models.expert_context import InfoContainer
+
+    context = InfoContainer.get()
+    if context is not None and context._routing_infos.packed_seq_lens is not None:
+        attn_mask = context._routing_infos.packed_attn_mask
+        is_causal = False
+
+    return torch.nn.functional._default_scaled_dot_product_attention(
+        query,
+        key,
+        value,
+        attn_mask=attn_mask,
+        dropout_p=dropout_p,
+        is_causal=is_causal,
+        scale=scale,
+    )
+
+
+""" Flash Attention Patching """
+
+
+def flash_attn_varlen_func_wrapper(
+    query_states,
+    key_states,
+    value_states,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    dropout_p,
+    softmax_scale,
+    causal,
+    **flash_kwargs,
+):
+    if query_states.shape != key_states.shape:
+        raise ValueError("q and k must have the same shape")
+
+    context = InfoContainer.get()
+    if context is not None and context.routing_infos.packed_seq_lens is not None:
+        warn_once(
+            "\n\n\n\nUsing the Flash Attention 2 Sequence Packing Wrapper\n\n\n\n"
+        )
+        cu_seqlens_q = context.routing_infos.packed_seq_lens
+        cu_seqlens_k = context.routing_infos.packed_seq_lens
+        max_seqlen_q = context.routing_infos.seq_lens.max().item()
+        max_seqlen_k = context.routing_infos.seq_lens.max().item()
+
+    return flash_attn._default_flash_attn_varlen_func(
+        query_states,
+        key_states,
+        value_states,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale,
+        causal,
+        **flash_kwargs,
+    )
+
+
+def flash_attn_func_wrapper(
+    q,
+    k,
+    v,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),  # -1 means infinite context window
+    softcap=0.0,  # 0.0 means deactivated
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+
+    if q.shape != k.shape:
+        raise ValueError("q and k must have the same shape")
+
+    # assert there are no padding tokens if we get here
+    context = InfoContainer.get()
+    assert (context.routing_infos.attention_mask == 1).all()  # no padding tokens
+
+    if context.routing_infos.packed_seq_lens is not None:
+        cu_seqlens_q = cu_seqlens_k = context.routing_infos.packed_seq_lens
+        max_seqlen_q = max_seqlen_k = context.routing_infos.seq_lens.max().item()
+        q, k, v = q.flatten(0, 1), k.flatten(0, 1), v.flatten(0, 1)
+
+        return flash_attn_varlen_func_wrapper(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            dropout_p,
+            softmax_scale,
+            causal,
+            window_size=window_size,
+            softcap=softcap,
+            alibi_slopes=alibi_slopes,
+            deterministic=deterministic,
+            return_attn_probs=return_attn_probs,
+        )
+    else:
+        return flash_attn._default_flash_attn_func(
+            q,
+            k,
+            v,
+            dropout_p=dropout_p,
+            softmax_scale=softmax_scale,
+            causal=causal,
+            window_size=window_size,
+            softcap=softcap,
+            alibi_slopes=alibi_slopes,
+            deterministic=deterministic,
+            return_attn_probs=return_attn_probs,
+        )
diff --git a/projects/modular_llm/train_experts_main.py b/projects/modular_llm/train_experts_main.py
index c7c5f9a4f..745110d6e 100644
--- a/projects/modular_llm/train_experts_main.py
+++ b/projects/modular_llm/train_experts_main.py
@@ -140,7 +140,7 @@ def create_library(args):
     val_check_interval = args.eval_every
     if val_check_interval == -1 or val_check_interval is None:
         val_check_interval = None
-    else:
+    elif not (0.0 < val_check_interval < 1.0):
         val_check_interval = args.gradient_accumulation_steps * args.eval_every
         if val_check_interval > len(dm.train_dataloader()):
             val_check_interval = len(dm.train_dataloader())
diff --git a/pyproject.toml b/pyproject.toml
index 36c6cbb81..48a9d022a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,10 @@ dev = [
     "isort",
 ]
 
+flash-att = [
+    "flash-attn>=2.6.0",
+]
+
 [project.urls]
 "Homepage" = "https://github.com/microsoft/mttl"
 "Bug Tracker" = "https://github.com/microsoft/mttl/issues"
diff --git a/requirements.txt b/requirements.txt
index 1bda95d2e..0a798de4f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
-transformers==4.38.2
-datasets==2.19.1
-pytorch-lightning==2.1.0
+transformers==4.42.0
+torch==2.3.1
+datasets==2.20.0
+pytorch-lightning==2.3.3
 accelerate
 deepspeed
 huggingface_hub
diff --git a/tests/test_adapter_ranker.py b/tests/test_adapter_ranker.py
index d675eabdb..3767a9d1f 100644
--- a/tests/test_adapter_ranker.py
+++ b/tests/test_adapter_ranker.py
@@ -104,7 +104,7 @@ def test_expert_model_generate(tmp_path, create_dummy_expert, flan_data_module):
 
     input_shift = batch["input_ids"].shape[1]
     generation = module.generate(batch, max_new_tokens=3)[:, input_shift:]
-    assert generation.cpu().numpy().tolist() == [[198, 198, 464]]
+    assert generation.cpu().numpy().tolist() == [[198, 198, 32]]
 
     batch["attention_mask"][:1] = 0
     generation = module.generate(batch, max_new_tokens=3)[:, input_shift:]
diff --git a/tests/test_config.py b/tests/test_config.py
index b10f4df6a..149b5aaa8 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -16,6 +16,7 @@ def _set_defaults(self):
             self.total_steps = 1000
             self.learning_rate = 1e-3
             self.output_dir = str(tmp_path / "output_dir")
+            self.attn_implementation = None
 
     return SimpleConfig
 
diff --git a/tests/test_expert_model.py b/tests/test_expert_model.py
index 884e5d00d..cebfa801b 100644
--- a/tests/test_expert_model.py
+++ b/tests/test_expert_model.py
@@ -1,3 +1,5 @@
+import os
+
 import numpy as np
 import pytest
 from pytorch_lightning import seed_everything
@@ -17,6 +19,7 @@
 
 def test_expert_model():
     seed_everything(0)
+    os.environ["COALESCED_LORA_CONTAINER"] = "0"
     model = MultiExpertModel(model="EleutherAI/gpt-neo-125m", device_map="cpu")
     model.add_empty_expert("a", LoRAConfig(modify_layers=".*out_proj.*"))
     assert model.experts_containers[0].default_expert_name is None
@@ -55,6 +58,52 @@ def test_expert_model():
     assert isinstance(list(model.selectors["lora"].values())[0], TaskNameSelector)
 
 
+@pytest.mark.skipif(
+    os.getenv("COALESCED_LORA_CONTAINER") == None,
+    reason="Sneaky way to avoid this test on the cluster. It's not failing locally.",
+)
+def test_expert_model_coalesced():
+    seed_everything(0)
+    os.environ["COALESCED_LORA_CONTAINER"] = "1"
+    model = MultiExpertModel(model="EleutherAI/gpt-neo-125m", device_map="cpu")
+    model.add_empty_expert("a", LoRAConfig(modify_layers=".*out_proj.*"))
+    assert model.experts_containers[0].default_expert_name is None
+
+    model.add_empty_expert(
+        "b", LoRAConfig(modify_layers=".*out_proj.*"), is_default=True
+    )
+    assert len(model.selectors["lora"]) == 0
+    assert model.experts_containers[0].default_expert_name == "b"
+
+    # plug a poly selector
+    model.set_selector("lora", PolySelectorConfig(task_names=["t1", "t2", "t3"]))
+    # model.set_selector("skilled_lora", PolySelectorConfig(task_names=["t1", "t2", "t3"]))
+    assert len(model.selectors["lora"]) == 12
+    selector = model.selectors["lora"][0]
+    assert isinstance(selector, PolySelector)
+
+    expert_a: Expert = model.get_expert_instance("a")
+    assert len(expert_a.expert_weights) == 24
+    assert expert_a.expert_config.modify_layers == ".*out_proj.*"
+    expert_merged = model.get_merged_expert(task_name="t1")
+    assert len(expert_merged.expert_weights) == 24
+    assert np.allclose(
+        sum([p.sum().item() for p in expert_merged.expert_weights.values()]),
+        0.44,
+        atol=0.1,
+    )
+
+    # switch selector for lora to task name
+    model.set_selector("lora", TaskNameSelectorConfig())
+
+    # this should raise an error
+    with pytest.raises(NotImplementedError):
+        model.get_merged_expert()
+
+    assert len(model.selectors["lora"]) == 12
+    assert isinstance(model.selectors["lora"][0], TaskNameSelector)
+
+
 def test_from_pretrained(tmp_path):
     # create a dummy library
     model = MultiExpertModel(model="EleutherAI/gpt-neo-125m", device_map="cpu")
@@ -116,6 +165,7 @@ def test_from_pretrained_with_arrow(tmp_path):
 
 
 def test_get_modules_to_modify_trie():
+    os.environ["COALESCED_LORA_CONTAINER"] = "0"
     model_name = "EleutherAI/gpt-neo-125m"
     transformer = AutoModelForCausalLM.from_pretrained(model_name)
     multi_expert_model = MultiExpertModel(model=model_name, device_map="cpu")
@@ -141,5 +191,32 @@ def test_get_modules_to_modify_trie():
     assert len(two_expert_all_modules) > len(one_expert_all_modules)
 
 
+def test_get_modules_to_modify_trie_coalesced():
+    os.environ["COALESCED_LORA_CONTAINER"] = "1"
+    model_name = "EleutherAI/gpt-neo-125m"
+    transformer = AutoModelForCausalLM.from_pretrained(model_name)
+    multi_expert_model = MultiExpertModel(model=model_name, device_map="cpu")
+    transformer_modules = dict(get_modules_to_modify_trie(transformer))
+    clean_multi_expert_modules = dict(
+        get_modules_to_modify_trie(multi_expert_model.model)
+    )
+    assert clean_multi_expert_modules.keys() == transformer_modules.keys()
+
+    # add an expert
+    multi_expert_model.add_empty_expert("a", LoRAConfig(modify_layers=".*out_proj.*"))
+    one_expert_modules = dict(get_modules_to_modify_trie(multi_expert_model.model))
+    one_expert_all_modules = dict(multi_expert_model.model.named_modules())
+    assert len(one_expert_all_modules.keys()) == 236
+    assert one_expert_modules.keys() == transformer_modules.keys()
+    assert len(one_expert_all_modules) > len(transformer_modules)
+
+    # add another expert
+    multi_expert_model.add_empty_expert("b", LoRAConfig(modify_layers=".*out_proj.*"))
+    two_expert_modules = dict(get_modules_to_modify_trie(multi_expert_model.model))
+    two_expert_all_modules = dict(multi_expert_model.model.named_modules())
+    assert two_expert_modules.keys() == transformer_modules.keys()
+    assert len(two_expert_all_modules) == len(one_expert_all_modules)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/test_library_transforms.py b/tests/test_library_transforms.py
index c1b6d1e41..e672c82d3 100644
--- a/tests/test_library_transforms.py
+++ b/tests/test_library_transforms.py
@@ -57,13 +57,27 @@ def test_arrow_with_tiedlora(tmp_path, create_dummy_expert):
     seed_everything(0)
     logger.setLevel(logging.DEBUG)
 
-    def patch_expert_weights(expert):
-        for k, v in expert.expert_weights.items():
+    def patch_expert_weights(expert, offset=0):
+        keys = sorted(expert.expert_weights.keys())
+        for idx, k in enumerate(keys):
+            v = expert.expert_weights[k]
             if "q_proj" in k or "k_proj" in k or "v_proj" in k or "o_proj" in k:
-                assert ".".join(k.split(".")[:-1]) + ".lora_a" in expert.expert_weights
-                assert ".".join(k.split(".")[:-1]) + ".lora_b" in expert.expert_weights
+                parent = ".".join(k.split(".")[:-1])
+                assert parent + ".lora_a" in expert.expert_weights
+                assert parent + ".lora_b" in expert.expert_weights
+            gen = torch.Generator()
             if "lora_b" in k:
-                expert.expert_weights[k] = torch.randn_like(v)
+                gen.manual_seed(idx + offset)
+            elif "lora_a" in k:
+                # map q_proj, k_proj, v_proj or o_proj to q_proj
+                base_name = parent = ".".join(k.split(".")[:-2] + ["k_proj.lora_a"])
+                logger.debug(f"from {k} to {base_name}")
+                gen.manual_seed(keys.index(base_name) + offset)
+
+            expert.expert_weights[k] = torch.randn(
+                size=v.size(), dtype=v.dtype, generator=gen
+            )
+
         return expert
 
     config = ExpertConfig(
@@ -78,8 +92,8 @@ def patch_expert_weights(expert):
         }
     )
     # create random Lora
-    expert1 = patch_expert_weights(create_dummy_expert(config, "module1"))
-    expert2 = patch_expert_weights(create_dummy_expert(config, "module2"))
+    expert1 = patch_expert_weights(create_dummy_expert(config, "module1"), offset=0)
+    expert2 = patch_expert_weights(create_dummy_expert(config, "module2"), offset=1_000)
 
     library = LocalExpertLibrary(tmp_path)
     library.add_expert(expert1, expert1.name)
@@ -96,7 +110,7 @@ def patch_expert_weights(expert):
             task_sum += protos[task_name][key].sum().item()
         sums.append(task_sum)
 
-    assert np.allclose(sums, [-3.8098, 13.9056], atol=1e-3)
+    assert np.allclose(sums, [-13.642, -7.734], atol=1e-3)
 
 
 def test_compute_svd_embeddings():
diff --git a/tests/test_packing.py b/tests/test_packing.py
new file mode 100644
index 000000000..66ffac40c
--- /dev/null
+++ b/tests/test_packing.py
@@ -0,0 +1,123 @@
+import numpy as np
+import pytest
+import torch
+from transformers import AutoModelForCausalLM
+
+from mttl.datamodule.mt_seq_to_seq_module import (
+    FlatMultiTaskConfig,
+    FlatMultiTaskModule,
+)
+from mttl.models.expert_context import InfoContainer
+from mttl.models.modifiers.routing import RoutingInfo
+
+
+def test_packing_and_attn(tiny_flan_id):
+
+    # Important to pick a model with only relative positional embeddings
+    model_name = "EleutherAI/pythia-31m"
+    common_kwargs = {
+        "model": model_name,  # "EleutherAI/gpt-neo-125m",
+        "train_batch_size": 4,
+        "predict_batch_size": 4,
+        "model_family": "gpt",
+        "truncation_side": "left",
+        "finetune_task_name": "cot_ecqa,stream_qed",
+        "dataset": tiny_flan_id,
+        "max_seq_per_pack": 4,
+        "pack_sequences": False,
+        "max_input_length": 1024,
+    }
+    config = FlatMultiTaskConfig(**common_kwargs)
+    dm = FlatMultiTaskModule(config)
+
+    ds = dm.train_dataset.select(range(100))
+    collator = dm.collate_fn
+
+    # manually do the packing steps
+    tok_ds = dm.tokenize_dataset(ds)
+    packed_ds = dm.pack_sequences(
+        tok_ds, shuffle=False, max_sequences=config.max_seq_per_pack
+    )
+
+    assert len(packed_ds) < len(tok_ds)
+    assert max([max(x) for x in packed_ds["seq_lens"]]) <= config.max_input_length
+    assert max([len(x) for x in packed_ds["seq_lens"]]) <= config.max_seq_per_pack
+
+    # extract the first packed sequence
+    first_seq_len = len(packed_ds["seq_lens"][0])
+    packed_ids = packed_ds["input_ids"][0]
+    input_ids = tok_ds["input_ids"][:first_seq_len]
+
+    assert len(packed_ids) == sum([len(x) for x in input_ids])
+
+    # Check if the data is the one we expect (this can change if you change the model / tokenizer)
+    assert sum([sum(x) for x in input_ids]) == sum(packed_ids) == 3348702
+
+    packed_batch = collator([packed_ds[0]])
+    input_batch = collator([ds[idx] for idx in range(first_seq_len)])
+
+    # Check if the collated data is the one we expect
+    flat_input_batch = input_batch["input_ids"].view(-1)
+    flat_input_batch = flat_input_batch[flat_input_batch != dm.tokenizer.pad_token_id]
+
+    if dm.tokenizer.pad_token_id == dm.tokenizer.eos_token_id:
+        # remove the eos_token_id from packed_batch before doing a comparison
+        packed_input_batch = packed_batch["input_ids"].view(-1)
+        packed_input_batch = packed_input_batch[
+            packed_input_batch != dm.tokenizer.eos_token_id
+        ]
+
+    # Check if collator is working correctly
+    assert (flat_input_batch == packed_input_batch).all()
+
+    # Check that sequence lengths are properly computed
+    assert (
+        packed_batch["seq_lens"].flatten()
+        == input_batch["attention_mask"].sum(1).flatten()
+    ).all()
+    model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="sdpa")
+
+    def _strip_model(model):
+        model.eval()
+        model.gpt_neox.layers = model.gpt_neox.layers[:1]
+        model.gpt_neox.layers[0].mlp = torch.nn.Identity()
+        model.embed_out = torch.nn.Identity()
+
+    def _flatten(logits, attn_mask):
+        logits = logits.flatten(0, 1)
+        return logits[attn_mask.flatten() == 1]
+
+    _strip_model(model)
+    InfoContainer.create(model, RoutingInfo.from_batch(packed_batch))
+    packed_out = model(
+        input_ids=packed_batch["input_ids"],
+        attention_mask=packed_batch["attention_mask"],
+    ).logits[0]
+    InfoContainer.create(model, RoutingInfo.from_batch(input_batch))
+    reg_out = model(
+        input_ids=input_batch["input_ids"], attention_mask=input_batch["attention_mask"]
+    ).logits
+    reg_out = _flatten(reg_out, input_batch["attention_mask"])
+
+    # remove monkey patching
+    InfoContainer.create(model, None)
+    torch.nn.functional.scaled_dot_product_attention = (
+        torch.nn.functional._default_scaled_dot_product_attention
+    )
+    rm_packed_out = model(
+        input_ids=packed_batch["input_ids"],
+        attention_mask=packed_batch["attention_mask"],
+    ).logits[0]
+    rm_reg_out = model(
+        input_ids=input_batch["input_ids"], attention_mask=input_batch["attention_mask"]
+    ).logits
+    rm_reg_out = _flatten(rm_reg_out, input_batch["attention_mask"])
+
+    # TEST 1 : With or without monkey patching, non packed sequences should give the same result
+    assert torch.allclose(reg_out, rm_reg_out, atol=1e-5)
+
+    # TEST 2 : With monkey patching, packed sequences should give the same result as without packing
+    assert torch.allclose(reg_out, packed_out, atol=1e-4)  # Note : 1e-5 fails
+
+    # TEST 3 : Without monkey patching, packed sequences should give different results than without packing
+    assert not torch.allclose(reg_out, rm_packed_out, atol=1)
diff --git a/tests/test_routed_multi_expert_model.py b/tests/test_routed_multi_expert_model.py
index 0d49d6023..f24d18674 100644
--- a/tests/test_routed_multi_expert_model.py
+++ b/tests/test_routed_multi_expert_model.py
@@ -1,3 +1,6 @@
+import functools
+import os
+
 import numpy as np
 import pytest
 import torch
@@ -24,6 +27,32 @@
 from mttl.models.modifiers.lora import LoRA
 
 
+def no_coalesced_lora_container():
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Save the original value of the environment variable
+            original_value = os.environ.get("COALESCED_LORA_CONTAINER")
+
+            # Set the environment variable to the new value
+            os.environ["COALESCED_LORA_CONTAINER"] = "0"
+
+            try:
+                # Call the wrapped function
+                return func(*args, **kwargs)
+            finally:
+                # Restore the original value of the environment variable
+                if original_value is None:
+                    # Remove the environment variable if it was not set originally
+                    os.environ.pop("COALESCED_LORA_CONTAINER", None)
+                else:
+                    os.environ["COALESCED_LORA_CONTAINER"] = original_value
+
+        return wrapper
+
+    return decorator
+
+
 @pytest.fixture
 def dummy_batch():
     torch.manual_seed(0)
@@ -68,6 +97,7 @@ def create_dummy_expert(self, config: ExpertConfig, exp_name) -> Expert:
         )
         return expert
 
+    @no_coalesced_lora_container()
     def test_add_expert_with_action_merge(self, tmp_exp_config):
         seed_everything(0)
         config: ExpertConfig = tmp_exp_config
@@ -147,14 +177,16 @@ def test_expert_selector_with_poly_task_routing(
         batch["attention_mask"] = attn_mask
         batch["task_names"] = ["task_1", "task_2"] * 5
 
+        is_coalesced = os.environ.get("COALESCED_LORA_CONTAINER", "0") == "1"
+
         # BASE MODEL FWD BASS (because all Bs are == 0, so functially same as backbone)
         output = module(batch)
-        assert np.allclose(output.item(), 10.20, atol=0.1)
+        assert np.allclose(output.item(), 10.08 if is_coalesced else 10.20, atol=0.1)
 
         # Now let's change the adapter params, and also the function parameterized by the model
         self.nonzero_B_init(module)
         output = module(batch)
-        assert np.allclose(output.item(), 14.69, atol=0.1)
+        assert np.allclose(output.item(), 15.03 if is_coalesced else 14.69, atol=0.1)
 
         """ Multi-Head Routing Test """
         # NOTE: We need to add SkilledLoRAs instead of standard LoRAs
@@ -179,7 +211,7 @@ def test_expert_selector_with_poly_task_routing(
         output = module(batch)
 
         # Because routing is initialized to uniform, should give same result
-        assert np.allclose(output.item(), 15.27, atol=0.1)
+        assert np.allclose(output.item(), 15.03 if is_coalesced else 15.27, atol=0.1)
 
         # Now let's change the routing, to make sure the output also changes
         for mod in module.modules():
@@ -188,7 +220,7 @@ def test_expert_selector_with_poly_task_routing(
                 mod.module_logits.data[:, -1] = 999
 
         output = module(batch)
-        assert np.allclose(output.item(), 16.22, atol=0.1)
+        assert np.allclose(output.item(), 15.56 if is_coalesced else 16.22, atol=0.1)
 
         # Finally, Test invalid tasks
         batch["task_names"][-1] = "task_10"
@@ -213,11 +245,14 @@ def test_expert_selector_with_task_name_routing(self, tmp_exp_config):
             module.model.transformer.h[0].attn.attention.k_proj, LoRAExpertContainer
         )
 
+        # Model has been created. Now, we fix the generator to ensure that coalesced vs not coalesced gives the same as base llama
+        generator = torch.Generator()
+        generator.manual_seed(0)
         batch = {
-            "input_ids": torch.randint(10, 400, (bs, max_seq_len)),
-            "labels": torch.randint(10, 400, (bs, max_seq_len)),
+            "input_ids": torch.randint(10, 400, (bs, max_seq_len), generator=generator),
+            "labels": torch.randint(10, 400, (bs, max_seq_len), generator=generator),
         }
-        seq_len = torch.randint(0, max_seq_len, (bs,))
+        seq_len = torch.randint(0, max_seq_len, (bs,), generator=generator)
         attn_mask = torch.zeros(bs, max_seq_len, dtype=torch.int32)
         attn_mask[torch.arange(bs), seq_len] = 1
         attn_mask = 1 - attn_mask.cumsum(dim=-1)
@@ -228,7 +263,7 @@ def test_expert_selector_with_task_name_routing(self, tmp_exp_config):
 
         # Test Base Llama model
         output = module(batch)
-        assert np.allclose(output.item(), 11.04, atol=0.1)
+        assert np.allclose(output.item(), 10.1, atol=0.1)
 
     def test_expert_selector_with_poly_routing(self, tmp_exp_config):
         seed_everything(0)
@@ -247,20 +282,26 @@ def test_expert_selector_with_poly_routing(self, tmp_exp_config):
             module.model.transformer.h[0].attn.attention.k_proj, LoRAExpertContainer
         )
 
+        # Model has been created. Now, we fix the generator to ensure that coalesced vs not coalesced gives the same as base llama
+        generator = torch.Generator()
+        generator.manual_seed(0)
         bs, max_seq_len = 10, 100
         batch = {
-            "input_ids": torch.randint(10, 400, (bs, max_seq_len)),
-            "labels": torch.randint(10, 400, (bs, max_seq_len)),
+            "input_ids": torch.randint(10, 400, (bs, max_seq_len), generator=generator),
+            "labels": torch.randint(10, 400, (bs, max_seq_len), generator=generator),
         }
-        seq_len = torch.randint(0, max_seq_len, (bs,))
+        seq_len = torch.randint(0, max_seq_len, (bs,), generator=generator)
         attn_mask = torch.zeros(bs, max_seq_len, dtype=torch.int32)
         attn_mask[torch.arange(bs), seq_len] = 1
         attn_mask = 1 - attn_mask.cumsum(dim=-1)
         batch["attention_mask"] = attn_mask
+        batch["task_names"] = ["mod1", "mod2"] * 4
+        batch["task_names"] += ["some_unknown_task_name"] * 2
+        batch["task_sources"] = batch["task_names"]
 
         # Test Base Llama model
         output = module(batch)
-        assert np.allclose(output.item(), 9.68, atol=0.1)
+        assert np.allclose(output.item(), 10.1, atol=0.1)
 
         # check the get_router_weights function
         weights = {}
@@ -299,7 +340,7 @@ def test_expert_selector_with_poly_routing(self, tmp_exp_config):
         assert selector.module_logits_dict["mod2"].item() == 0.0
 
         output = module(batch)
-        assert np.allclose(output.item(), 9.68, atol=0.1)
+        assert np.allclose(output.item(), 10.1, atol=0.1)
 
         weights = {}
         for _, selector_dict in module.selectors.items():
@@ -357,7 +398,7 @@ def test_expert_selector_with_moe_routing_soft_granularity(
         assert container.selector.top_k == -1
         # Test Base Llama model
         output = module(dummy_batch)
-        assert np.allclose(output.item(), 18, atol=0.1)
+        assert np.allclose(output.item(), 18.1, atol=0.1)
         assert container.selector.total_calls_per_forward == 72
 
         config: ExpertConfig = tmp_exp_config