From 67c6011f1659813675817c9a3bed92c68498cdb9 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 13:41:33 -0800
Subject: [PATCH 01/14] Add OLMo November release implementation

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/olmo_1124.py | 404 ++++++++++++++++++++++++
 1 file changed, 404 insertions(+)
 create mode 100644 vllm/model_executor/models/olmo_1124.py

diff --git a/vllm/model_executor/models/olmo_1124.py b/vllm/model_executor/models/olmo_1124.py
new file mode 100644
index 0000000000000..139e4c0994757
--- /dev/null
+++ b/vllm/model_executor/models/olmo_1124.py
@@ -0,0 +1,404 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
+# Copyright 2024 The vLLM team.
+# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only OLMo2 model compatible with HuggingFace weights."""
+
+from functools import partial
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Olmo1124Config
+
+from vllm.attention import Attention, AttentionMetadata
+from vllm.config import VllmConfig
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.distributed.communication_op import tensor_model_parallel_all_gather
+from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
+from vllm.distributed.utils import split_tensor_along_last_dim
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+
+from vllm.model_executor.models.interfaces import SupportsPP
+from vllm.model_executor.models.utils import (
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+
+class OlmoAttention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(x)`` in ``LN(MLP(x + LN(Attention(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.config = vllm_config.model_config.hf_config
+        assert isinstance(self.config, Olmo1124Config)
+
+        hidden_size = self.config.hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = self.config.num_attention_heads
+
+        assert hidden_size % self.total_num_heads == 0
+        assert self.total_num_heads % self.tp_size == 0
+
+        self.num_heads = self.total_num_heads // self.tp_size
+        self.total_num_kv_heads = (
+            self.config.num_key_value_heads or self.total_num_heads
+        )
+        if self.total_num_kv_heads >= self.tp_size:
+            assert self.total_num_kv_heads % self.tp_size == 0
+        else:
+            assert self.tp_size % self.total_num_kv_heads == 0
+
+        self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.max_position_embeddings = self.config.max_position_embeddings
+        self.rope_theta = self.config.rope_theta
+
+        # Attention input projection. Projects x -> (q, k, v)
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=vllm_config.quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.k_norm = RMSNorm(
+            self.total_num_kv_heads * self.head_dim,
+            eps=self.config.rms_norm_eps,
+        )
+        self.q_norm = RMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
+
+        # Rotary embeddings.
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta, # type: ignore
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=vllm_config.cache_config,
+            quant_config=vllm_config.quant_config,
+        )
+
+        # Attention output projection.
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=vllm_config.quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.tp_size > 1:
+            q = tensor_model_parallel_all_gather(q.contiguous())
+            k = tensor_model_parallel_all_gather(k.contiguous())
+        q = self.q_norm.forward_native(q)
+        k = self.k_norm.forward_native(k)
+        if self.tp_size > 1:
+            splitter = partial(
+                split_tensor_along_last_dim, num_partitions=self.tp_size
+            )
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+        return q, k
+
+    def forward(self, positions: torch.Tensor, hidden_states: torch.Tensor,
+                kv_cache: torch.Tensor,
+                attn_metadata: AttentionMetadata) -> torch.Tensor:
+        """Project to q/k/v, apply QK-norm and RoPE, attend, project out."""
+        qkv, _ = self.qkv_proj(hidden_states)
+        # q and k/v widths differ under GQA (fewer KV heads than query heads),
+        # so split by the per-rank sizes rather than into three equal chunks.
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        # Normalize q and k before the rotary embedding is applied.
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class OlmoMLP(nn.Module):
+    """
+    This is the MLP block where the output is computed as
+    ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        assert isinstance(config, Olmo1124Config)
+        hidden_size = config.hidden_size
+        intermediate_size = config.intermediate_size
+
+        # Feed-forward input projection.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=vllm_config.quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+
+        # Activation function.
+        self.act_fn = SiluAndMul()
+
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=vllm_config.quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class OlmoDecoderLayer(nn.Module):
+    """
+    This is a typical transformer block where the output is
+    computed as ``LN(MLP(x + LN(Attention(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        assert isinstance(config, Olmo1124Config)
+        # Attention block.
+        self.self_attn = OlmoAttention(
+            vllm_config=vllm_config, prefix=f"{prefix}.self_attn"
+        )
+
+        # MLP block.
+        self.mlp = OlmoMLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
+
+        # LayerNorm
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.post_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        # Attention block.
+        residual = hidden_states
+        hidden_states = self.self_attn(
+            positions, hidden_states, kv_cache, attn_metadata
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # MLP block.
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class OlmoModel(nn.Module):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        self.config = vllm_config.model_config.hf_config
+        assert isinstance(self.config, Olmo1124Config)
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=f"{prefix}.embed_tokens",
+        )
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            self.config.num_hidden_layers,
+            lambda prefix: OlmoDecoderLayer(
+                vllm_config=vllm_config, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = RMSNorm(
+            self.config.hidden_size,
+            eps=self.config.rms_norm_eps,
+        )
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states"], self.config.hidden_size
+            )
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors],
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        """
+        if get_pp_group().is_first_rank:
+            # Get embeddings of input.
+            # shape: (batch_size, seq_len, d_model)
+            inputs_embeds = self.embed_tokens(input_ids)
+
+            # No position embeddings here; RoPE is applied inside attention.
+            hidden_states = inputs_embeds
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            assert isinstance(hidden_states, torch.Tensor)
+
+        # Apply blocks one-by-one.
+        for i in range(self.start_layer, self.end_layer):
+            # shape: (batch_size, seq_len, d_model)
+            hidden_states = self.layers[i](
+                positions,
+                hidden_states,
+                kv_caches[i - self.start_layer],
+                attn_metadata,
+            )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states})
+
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Olmo1124ForCausalLM(nn.Module, SupportsPP):
+    """
+    Extremely barebones HF model wrapper.
+    """
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        assert isinstance(config, Olmo1124Config)
+        self.config = config
+        self.model = OlmoModel(
+            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=vllm_config.quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(
+            self.lm_head, hidden_states, sampling_metadata
+        )
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens

From e0735906d7dde064e8ec8b437c45f704ac00f478 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 13:42:23 -0800
Subject: [PATCH 02/14] Add OLMo model to registry

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/registry.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 22c2e328bfb65..831dcebd8b7ae 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -73,6 +73,7 @@
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
     "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
+    "Olmo1124ForCausalLM": ("olmo_1124", "Olmo1124ForCausalLM"),
     "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),

From 75bc97323675513691e1fcd7a9012dee5d0b35f4 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 13:43:06 -0800
Subject: [PATCH 03/14] Add weight loading

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/olmo_1124.py | 53 ++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/olmo_1124.py b/vllm/model_executor/models/olmo_1124.py
index 139e4c0994757..b734dd5386a7c 100644
--- a/vllm/model_executor/models/olmo_1124.py
+++ b/vllm/model_executor/models/olmo_1124.py
@@ -22,7 +22,7 @@
 """Inference-only OLMo2 model compatible with HuggingFace weights."""
 
 from functools import partial
-from typing import List, Optional, Tuple, Union
+from typing import Iterable, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -48,9 +48,11 @@
     ParallelLMHead,
     VocabParallelEmbedding,
 )
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from vllm.model_executor.models.interfaces import SupportsPP
 from vllm.model_executor.models.utils import (
+    is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
     make_layers,
     maybe_prefix,
@@ -402,3 +404,52 @@ def sample(
     ) -> Optional[SamplerOutput]:
         next_tokens = self.sampler(logits, sampling_metadata)
         return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if (
+                "rotary_emb.cos_cached" in name
+                or "rotary_emb.sin_cached" in name
+            ):
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if is_pp_missing_parameter(name, self):
+                continue
+            # With tie_word_embeddings, we can skip lm_head.weight
+            # The weight might appear unnecessarily in the files if the model is
+            # processed with quantization, LoRA, fine-tuning, etc.
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader  # type: ignore
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(
+                    param, "weight_loader", default_weight_loader
+                )
+                weight_loader(param, loaded_weight)

From ab57eb0e4cc029ba810180a34afef4acd183de31 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 13:46:42 -0800
Subject: [PATCH 04/14] Run formatter

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/olmo_1124.py | 98 ++++++++++---------------
 1 file changed, 37 insertions(+), 61 deletions(-)

diff --git a/vllm/model_executor/models/olmo_1124.py b/vllm/model_executor/models/olmo_1124.py
index b734dd5386a7c..271fcfcb88af4 100644
--- a/vllm/model_executor/models/olmo_1124.py
+++ b/vllm/model_executor/models/olmo_1124.py
@@ -36,27 +36,19 @@
 from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-
 from vllm.model_executor.models.interfaces import SupportsPP
 from vllm.model_executor.models.utils import (
-    is_pp_missing_parameter,
-    make_empty_intermediate_tensors_factory,
-    make_layers,
-    maybe_prefix,
-)
+    is_pp_missing_parameter, make_empty_intermediate_tensors_factory,
+    make_layers, maybe_prefix)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
@@ -81,9 +73,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         assert self.total_num_heads % self.tp_size == 0
 
         self.num_heads = self.total_num_heads // self.tp_size
-        self.total_num_kv_heads = (
-            self.config.num_key_value_heads or self.total_num_heads
-        )
+        self.total_num_kv_heads = (self.config.num_key_value_heads
+                                   or self.total_num_heads)
         if self.total_num_kv_heads >= self.tp_size:
             assert self.total_num_kv_heads % self.tp_size == 0
         else:
@@ -112,16 +103,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.total_num_kv_heads * self.head_dim,
             eps=self.config.rms_norm_eps,
         )
-        self.q_norm = RMSNorm(
-            self.config.hidden_size, eps=self.config.rms_norm_eps
-        )
+        self.q_norm = RMSNorm(self.config.hidden_size,
+                              eps=self.config.rms_norm_eps)
 
         # Rotary embeddings.
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=self.max_position_embeddings,
-            base=self.rope_theta, # type: ignore
+            base=self.rope_theta,  # type: ignore
         )
         self.scaling = self.head_dim**-0.5
         self.attn = Attention(
@@ -142,18 +132,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             prefix=f"{prefix}.o_proj",
         )
 
-    def _apply_qk_norm(
-        self, q: torch.Tensor, k: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    def _apply_qk_norm(self, q: torch.Tensor,
+                       k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         if self.tp_size > 1:
             q = tensor_model_parallel_all_gather(q.contiguous())
             k = tensor_model_parallel_all_gather(k.contiguous())
         q = self.q_norm.forward_native(q)
         k = self.k_norm.forward_native(k)
         if self.tp_size > 1:
-            splitter = partial(
-                split_tensor_along_last_dim, num_partitions=self.tp_size
-            )
+            splitter = partial(split_tensor_along_last_dim,
+                               num_partitions=self.tp_size)
             q = splitter(q)[self.tp_rank]
             k = splitter(k)[self.tp_rank]
         return q, k
@@ -231,21 +219,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         assert isinstance(config, Olmo1124Config)
         # Attention block.
-        self.self_attn = OlmoAttention(
-            vllm_config=vllm_config, prefix=f"{prefix}.self_attn"
-        )
+        self.self_attn = OlmoAttention(vllm_config=vllm_config,
+                                       prefix=f"{prefix}.self_attn")
 
         # MLP block.
         self.mlp = OlmoMLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
 
         # LayerNorm
-        self.post_attention_layernorm = RMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps
-        )
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
 
-        self.post_feedforward_layernorm = RMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps
-        )
+        self.post_feedforward_layernorm = RMSNorm(config.hidden_size,
+                                                  eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -256,9 +241,8 @@ def forward(
     ) -> torch.Tensor:
         # Attention block.
         residual = hidden_states
-        hidden_states = self.self_attn(
-            positions, hidden_states, kv_cache, attn_metadata
-        )
+        hidden_states = self.self_attn(positions, hidden_states, kv_cache,
+                                       attn_metadata)
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = hidden_states + residual
 
@@ -271,6 +255,7 @@ def forward(
 
 
 class OlmoModel(nn.Module):
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
@@ -283,9 +268,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             self.config.num_hidden_layers,
-            lambda prefix: OlmoDecoderLayer(
-                vllm_config=vllm_config, prefix=prefix
-            ),
+            lambda prefix: OlmoDecoderLayer(vllm_config=vllm_config,
+                                            prefix=prefix),
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(
@@ -293,10 +277,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             eps=self.config.rms_norm_eps,
         )
         self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states"], self.config.hidden_size
-            )
-        )
+            make_empty_intermediate_tensors_factory(["hidden_states"],
+                                                    self.config.hidden_size))
 
     def forward(
         self,
@@ -350,9 +332,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         assert isinstance(config, Olmo1124Config)
         self.config = config
-        self.model = OlmoModel(
-            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
-        )
+        self.model = OlmoModel(vllm_config=vllm_config,
+                               prefix=maybe_prefix(prefix, "model"))
         if config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
@@ -367,8 +348,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
         self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors
-        )
+            self.model.make_empty_intermediate_tensors)
 
     def forward(
         self,
@@ -392,9 +372,8 @@ def compute_logits(
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(
-            self.lm_head, hidden_states, sampling_metadata
-        )
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
         return logits
 
     def sample(
@@ -419,10 +398,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            if (
-                "rotary_emb.cos_cached" in name
-                or "rotary_emb.sin_cached" in name
-            ):
+            if ("rotary_emb.cos_cached" in name
+                    or "rotary_emb.sin_cached" in name):
                 # Models trained using ColossalAI may include these tensors in
                 # the checkpoint. Skip them.
                 continue
@@ -449,7 +426,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(
-                    param, "weight_loader", default_weight_loader
-                )
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
                 weight_loader(param, loaded_weight)

From ffe121d47d97e20e4d6912d12d284fca44fe2422 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 13:52:52 -0800
Subject: [PATCH 05/14] Update tests

Signed-off-by: Shane A <shanea@allenai.org>
---
 tests/distributed/test_pipeline_parallel.py | 1 +
 tests/models/registry.py                    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index c49ed9802cde8..386877e0e0a2c 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -167,6 +167,7 @@ def iter_params(self, model_name: str):
     "mosaicml/mpt-7b": PPTestSettings.fast(),
     "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
     "allenai/OLMo-1B-hf": PPTestSettings.fast(),
+    "shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(),
     "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
     "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
     "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 3848367b6126c..14c7ce464e271 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -91,6 +91,7 @@ class _HfExamplesInfo:
     "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
     "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
     "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
+    "Olmo1124ForCausalLM": _HfExamplesInfo("shanearora/OLMo-7B-1124-hf"),
     "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
     "OPTForCausalLM": _HfExamplesInfo("facebook/opt-iml-max-1.3b"),
     "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",

From 1d3c611f2cb24256d8e42e3d099f05e610ce08d0 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 13:52:59 -0800
Subject: [PATCH 06/14] Update docs

Signed-off-by: Shane A <shanea@allenai.org>
---
 docs/source/models/supported_models.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index e902d393f2f70..3de60a0663916 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -234,6 +234,11 @@ Text Generation
     - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
     -
     - ✅︎
+  * - :code:`OLMo1124ForCausalLM`
+    - OLMo November 2024
+    - :code:`shanearora/OLMo-7B-1124-hf`, etc.
+    -
+    - ✅︎
   * - :code:`OLMoEForCausalLM`
     - OLMoE
     - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc.

From b8a47a87c66c341c498552649d3677dfee6d39b0 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 14:09:54 -0800
Subject: [PATCH 07/14] Update comments

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/olmo_1124.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/olmo_1124.py b/vllm/model_executor/models/olmo_1124.py
index 271fcfcb88af4..dfeb64866d2ac 100644
--- a/vllm/model_executor/models/olmo_1124.py
+++ b/vllm/model_executor/models/olmo_1124.py
@@ -1,5 +1,5 @@
 # Adapted from
-# https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo_1124/modeling_olmo_1124.py
 # Copyright 2024 The vLLM team.
 # Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
@@ -19,7 +19,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only OLMo2 model compatible with HuggingFace weights."""
+"""Inference-only OLMo November model compatible with HuggingFace weights."""
 
 from functools import partial
 from typing import Iterable, List, Optional, Tuple, Union

From 65d62c91f635f94d533c2536ed7adf83aa4a5887 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 14:11:34 -0800
Subject: [PATCH 08/14] Change module prefixes from Olmo to Olmo1124

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/olmo_1124.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/olmo_1124.py b/vllm/model_executor/models/olmo_1124.py
index dfeb64866d2ac..3319bb6e9a437 100644
--- a/vllm/model_executor/models/olmo_1124.py
+++ b/vllm/model_executor/models/olmo_1124.py
@@ -53,7 +53,7 @@
 from vllm.sequence import IntermediateTensors
 
 
-class OlmoAttention(nn.Module):
+class Olmo1124Attention(nn.Module):
     """
     This is the attention block where the output is computed as
     ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
@@ -162,7 +162,7 @@ def forward(
         return output
 
 
-class OlmoMLP(nn.Module):
+class Olmo1124MLP(nn.Module):
     """
     This is the MLP block where the output is computed as
     ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
@@ -207,7 +207,7 @@ def forward(
         return x
 
 
-class OlmoDecoderLayer(nn.Module):
+class Olmo1124DecoderLayer(nn.Module):
     """
     This is a typical transformer block where the output is
     computed as ``MLP(LN(x + Attention(LN(x))))``
@@ -219,11 +219,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         assert isinstance(config, Olmo1124Config)
         # Attention block.
-        self.self_attn = OlmoAttention(vllm_config=vllm_config,
+        self.self_attn = Olmo1124Attention(vllm_config=vllm_config,
                                        prefix=f"{prefix}.self_attn")
 
         # MLP block.
-        self.mlp = OlmoMLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
+        self.mlp = Olmo1124MLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
 
         # LayerNorm
         self.post_attention_layernorm = RMSNorm(config.hidden_size,
@@ -254,7 +254,7 @@ def forward(
         return hidden_states
 
 
-class OlmoModel(nn.Module):
+class Olmo1124Model(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -268,7 +268,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             self.config.num_hidden_layers,
-            lambda prefix: OlmoDecoderLayer(vllm_config=vllm_config,
+            lambda prefix: Olmo1124DecoderLayer(vllm_config=vllm_config,
                                             prefix=prefix),
             prefix=f"{prefix}.layers",
         )
@@ -332,7 +332,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         assert isinstance(config, Olmo1124Config)
         self.config = config
-        self.model = OlmoModel(vllm_config=vllm_config,
+        self.model = Olmo1124Model(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))
         if config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens

From b941a24746d04582ef4cfa73f9cb76f368aa6230 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Wed, 20 Nov 2024 14:12:33 -0800
Subject: [PATCH 09/14] Run formatter

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/olmo_1124.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/olmo_1124.py b/vllm/model_executor/models/olmo_1124.py
index 3319bb6e9a437..f1afb4a049d90 100644
--- a/vllm/model_executor/models/olmo_1124.py
+++ b/vllm/model_executor/models/olmo_1124.py
@@ -220,7 +220,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         assert isinstance(config, Olmo1124Config)
         # Attention block.
         self.self_attn = Olmo1124Attention(vllm_config=vllm_config,
-                                       prefix=f"{prefix}.self_attn")
+                                           prefix=f"{prefix}.self_attn")
 
         # MLP block.
         self.mlp = Olmo1124MLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
@@ -269,7 +269,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.start_layer, self.end_layer, self.layers = make_layers(
             self.config.num_hidden_layers,
             lambda prefix: Olmo1124DecoderLayer(vllm_config=vllm_config,
-                                            prefix=prefix),
+                                                prefix=prefix),
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(
@@ -333,7 +333,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         assert isinstance(config, Olmo1124Config)
         self.config = config
         self.model = Olmo1124Model(vllm_config=vllm_config,
-                               prefix=maybe_prefix(prefix, "model"))
+                                   prefix=maybe_prefix(prefix, "model"))
         if config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:

From 19e26b7dd139c954932e760a216df19567cbcde7 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Sat, 23 Nov 2024 23:05:13 -0800
Subject: [PATCH 10/14] Pass Olmo1124Attention prefix to Attention

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/olmo_1124.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/olmo_1124.py b/vllm/model_executor/models/olmo_1124.py
index f1afb4a049d90..e696ef44b2ab7 100644
--- a/vllm/model_executor/models/olmo_1124.py
+++ b/vllm/model_executor/models/olmo_1124.py
@@ -121,6 +121,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             num_kv_heads=self.num_kv_heads,
             cache_config=vllm_config.cache_config,
             quant_config=vllm_config.quant_config,
+            prefix=prefix,
         )
 
         # Attention output projection.

From b71788795b0cfa778ea59d84188d2f7700d0be48 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Mon, 25 Nov 2024 08:19:02 -0800
Subject: [PATCH 11/14] Rename Olmo1124 to Olmo2

Signed-off-by: Shane A <shanea@allenai.org>
---
 docs/source/models/supported_models.rst       |  2 +-
 tests/models/registry.py                      |  2 +-
 .../models/{olmo_1124.py => olmo2.py}         | 38 +++++++++----------
 vllm/model_executor/models/registry.py        |  2 +-
 4 files changed, 22 insertions(+), 22 deletions(-)
 rename vllm/model_executor/models/{olmo_1124.py => olmo2.py} (93%)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 3de60a0663916..6149c4901e40c 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -234,7 +234,7 @@ Text Generation
     - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
     -
     - ✅︎
-  * - :code:`OLMo1124ForCausalLM`
+  * - :code:`OLMo2ForCausalLM`
     - OLMo November 2024
     - :code:`shanearora/OLMo-7B-1124-hf`, etc.
     -
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 14c7ce464e271..88a62f1d8e2ef 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -91,7 +91,7 @@ class _HfExamplesInfo:
     "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
     "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
     "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
-    "Olmo1124ForCausalLM": _HfExamplesInfo("shanearora/OLMo-7B-1124-hf"),
+    "Olmo2ForCausalLM": _HfExamplesInfo("shanearora/OLMo-7B-1124-hf"),
     "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
     "OPTForCausalLM": _HfExamplesInfo("facebook/opt-iml-max-1.3b"),
     "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
diff --git a/vllm/model_executor/models/olmo_1124.py b/vllm/model_executor/models/olmo2.py
similarity index 93%
rename from vllm/model_executor/models/olmo_1124.py
rename to vllm/model_executor/models/olmo2.py
index e696ef44b2ab7..6dad01ea955ba 100644
--- a/vllm/model_executor/models/olmo_1124.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -1,5 +1,5 @@
 # Adapted from
-# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo_1124/modeling_olmo_1124.py
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py
 # Copyright 2024 The vLLM team.
 # Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
@@ -26,7 +26,7 @@
 
 import torch
 from torch import nn
-from transformers import Olmo1124Config
+from transformers import Olmo2Config
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import VllmConfig
@@ -53,7 +53,7 @@
 from vllm.sequence import IntermediateTensors
 
 
-class Olmo1124Attention(nn.Module):
+class Olmo2Attention(nn.Module):
     """
     This is the attention block where the output is computed as
     ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
@@ -63,7 +63,7 @@ class Olmo1124Attention(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
-        assert isinstance(self.config, Olmo1124Config)
+        assert isinstance(self.config, Olmo2Config)
 
         hidden_size = self.config.hidden_size
         self.tp_size = get_tensor_model_parallel_world_size()
@@ -163,7 +163,7 @@ def forward(
         return output
 
 
-class Olmo1124MLP(nn.Module):
+class Olmo2MLP(nn.Module):
     """
     This is the MLP block where the output is computed as
     ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
@@ -173,7 +173,7 @@ class Olmo1124MLP(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        assert isinstance(config, Olmo1124Config)
+        assert isinstance(config, Olmo2Config)
         hidden_size = config.hidden_size
         intermediate_size = config.intermediate_size
 
@@ -208,7 +208,7 @@ def forward(
         return x
 
 
-class Olmo1124DecoderLayer(nn.Module):
+class Olmo2DecoderLayer(nn.Module):
     """
     This is a typical transformer block where the output is
     computed as ``MLP(LN(x + Attention(LN(x))))``
@@ -218,13 +218,13 @@ class Olmo1124DecoderLayer(nn.Module):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        assert isinstance(config, Olmo1124Config)
+        assert isinstance(config, Olmo2Config)
         # Attention block.
-        self.self_attn = Olmo1124Attention(vllm_config=vllm_config,
-                                           prefix=f"{prefix}.self_attn")
+        self.self_attn = Olmo2Attention(vllm_config=vllm_config,
+                                        prefix=f"{prefix}.self_attn")
 
         # MLP block.
-        self.mlp = Olmo1124MLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
+        self.mlp = Olmo2MLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")
 
         # LayerNorm
         self.post_attention_layernorm = RMSNorm(config.hidden_size,
@@ -255,12 +255,12 @@ def forward(
         return hidden_states
 
 
-class Olmo1124Model(nn.Module):
+class Olmo2Model(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.config = vllm_config.model_config.hf_config
-        assert isinstance(self.config, Olmo1124Config)
+        assert isinstance(self.config, Olmo2Config)
 
         self.embed_tokens = VocabParallelEmbedding(
             self.config.vocab_size,
@@ -269,8 +269,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             self.config.num_hidden_layers,
-            lambda prefix: Olmo1124DecoderLayer(vllm_config=vllm_config,
-                                                prefix=prefix),
+            lambda prefix: Olmo2DecoderLayer(vllm_config=vllm_config,
+                                             prefix=prefix),
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(
@@ -323,7 +323,7 @@ def forward(
         return hidden_states
 
 
-class Olmo1124ForCausalLM(nn.Module, SupportsPP):
+class Olmo2ForCausalLM(nn.Module, SupportsPP):
     """
     Extremely barebones HF model wrapper.
     """
@@ -331,10 +331,10 @@ class Olmo1124ForCausalLM(nn.Module, SupportsPP):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        assert isinstance(config, Olmo1124Config)
+        assert isinstance(config, Olmo2Config)
         self.config = config
-        self.model = Olmo1124Model(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "model"))
+        self.model = Olmo2Model(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
         if config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 831dcebd8b7ae..653aa50aedb53 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -73,7 +73,7 @@
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
     "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
-    "Olmo1124ForCausalLM": ("olmo_1124", "Olmo1124ForCausalLM"),
+    "Olmo2ForCausalLM": ("olmo2", "Olmo2ForCausalLM"),
     "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),

From 17893b4a9b2ec2f2f0b18e15367a55e10e4faed2 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Mon, 25 Nov 2024 08:22:20 -0800
Subject: [PATCH 12/14] Update OLMo2 documentation

Signed-off-by: Shane A <shanea@allenai.org>
---
 docs/source/models/supported_models.rst | 4 ++--
 vllm/model_executor/models/olmo2.py     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 6149c4901e40c..578aecfe0426e 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -235,8 +235,8 @@ Text Generation
     -
     - ✅︎
   * - :code:`OLMo2ForCausalLM`
-    - OLMo November 2024
-    - :code:`shanearora/OLMo-7B-1124-hf`, etc.
+    - OLMo2
+    - :code:`allenai/OLMo2-7B-1124`, etc.
     -
     - ✅︎
   * - :code:`OLMoEForCausalLM`
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 6dad01ea955ba..0df0f8ab9452d 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -19,7 +19,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only OLMo November model compatible with HuggingFace weights."""
+"""Inference-only OLMo2 model compatible with HuggingFace weights."""
 
 from functools import partial
 from typing import Iterable, List, Optional, Tuple, Union

From b50a3bf6e5a72005b2401b0dbb07b097a490e1c8 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Mon, 25 Nov 2024 10:29:52 -0800
Subject: [PATCH 13/14] Add Olmo2Config implementation

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/model_executor/models/olmo2.py         |   2 +-
 vllm/transformers_utils/config.py           |   2 +
 vllm/transformers_utils/configs/__init__.py |   2 +
 vllm/transformers_utils/configs/olmo2.py    | 166 ++++++++++++++++++++
 4 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100644 vllm/transformers_utils/configs/olmo2.py

diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 0df0f8ab9452d..a35c911f90d96 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -26,7 +26,6 @@
 
 import torch
 from torch import nn
-from transformers import Olmo2Config
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import VllmConfig
@@ -51,6 +50,7 @@
     make_layers, maybe_prefix)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.olmo2 import Olmo2Config
 
 
 class Olmo2Attention(nn.Module):
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 59096753c395d..4e569eb90b754 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -27,6 +27,7 @@
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
                                              NemotronConfig, NVLM_D_Config,
+                                             Olmo2Config,
                                              RWConfig, SolarConfig,
                                              UltravoxConfig)
 # yapf: enable
@@ -60,6 +61,7 @@
     "internvl_chat": InternVLChatConfig,
     "nemotron": NemotronConfig,
     "NVLM_D": NVLM_D_Config,
+    "olmo2": Olmo2Config,
     "solar": SolarConfig,
     "ultravox": UltravoxConfig,
     **_CONFIG_REGISTRY_OVERRIDE_HF
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index d1e19c9a33c24..4c721001d8434 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -15,6 +15,7 @@
 from vllm.transformers_utils.configs.mpt import MPTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
+from vllm.transformers_utils.configs.olmo2 import Olmo2Config
 from vllm.transformers_utils.configs.solar import SolarConfig
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 
@@ -33,6 +34,7 @@
     "MLPSpeculatorConfig",
     "NemotronConfig",
     "NVLM_D_Config",
+    "Olmo2Config",
     "SolarConfig",
     "UltravoxConfig",
 ]
\ No newline at end of file
diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py
new file mode 100644
index 0000000000000..0e6d8e4879b06
--- /dev/null
+++ b/vllm/transformers_utils/configs/olmo2.py
@@ -0,0 +1,166 @@
+# yapf: disable
+# ruff: noqa: E501
+# coding=utf-8
+# Copied from
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/configuration_olmo2.py
+"""OLMo 2 configuration."""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class Olmo2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`Olmo2Model`]. It is used to instantiate an OLMo2
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50304):
+            Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Olmo2Model`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by mean-pooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 50279):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the input and output word embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+
+    ```python
+    >>> from transformers import Olmo2Model, Olmo2Config
+
+    >>> # Initializing an Olmo2 7B style configuration
+    >>> configuration = Olmo2Config()
+
+    >>> # Initializing a model from the Olmo2 7B style configuration
+    >>> model = Olmo2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "olmo2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50304,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=None,
+        eos_token_id=50279,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        rms_norm_eps=1e-5,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        self.rms_norm_eps = rms_norm_eps
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")

From a39863521595412d912f7192084357fad7a539e9 Mon Sep 17 00:00:00 2001
From: Shane A <shanea@allenai.org>
Date: Mon, 25 Nov 2024 10:31:42 -0800
Subject: [PATCH 14/14] Run formatter

Signed-off-by: Shane A <shanea@allenai.org>
---
 vllm/transformers_utils/config.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 4e569eb90b754..894de73a7c226 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -27,9 +27,8 @@
                                              MedusaConfig, MllamaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
                                              NemotronConfig, NVLM_D_Config,
-                                             Olmo2Config,
-                                             RWConfig, SolarConfig,
-                                             UltravoxConfig)
+                                             Olmo2Config, RWConfig,
+                                             SolarConfig, UltravoxConfig)
 # yapf: enable
 from vllm.transformers_utils.utils import check_gguf_file