diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e4e8536b..b93d52fb9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Changed
+
+- Rename `Olmo` to `OLMo` everywhere in the codebase
+
+### Removed
+
+- Removed `AMDLayerNorm`, since the original layer norm bug has been fixed and we don't need this workaround anymore.
+
+
 ## [v0.2.5](https://github.com/allenai/OLMo/releases/tag/v0.2.5) - 2024-03-06
 
 ### Fixed
@@ -28,10 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Changed legacy checkpoint unsharding to use processes and shared memory instead of threads
 
-### Removed
-
-- Removed `AMDLayerNorm`, since the original layer norm bug has been fixed and we don't need this workaround anymore.
-
 
 ## [v0.2.4](https://github.com/allenai/OLMo/releases/tag/v0.2.4) - 2024-02-02
 
diff --git a/docs/NOTES.md b/docs/NOTES.md
index 6a8f3bfa7..c6611b33a 100644
--- a/docs/NOTES.md
+++ b/docs/NOTES.md
@@ -70,10 +70,10 @@ For example, checkpoints for the run [https://wandb.ai/ai2-llm/c4-small/runs/euo
 You can load a checkpoint like this:
 
 ```python
-from olmo import Olmo, Tokenizer
+from olmo import OLMo, Tokenizer
 
 checkpoint = "gs://ai2-olmo/ai2-llm/c4-small/euox4j8q/step73000-unsharded"
-model = Olmo.from_checkpoint(checkpoint, device="cuda")
+model = OLMo.from_checkpoint(checkpoint, device="cuda")
 tokenizer = Tokenizer.from_checkpoint(checkpoint)
 ```
 
diff --git a/hf_olmo/configuration_olmo.py b/hf_olmo/configuration_olmo.py
index 5b15fa194..cb7670f6c 100644
--- a/hf_olmo/configuration_olmo.py
+++ b/hf_olmo/configuration_olmo.py
@@ -21,8 +21,8 @@ def __init__(self, use_cache: bool = False, **kwargs):
         all_kwargs.update({"use_cache": use_cache})
         all_kwargs.update(
             {
-                "architectures": all_kwargs.get("architectures", ["OlmoModelForCausalLM"])
-                or ["OlmoModelForCausalLM"]
+                "architectures": all_kwargs.get("architectures", ["OLMoModelForCausalLM"])
+                or ["OLMoModelForCausalLM"]
             }
         )
         super().__init__(**all_kwargs)
diff --git a/hf_olmo/modeling_olmo.py b/hf_olmo/modeling_olmo.py
index 6a279cb10..a1cc569f7 100644
--- a/hf_olmo/modeling_olmo.py
+++ b/hf_olmo/modeling_olmo.py
@@ -7,7 +7,7 @@
 from transformers.models.auto import AutoModelForCausalLM
 
 from olmo.config import ModelConfig
-from olmo.model import Olmo
+from olmo.model import OLMo
 
 from .configuration_olmo import OLMoConfig
 
@@ -34,14 +34,14 @@ class OLMoForCausalLM(PreTrainedModel):
     base_model_prefix = "model"
     _no_split_modules = ["OLMoBlock"]
 
-    def __init__(self, config: OLMoConfig, model: Optional[Olmo] = None, init_params: bool = False):
+    def __init__(self, config: OLMoConfig, model: Optional[OLMo] = None, init_params: bool = False):
         super().__init__(config)
 
         if not model:
             model_config = create_model_config_from_pretrained_config(config)
             # Initialize model (always on CPU to start with so we don't run out of GPU memory).
             model_config.init_device = "cpu"
-            self.model = Olmo(model_config, init_params=init_params)
+            self.model = OLMo(model_config, init_params=init_params)
         else:
             self.model = model
 
diff --git a/hf_olmo/tokenization_olmo_fast.py b/hf_olmo/tokenization_olmo_fast.py
index e2bd665d1..19543a6c7 100644
--- a/hf_olmo/tokenization_olmo_fast.py
+++ b/hf_olmo/tokenization_olmo_fast.py
@@ -4,7 +4,7 @@
 
 
 class OLMoTokenizerFast(PreTrainedTokenizerFast):
-    # Note: Olmo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary.
+    # Note: OLMo's tokenizer is already a wrapper around huggingface. This is potentially unnecessary.
     pass
 
     # def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
diff --git a/inference/NOTES.md b/inference/NOTES.md
index 0af0f9d09..ea57a322a 100644
--- a/inference/NOTES.md
+++ b/inference/NOTES.md
@@ -45,12 +45,12 @@ To add an `olmo.py` module, we can basically just imitate what was done for othe
 There's one important wrinkle here: some OLMo models use *fused linear attention*. I'm not sure how GPTQ handles this or whether any existing supported models implement attention the same way. This might be something to discuss with Dirk and Pete.
 
 ```python
-Olmo(
+OLMo(
   (transformer): ModuleDict(
     (wte): Embedding(50304, 768)
     (emb_drop): Dropout(p=0.1, inplace=False)
     (blocks): ModuleList(
-      (0-11): 12 x OlmoSequentialBlock(
+      (0-11): 12 x OLMoSequentialBlock(
         (dropout): Dropout(p=0.1, inplace=False)
         (norm): LayerNorm()
         (act): SwiGLU()
diff --git a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py
index 46ce32e69..9beb2ff33 100644
--- a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py
+++ b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/auto.py
@@ -13,7 +13,7 @@
 from .internlm import InternLMGPTQForCausalLM
 from .llama import LlamaGPTQForCausalLM
 from .moss import MOSSGPTQForCausalLM
-from .olmo import OlmoGPTQForCausalLM
+from .olmo import OLMoGPTQForCausalLM
 from .opt import OPTGPTQForCausalLM
 from .qwen import QwenGPTQForCausalLM
 from .rw import RWGPTQForCausalLM
@@ -24,7 +24,7 @@
     "gptj": GPTJGPTQForCausalLM,
     "gpt2": GPT2GPTQForCausalLM,
     "llama": LlamaGPTQForCausalLM,
-    "olmo": OlmoGPTQForCausalLM,
+    "olmo": OLMoGPTQForCausalLM,
     "opt": OPTGPTQForCausalLM,
     "moss": MOSSGPTQForCausalLM,
     "gpt_bigcode": GPTBigCodeGPTQForCausalLM,
diff --git a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py
index 0bf18fc8c..01264bfdb 100644
--- a/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py
+++ b/inference/compression/dependencies/AutoGPTQ/auto_gptq/modeling/olmo.py
@@ -1,7 +1,7 @@
 from ._base import *
 
 
-class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
     # Attribute name of Transformer layer block.
     layers_block_name = "model.transformer.blocks"
 
@@ -19,4 +19,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
     inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]]
 
 
-__all__ = ["OlmoGPTQForCausalLM"]
+__all__ = ["OLMoGPTQForCausalLM"]
diff --git a/inference/compression/olmo_gptq_class.py b/inference/compression/olmo_gptq_class.py
index 645349d7b..0f6580a59 100644
--- a/inference/compression/olmo_gptq_class.py
+++ b/inference/compression/olmo_gptq_class.py
@@ -1,7 +1,7 @@
 from auto_gptq.modeling._base import BaseGPTQForCausalLM
 
 
-class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
     # Attribute name of Transformer layer block.
     layers_block_name = "model.transformer.blocks"
 
@@ -17,12 +17,12 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
     inside_layer_modules = [["att_proj"], ["attn_out"], ["ff_proj"], ["ff_out"]]
 
 
-__all__ = ["OlmoGPTQForCausalLM"]
+__all__ = ["OLMoGPTQForCausalLM"]
 
 
 # NOTE: In progress; may change if OLMo model is updated.
-# class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
+# class OLMoGPTQForCausalLM(BaseGPTQForCausalLM):
 #     # Attribute name of Transformer layer block.
 #     layers_block_name = "transformer.blocks"  # NOTE(wadden) Correct
 #
@@ -51,4 +51,4 @@ class OlmoGPTQForCausalLM(BaseGPTQForCausalLM):
 #     ]
 
 
-# __all__ = ["OlmoGPTQForCausalLM"]
+# __all__ = ["OLMoGPTQForCausalLM"]
diff --git a/olmo/config.py b/olmo/config.py
index d9e257f88..a68a10a99 100644
--- a/olmo/config.py
+++ b/olmo/config.py
@@ -23,7 +23,7 @@
 from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
 
 from .aliases import PathOrStr
-from .exceptions import OlmoConfigurationError
+from .exceptions import OLMoConfigurationError
 from .util import StrEnum
 
 __all__ = [
@@ -116,7 +116,7 @@ def new(cls: Type[C], **kwargs) -> C:
             conf = om.merge(conf, kwargs)
             return cast(C, om.to_object(conf))
         except OmegaConfBaseException as e:
-            raise OlmoConfigurationError(str(e))
+            raise OLMoConfigurationError(str(e))
 
     @classmethod
     def load(
@@ -139,7 +139,7 @@ def load(
                 conf = om.merge(conf, om.from_dotlist(overrides))
             return cast(C, om.to_object(conf))
         except OmegaConfBaseException as e:
-            raise OlmoConfigurationError(str(e))
+            raise OLMoConfigurationError(str(e))
 
     def save(self, path: PathOrStr) -> None:
         """Save to a YAML file."""
diff --git a/olmo/data/__init__.py b/olmo/data/__init__.py
index 52421b57a..7d8fbb56b 100644
--- a/olmo/data/__init__.py
+++ b/olmo/data/__init__.py
@@ -5,7 +5,7 @@
 
 from ..aliases import PathOrStr
 from ..config import DataConfig, TrainConfig
-from ..exceptions import OlmoConfigurationError
+from ..exceptions import OLMoConfigurationError
 from ..torch_util import barrier, get_global_rank, get_world_size
 from .collator import DataCollator
 from .iterable_dataset import IterableDataset
@@ -21,7 +21,7 @@ def build_memmap_dataset(
     metadata: List[Dict[str, Any]] = []
     if data_config.paths:
         if data_config.datasets:
-            raise OlmoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets")
+            raise OLMoConfigurationError("DataConfig.paths is mutually exclusive with DataConfig.datasets")
         paths = data_config.paths
         for path in paths:
             metadata.append({"path": str(path)})
@@ -32,7 +32,7 @@ def build_memmap_dataset(
             paths.extend(label_paths)
             metadata.extend([{"label": label}] * len(label_paths))
     else:
-        raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
+        raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
     return MemMapDataset(
         *paths,
         chunk_size=train_config.model.max_sequence_length,
@@ -87,7 +87,7 @@ def build_train_dataloader(train_config: TrainConfig) -> DataLoader:
     work_dir = Path(train_config.save_folder) / "train_data"
     if get_global_rank() == 0:
         if work_dir.is_dir() and not train_config.save_overwrite:
-            raise OlmoConfigurationError(
+            raise OLMoConfigurationError(
                 "train data working directory already exists, use --save_overwrite to overwrite"
             )
         else:
diff --git a/olmo/data/memmap_dataset.py b/olmo/data/memmap_dataset.py
index 5af73c277..c00f29e06 100644
--- a/olmo/data/memmap_dataset.py
+++ b/olmo/data/memmap_dataset.py
@@ -7,7 +7,7 @@
 import torch
 from torch.utils.data import Dataset
 
-from olmo.exceptions import OlmoEnvironmentError
+from olmo.exceptions import OLMoEnvironmentError
 
 from ..aliases import PathOrStr
 from ..util import _get_s3_client, file_size, get_bytes_range
@@ -93,7 +93,7 @@ def offsets(self) -> List[Tuple[int, int]]:
                 _get_s3_client("s3")
                 try:
                     _get_s3_client("r2")
-                except OlmoEnvironmentError:
+                except OLMoEnvironmentError:
                     # R2 might not be needed, so ignore this error. We will get an error
                     # later if R2 is needed.
                     pass
diff --git a/olmo/eval/__init__.py b/olmo/eval/__init__.py
index 748e8e46f..bc8313c78 100644
--- a/olmo/eval/__init__.py
+++ b/olmo/eval/__init__.py
@@ -5,7 +5,7 @@
 from torchmetrics import MeanMetric, Metric
 
 from ..config import EvaluatorConfig, EvaluatorType, TrainConfig
-from ..exceptions import OlmoConfigurationError
+from ..exceptions import OLMoConfigurationError
 from ..tokenizer import Tokenizer
 from ..torch_util import get_global_rank, get_world_size
 from .downstream import ICLMetric, label_to_task_map
@@ -93,7 +93,7 @@ def make_metric():
     elif eval_config.data.datasets:
         eval_metric = {label: make_metric() for label in eval_config.data.datasets.keys()}
     else:
-        raise OlmoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
+        raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
 
     return Evaluator(
         label=eval_config.label,
diff --git a/olmo/exceptions.py b/olmo/exceptions.py
index 754580c95..5474facc3 100644
--- a/olmo/exceptions.py
+++ b/olmo/exceptions.py
@@ -1,37 +1,37 @@
-__all__ = ["OlmoError", "OlmoConfigurationError", "OlmoCliError", "OlmoEnvironmentError", "OlmoNetworkError"]
+__all__ = ["OLMoError", "OLMoConfigurationError", "OLMoCliError", "OLMoEnvironmentError", "OLMoNetworkError"]
 
 
-class OlmoError(Exception):
+class OLMoError(Exception):
     """
     Base class for all custom OLMo exceptions.
     """
 
 
-class OlmoConfigurationError(OlmoError):
+class OLMoConfigurationError(OLMoError):
     """
     An error with a configuration file.
     """
 
 
-class OlmoCliError(OlmoError):
+class OLMoCliError(OLMoError):
     """
     An error from incorrect CLI usage.
     """
 
 
-class OlmoEnvironmentError(OlmoError):
+class OLMoEnvironmentError(OLMoError):
     """
     An error from incorrect environment variables.
     """
 
 
-class OlmoNetworkError(OlmoError):
+class OLMoNetworkError(OLMoError):
     """
     An error with a network request.
     """
 
 
-class OlmoThreadError(Exception):
+class OLMoThreadError(Exception):
     """
     Raised when a thread fails.
     """
diff --git a/olmo/model.py b/olmo/model.py
index f975c7c98..4235f7561 100644
--- a/olmo/model.py
+++ b/olmo/model.py
@@ -42,7 +42,7 @@
     LayerNormType,
     ModelConfig,
 )
-from .exceptions import OlmoConfigurationError
+from .exceptions import OLMoConfigurationError
 from .initialization import ModuleType, init_weights
 from .torch_util import ensure_finite_
 
@@ -62,12 +62,12 @@
     "GELU",
     "ReLU",
     "SwiGLU",
-    "OlmoBlock",
-    "OlmoSequentialBlock",
-    "OlmoParallelBlock",
-    "Olmo",
-    "OlmoOutput",
-    "OlmoGenerateOutput",
+    "OLMoBlock",
+    "OLMoSequentialBlock",
+    "OLMoParallelBlock",
+    "OLMo",
+    "OLMoOutput",
+    "OLMoGenerateOutput",
 ]
 
@@ -386,7 +386,7 @@ def alibi_attention_bias(seq_len: int, config: ModelConfig, device: torch.device
     return alibi_bias * (1.0 / (2 ** m.view(1, config.n_heads, 1, 1)))  # type: ignore
 
 
-class OlmoBlock(nn.Module):
+class OLMoBlock(nn.Module):
     """
     A base class for transformer block implementations.
""" @@ -589,18 +589,18 @@ def forward( raise NotImplementedError @classmethod - def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OlmoBlock: + def build(cls, layer_id: int, config: ModelConfig, cache: BufferCache) -> OLMoBlock: if config.block_type == BlockType.sequential: - return OlmoSequentialBlock(layer_id, config, cache) + return OLMoSequentialBlock(layer_id, config, cache) elif config.block_type == BlockType.parallel: - return OlmoParallelBlock(layer_id, config, cache) + return OLMoParallelBlock(layer_id, config, cache) elif config.block_type == BlockType.llama: - return OlmoLlamaBlock(layer_id, config, cache) + return OLMoLlamaBlock(layer_id, config, cache) else: raise NotImplementedError(f"Unknown block type: '{config.block_type}'") -class OlmoSequentialBlock(OlmoBlock): +class OLMoSequentialBlock(OLMoBlock): """ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). @@ -689,11 +689,11 @@ def forward( return x, cache -class OlmoParallelBlock(OlmoBlock): +class OLMoParallelBlock(OLMoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x)) + Attention(LN(x))`` as in the PaLM architecture, as opposed to the typical ``MLP(LN(x + Attention(LN(x))))`` - as in :class:`OlmoSequentialBlock` (ignoring some skip connections). + as in :class:`OLMoSequentialBlock` (ignoring some skip connections). The decoupling of the MLP and Attention functions allow us to fuse the separate input projections into a single linear layer to increase throughput. In this configuration it's also straight-forward @@ -781,10 +781,10 @@ def forward( ) -class OlmoLlamaBlock(OlmoBlock): +class OLMoLlamaBlock(OLMoBlock): """ This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` - (plus another skip connection). This block is similar to `OlmoSequentialBlock` + (plus another skip connection). This block is similar to `OLMoSequentialBlock` but some operations have slightly different implementations to imitate the behavior of Llama. """ @@ -904,7 +904,7 @@ def forward( return x, cache -class OlmoOutput(NamedTuple): +class OLMoOutput(NamedTuple): logits: torch.FloatTensor """ A tensor of shape `(batch_size, seq_len, vocab_size)` representing the log probabilities @@ -922,7 +922,7 @@ class OlmoOutput(NamedTuple): """ -class OlmoGenerateOutput(NamedTuple): +class OLMoGenerateOutput(NamedTuple): token_ids: torch.LongTensor """ The generated token IDs, a tensor of shape `(batch_size, beam_size, max_steps)`. @@ -935,7 +935,7 @@ class OlmoGenerateOutput(NamedTuple): """ -class OlmoBlockGroup(nn.ModuleList): +class OLMoBlockGroup(nn.ModuleList): def __init__(self, config: ModelConfig, layer_offset: int, modules: Optional[Iterable[nn.Module]] = None): super().__init__(modules) self.config = config @@ -991,7 +991,7 @@ def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointin block.set_activation_checkpointing(strategy) -class Olmo(nn.Module): +class OLMo(nn.Module): def __init__(self, config: ModelConfig, init_params: bool = True): super().__init__() self.config = config @@ -999,14 +999,14 @@ def __init__(self, config: ModelConfig, init_params: bool = True): # Validate config. 
         if self.config.alibi and self.config.flash_attention:
-            raise OlmoConfigurationError("ALiBi is currently not supported with FlashAttention")
+            raise OLMoConfigurationError("ALiBi is currently not supported with FlashAttention")
 
         if self.config.alibi and self.config.rope:
-            raise OlmoConfigurationError("ALiBi and RoPE are mutually exclusive")
+            raise OLMoConfigurationError("ALiBi and RoPE are mutually exclusive")
 
         if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
             if self.config.embedding_size < self.config.vocab_size:
-                raise OlmoConfigurationError("embedding size should be at least as big as vocab size")
+                raise OLMoConfigurationError("embedding size should be at least as big as vocab size")
             elif self.config.embedding_size % 128 != 0:
                 import warnings
 
@@ -1021,7 +1021,7 @@ def __init__(self, config: ModelConfig, init_params: bool = True):
             0 < self.config.block_group_size <= self.config.n_layers
             and self.config.n_layers % self.config.block_group_size == 0
         ):
-            raise OlmoConfigurationError("n layers must be divisible by block group size")
+            raise OLMoConfigurationError("n layers must be divisible by block group size")
 
         torch.backends.cuda.enable_flash_sdp(self.config.flash_attention)
         torch.backends.cuda.enable_mem_efficient_sdp(False)  # this is super slow so make sure torch won't use it
@@ -1036,10 +1036,10 @@ def __init__(self, config: ModelConfig, init_params: bool = True):
             )
         )
 
-        blocks = [OlmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
+        blocks = [OLMoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
         if self.config.block_group_size > 1:
             block_groups = [
-                OlmoBlockGroup(config, i, blocks[i : i + config.block_group_size])
+                OLMoBlockGroup(config, i, blocks[i : i + config.block_group_size])
                 for i in range(0, config.n_layers, config.block_group_size)
             ]
             self.transformer.update({"block_groups": nn.ModuleList(block_groups)})
@@ -1138,7 +1138,7 @@ def forward(
         use_cache: bool = False,
         last_logits_only: bool = False,
         output_hidden_states: Optional[bool] = None,
-    ) -> OlmoOutput:
+    ) -> OLMoOutput:
         """
         :param input_ids: A tensor of shape `(batch_size, seq_len)`.
         :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
@@ -1316,7 +1316,7 @@ def forward(
         if self.config.scale_logits:
             logits.mul_(1 / math.sqrt(self.config.d_model))
 
-        return OlmoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]
+        return OLMoOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]
 
     def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None):
         if wrap_strategy is None:
@@ -1336,7 +1336,7 @@ def get_fsdp_wrap_policy(self, wrap_strategy: Optional[FSDPWrapStrategy] = None)
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, OlmoBlock)
+                wrap = isinstance(module, OLMoBlock)
                 if recurse:
                     return True
                 else:
@@ -1347,7 +1347,7 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, (OlmoBlock,)) or module in size_based_module_to_wrap
+                wrap = isinstance(module, (OLMoBlock,)) or module in size_based_module_to_wrap
                 if recurse:
                     return True
                 else:
@@ -1356,13 +1356,13 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
             return fsdp_wrap_fn
         elif wrap_strategy == FSDPWrapStrategy.by_block_group:
             if self.config.block_group_size <= 1:
-                raise OlmoConfigurationError(
+                raise OLMoConfigurationError(
                     "'by_block_group' FSDP wrapping strategy requires block group size greater than 1"
                 )
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, OlmoBlockGroup)
+                wrap = isinstance(module, OLMoBlockGroup)
                 if recurse:
                     return True
                 else:
@@ -1371,13 +1371,13 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
             return fsdp_wrap_fn
         elif wrap_strategy == FSDPWrapStrategy.by_block_group_and_size:
             if self.config.block_group_size <= 1:
-                raise OlmoConfigurationError(
+                raise OLMoConfigurationError(
                     "'by_block_group_and_size' FSDP wrapping strategy requires block group size greater than 1"
                 )
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, (OlmoBlockGroup,)) or module in size_based_module_to_wrap
+                wrap = isinstance(module, (OLMoBlockGroup,)) or module in size_based_module_to_wrap
                 if recurse:
                     return True
                 else:
@@ -1403,7 +1403,7 @@ def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
 
             def fsdp_wrap_fn(module, recurse: bool = True, nonwrapped_numel: int = 0):
                 del nonwrapped_numel
-                wrap = isinstance(module, OlmoBlock) and module.layer_id % c == 0
+                wrap = isinstance(module, OLMoBlock) and module.layer_id % c == 0
                 if recurse:
                     return True
                 else:
@@ -1454,7 +1454,7 @@ def generate(
         min_steps: Optional[int] = None,
        final_sequence_scorer: Optional[FinalSequenceScorer] = None,
        constraints: Optional[List[Constraint]] = None,
-    ) -> OlmoGenerateOutput:
+    ) -> OLMoGenerateOutput:
         """
         Generate token IDs using beam search.
 
@@ -1564,7 +1564,7 @@ def step(
         with torch.no_grad():
             token_ids, scores = beam_search.search(initial_preds, state, step)
 
-        return OlmoGenerateOutput(
+        return OLMoGenerateOutput(
             token_ids=token_ids,  # type: ignore[arg-type]
             scores=scores,  # type: ignore[arg-type]
         )
@@ -1572,7 +1572,7 @@ def step(
     @classmethod
     def from_checkpoint(
         cls, checkpoint_dir: PathOrStr, device: str = "cpu", checkpoint_type: Optional[CheckpointType] = None
-    ) -> Olmo:
+    ) -> OLMo:
         """
         Load an OLMo model from a checkpoint.
         """
@@ -1595,7 +1595,7 @@ def from_checkpoint(
         if checkpoint_type == CheckpointType.unsharded:
             # Initialize model (always on CPU to start with so we don't run out of GPU memory).
             model_config.init_device = "cpu"
-            model = Olmo(model_config)
+            model = OLMo(model_config)
 
             # Load state dict directly to target device.
             state_dict_path = resource_path(checkpoint_dir, "model.pt")
@@ -1608,7 +1608,7 @@ def from_checkpoint(
             # Initialize model on target device. In this case the state dict is loaded in-place
             # so it's not necessary to start on CPU if the target device is a GPU.
             model_config.init_device = device
-            model = Olmo(model_config)
+            model = OLMo(model_config)
 
             # Load state dict in place.
             load_model_state(checkpoint_dir, model)
diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py
index a833d3c21..3ed064097 100644
--- a/olmo/tokenizer.py
+++ b/olmo/tokenizer.py
@@ -8,7 +8,7 @@
 
 from .aliases import PathOrStr
 from .config import ModelConfig, TokenizerConfig, TrainConfig, TruncationDirection
-from .exceptions import OlmoConfigurationError
+from .exceptions import OLMoConfigurationError
 
 __all__ = ["Tokenizer"]
 
@@ -68,7 +68,7 @@ def from_train_config(cls, config: TrainConfig) -> Tokenizer:
                 pad_token_id=config.model.pad_token_id,
             )
         if config.model.vocab_size != tokenizer.vocab_size:
-            raise OlmoConfigurationError("vocab size mismatch between config and tokenizer")
+            raise OLMoConfigurationError("vocab size mismatch between config and tokenizer")
         return tokenizer
 
     @classmethod
@@ -117,7 +117,7 @@ def from_checkpoint(cls, checkpoint_dir: PathOrStr) -> Tokenizer:
             pad_token_id=model_config.pad_token_id,
         )
         if model_config.vocab_size != tokenizer.vocab_size:
-            raise OlmoConfigurationError("vocab size mismatch between config and tokenizer")
+            raise OLMoConfigurationError("vocab size mismatch between config and tokenizer")
         return tokenizer
 
     def add_special_tokens(self, input_ids: List[int]) -> List[int]:
diff --git a/olmo/train.py b/olmo/train.py
index 79132f0fc..43d4ee5fc 100644
--- a/olmo/train.py
+++ b/olmo/train.py
@@ -33,8 +33,8 @@
 )
 from .data import IterableDataset
 from .eval import Evaluator
-from .exceptions import OlmoConfigurationError
-from .model import Olmo
+from .exceptions import OLMoConfigurationError
+from .model import OLMo
 from .optim import Optimizer, Scheduler
 from .torch_util import (
     barrier,
@@ -96,7 +96,7 @@ def check(self) -> Dict[str, float]:
 @dataclass
 class Trainer:
     cfg: TrainConfig
-    model: Olmo
+    model: OLMo
     fsdp_model: FSDP
     optim: Optimizer
     scheduler: Scheduler
@@ -351,7 +351,7 @@ def _save_checkpoint(
                     upload_to=remote_checkpoint_dir,
                 )
             except FileExistsError:
-                raise OlmoConfigurationError(
+                raise OLMoConfigurationError(
                     f"Checkpoint for step {self.global_step} already exists, use --save-overwrite to overwrite it"
                 )
 
diff --git a/olmo/util.py b/olmo/util.py
index c0519e9a1..c13cc8086 100644
--- a/olmo/util.py
+++ b/olmo/util.py
@@ -25,11 +25,11 @@
 
 from .aliases import PathOrStr
 from .exceptions import (
-    OlmoCliError,
-    OlmoEnvironmentError,
-    OlmoError,
-    OlmoNetworkError,
-    OlmoThreadError,
+    OLMoCliError,
+    OLMoEnvironmentError,
+    OLMoError,
+    OLMoNetworkError,
+    OLMoThreadError,
 )
 from .torch_util import get_global_rank, get_local_rank, get_node_rank, is_distributed
 
@@ -150,9 +150,9 @@ def excepthook(exctype, value, traceback):
     """
     if issubclass(exctype, KeyboardInterrupt):
         sys.__excepthook__(exctype, value, traceback)
-    elif issubclass(exctype, OlmoCliError):
+    elif issubclass(exctype, OLMoCliError):
         rich.get_console().print(f"[yellow]{value}[/]", highlight=False)
-    elif issubclass(exctype, OlmoError):
+    elif issubclass(exctype, OLMoError):
         rich.get_console().print(Text(f"{exctype.__name__}:", style="red"), value, highlight=False)
     else:
         log.critical("Uncaught %s: %s", exctype.__name__, value, exc_info=(exctype, value, traceback))
@@ -450,7 +450,7 @@ def _get_s3_profile_name(scheme: str) -> Optional[str]:
     if scheme == "r2":
         profile_name = os.environ.get("R2_PROFILE")
         if profile_name is None:
-            raise OlmoEnvironmentError(
+            raise OLMoEnvironmentError(
                 "R2 profile name is not set. Did you forget to set the 'R2_PROFILE' env var?"
             )
 
@@ -465,7 +465,7 @@ def _get_s3_endpoint_url(scheme: str) -> Optional[str]:
     if scheme == "r2":
         r2_endpoint_url = os.environ.get("R2_ENDPOINT_URL")
         if r2_endpoint_url is None:
-            raise OlmoEnvironmentError(
+            raise OLMoEnvironmentError(
                 "R2 endpoint url is not set. Did you forget to set the 'R2_ENDPOINT_URL' env var?"
             )
 
@@ -511,12 +511,12 @@ def _s3_upload(
             _wait_before_retry(attempt)
 
     if err is not None:
-        raise OlmoNetworkError("Failed to check object existence during s3 upload") from err
+        raise OLMoNetworkError("Failed to check object existence during s3 upload") from err
 
     try:
         _get_s3_client(scheme).upload_file(source, bucket_name, key)
     except boto_exceptions.ClientError as e:
-        raise OlmoNetworkError("Failed to upload to s3") from e
+        raise OLMoNetworkError("Failed to upload to s3") from e
 
 
 def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3) -> int:
@@ -533,7 +533,7 @@ def _s3_file_size(scheme: str, bucket_name: str, key: str, max_attempts: int = 3
             log.warning("%s failed attempt %d with retriable error: %s", _s3_file_size.__name__, attempt, err)
             _wait_before_retry(attempt)
 
-    raise OlmoNetworkError("Failed to get s3 file size") from err
+    raise OLMoNetworkError("Failed to get s3 file size") from err
 
 
 def _s3_get_bytes_range(
@@ -572,7 +572,7 @@ def _s3_get_bytes_range(
         # This can cause an irrelevant exception (e.g. KeyError: 'error'), resulting
         # in us losing the true exception info. To avoid this, we change the exception
         # to a type that has a single-parameter constructor.
-        raise OlmoNetworkError("Failed to get bytes range from s3") from err
+        raise OLMoNetworkError("Failed to get bytes range from s3") from err
 
 
 def _s3_find_latest_checkpoint(scheme: str, bucket_name: str, prefix: str) -> Optional[str]:
@@ -626,7 +626,7 @@ def fill_queue():
 
     for x in iter(q.get, sentinel):
         if isinstance(x, Exception):
-            raise OlmoThreadError(f"generator thread {thread_name} failed") from x
+            raise OLMoThreadError(f"generator thread {thread_name} failed") from x
         else:
             yield x
 
diff --git a/olmo/version.py b/olmo/version.py
index 3f9d92c5b..e75c8373e 100644
--- a/olmo/version.py
+++ b/olmo/version.py
@@ -1,8 +1,8 @@
 _MAJOR = "0"
-_MINOR = "2"
+_MINOR = "3"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "5"
+_PATCH = "0"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" diff --git a/scripts/init_config.py b/scripts/init_config.py index 22143d401..22c223d7b 100644 --- a/scripts/init_config.py +++ b/scripts/init_config.py @@ -7,7 +7,7 @@ from typing import List from olmo import TrainConfig -from olmo.exceptions import OlmoCliError +from olmo.exceptions import OLMoCliError from olmo.util import clean_opt, prepare_cli_environment log = logging.getLogger(__name__) @@ -27,6 +27,6 @@ def main(save_path: Path, args_list: List[str]) -> None: try: save_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise OlmoCliError(f"Usage: {sys.argv[0]} [SAVE_PATH] [OPTIONS]") + raise OLMoCliError(f"Usage: {sys.argv[0]} [SAVE_PATH] [OPTIONS]") main(Path(save_path), [clean_opt(s) for s in args_list]) diff --git a/scripts/inspect_train_data.py b/scripts/inspect_train_data.py index 871702b8c..bed8b432f 100644 --- a/scripts/inspect_train_data.py +++ b/scripts/inspect_train_data.py @@ -9,7 +9,7 @@ from olmo.config import TrainConfig from olmo.data import build_memmap_dataset -from olmo.exceptions import OlmoCliError +from olmo.exceptions import OLMoCliError from olmo.tokenizer import Tokenizer from olmo.util import clean_opt, prepare_cli_environment @@ -51,6 +51,6 @@ def main(save_folder: Path, *steps: int, rank: Optional[int] = None): try: save_folder, rank, steps = sys.argv[1], int(sys.argv[2]), [int(i) for i in sys.argv[3:]] except (IndexError, ValueError): - raise OlmoCliError(f"Usage: {sys.argv[0]} [SAVE_FOLDER] [RANK] [STEP_NUMBER...]") + raise OLMoCliError(f"Usage: {sys.argv[0]} [SAVE_FOLDER] [RANK] [STEP_NUMBER...]") main(Path(save_folder), *steps, rank=rank if rank >= 0 else None) diff --git a/scripts/show_model_size.py b/scripts/show_model_size.py index 3740137dc..cf2ca1e22 100644 --- a/scripts/show_model_size.py +++ b/scripts/show_model_size.py @@ -10,8 +10,8 @@ import logging import sys -from olmo import Olmo, TrainConfig -from olmo.exceptions import OlmoCliError +from olmo import OLMo, TrainConfig +from olmo.exceptions import OLMoCliError from olmo.util import clean_opt, prepare_cli_environment log = logging.getLogger(__name__) @@ -23,7 +23,7 @@ def main(cfg: TrainConfig) -> None: n_layers = cfg.model.n_layers cfg.model.n_layers = 1 - single_layer_model = Olmo(cfg.model) + single_layer_model = OLMo(cfg.model) block = single_layer_model.transformer.blocks[0] # type: ignore params_per_block = sum(p.numel() for p in block.parameters()) # type: ignore @@ -42,7 +42,7 @@ def main(cfg: TrainConfig) -> None: try: yaml_path, args_list = sys.argv[1], sys.argv[2:] except IndexError: - raise OlmoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") + raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]") cfg = TrainConfig.load( yaml_path, diff --git a/scripts/train.py b/scripts/train.py index fca309a7f..c59bc51f3 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -16,8 +16,8 @@ from olmo.config import CheckpointType, TrainConfig from olmo.data import build_train_dataloader from olmo.eval import build_evaluators -from olmo.exceptions import OlmoCliError, OlmoConfigurationError -from olmo.model import Olmo +from olmo.exceptions import OLMoCliError, OLMoConfigurationError +from olmo.model import OLMo from olmo.optim import BoltOnWarmupScheduler, build_optimizer, build_scheduler from olmo.torch_util import ( barrier, @@ -37,7 +37,7 @@ def main(cfg: TrainConfig) -> None: # Ensure run name set. 
     if cfg.run_name is None:
-        raise OlmoConfigurationError("--run_name is required")
+        raise OLMoConfigurationError("--run_name is required")
     log_extra_field("run_name", cfg.run_name)
 
     # Sanity check
@@ -77,7 +77,7 @@ def main(cfg: TrainConfig) -> None:
     # Save config.
     save_path = Path(cfg.save_folder) / "config.yaml"
     if save_path.is_file() and not cfg.save_overwrite:
-        raise OlmoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite")
+        raise OLMoConfigurationError(f"{save_path} already exists, use --save_overwrite to overwrite")
     else:
         log.info(f"Saving config to {save_path}")
         save_path.parent.mkdir(exist_ok=True, parents=True)
@@ -114,7 +114,7 @@ def main(cfg: TrainConfig) -> None:
 
     # Initialize the model.
     log.info("Building model...")
-    olmo_model = Olmo(cfg.model)
+    olmo_model = OLMo(cfg.model)
     log.info(f"Total number of parameters: {olmo_model.num_params():,d}")
     log.info(f"Number of non-embedding parameters: {olmo_model.num_params(include_embedding=False):,d}")
     log.info(f"Peak GPU Memory (MB) before FSDP: {int(peak_gpu_memory() or 0)}")
@@ -159,7 +159,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
     if cfg.save_data_indices:
         indices_file_path = Path(cfg.save_folder) / f"data-indices/rank{get_global_rank()}.tsv.gz"
         if indices_file_path.exists() and not cfg.save_overwrite:
-            raise OlmoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite")
+            raise OLMoConfigurationError(f"{indices_file_path} already exists, use --save_overwrite to overwrite")
         indices_file_path.parent.mkdir(exist_ok=True, parents=True)
         indices_file = gzip.open(indices_file_path, "wt")
 
@@ -250,7 +250,7 @@ def dummy_init_fn(module: torch.nn.Module) -> None:
     try:
         yaml_path, args_list = sys.argv[1], sys.argv[2:]
     except IndexError:
-        raise OlmoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]")
+        raise OLMoCliError(f"Usage: {sys.argv[0]} [CONFIG_PATH] [OPTIONS]")
 
     cfg = TrainConfig.load(yaml_path, [clean_opt(s) for s in args_list])
     main(cfg)
diff --git a/test_fixtures/test-olmo-model/config.json b/test_fixtures/test-olmo-model/config.json
index 71e7b981e..352a4c976 100644
--- a/test_fixtures/test-olmo-model/config.json
+++ b/test_fixtures/test-olmo-model/config.json
@@ -3,7 +3,7 @@
   "alibi": false,
   "alibi_bias_max": 8.0,
   "architectures": [
-    "OlmoModelForCausalLM"
+    "OLMoModelForCausalLM"
   ],
   "attention_dropout": 0.1,
   "attention_layer_norm": false,
diff --git a/tests/hf_olmo/hf_olmo_test.py b/tests/hf_olmo/hf_olmo_test.py
index 0b323c4e8..6f70c0090 100644
--- a/tests/hf_olmo/hf_olmo_test.py
+++ b/tests/hf_olmo/hf_olmo_test.py
@@ -3,7 +3,7 @@
 
 from olmo import BlockType, Tokenizer, TrainConfig
 from olmo.data import DataCollator
-from olmo.model import Olmo
+from olmo.model import OLMo
 from olmo.torch_util import seed_all
 
 
@@ -188,7 +188,7 @@ def test_forward(
     use_amp = dtype in {torch.float16, torch.bfloat16}
 
     seed_all(1234)
-    model = Olmo(train_config.model).eval()
+    model = OLMo(train_config.model).eval()
 
     hf_config = OLMoConfig(**model.config.asdict())
 
diff --git a/tests/hf_olmo/modeling_olmo_test.py b/tests/hf_olmo/modeling_olmo_test.py
index fda1bd715..e4bb02f54 100644
--- a/tests/hf_olmo/modeling_olmo_test.py
+++ b/tests/hf_olmo/modeling_olmo_test.py
@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from olmo.model import Olmo
+from olmo.model import OLMo
 
 
 def test_olmo_model(model_path: str):
@@ -11,7 +11,7 @@ def test_olmo_model(model_path: str):
 
     from hf_olmo import OLMoForCausalLM, OLMoTokenizerFast  # noqa: F401
 
-    model = Olmo.from_checkpoint(model_path)
+    model = OLMo.from_checkpoint(model_path)
     hf_model = AutoModelForCausalLM.from_pretrained(model_path)
 
     tokenizer = AutoTokenizer.from_pretrained(model_path)
diff --git a/tests/model_test.py b/tests/model_test.py
index 79f2b1a26..1833223ce 100644
--- a/tests/model_test.py
+++ b/tests/model_test.py
@@ -3,7 +3,7 @@
 import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
 
-from olmo import BlockType, LayerNorm, Olmo, Tokenizer, TrainConfig
+from olmo import BlockType, LayerNorm, OLMo, Tokenizer, TrainConfig
 from olmo.config import ModelConfig, PaddingDirection
 from olmo.data import DataCollator
 
@@ -173,7 +173,7 @@ def test_forward(
 
     use_amp = dtype in {torch.float16, torch.bfloat16}
 
-    model = Olmo(train_config.model).eval()
+    model = OLMo(train_config.model).eval()
 
     input1 = tokenizer.encode("My name is OLMo!")
     input2 = tokenizer.encode("I'm a delightful large open language model :)")
@@ -293,7 +293,7 @@ def test_backward(
     else:
         train_config.model.init_device = "cpu"
 
-    model = Olmo(train_config.model).train()
+    model = OLMo(train_config.model).train()
 
     with torch.autocast(
         device_type="cuda" if cuda else "cpu", enabled=use_amp, dtype=None if not use_amp else dtype
@@ -364,7 +364,7 @@ def test_generate(
         train_config.model.init_device = "cpu"
     use_amp = dtype in {torch.float16, torch.bfloat16}
 
-    model = Olmo(train_config.model).eval()
+    model = OLMo(train_config.model).eval()
 
     input1 = tokenizer.encode("My name is OLMo! ", add_special_tokens=False)
     input2 = tokenizer.encode("I'm a delightful large open language model :) ", add_special_tokens=False)
@@ -426,8 +426,8 @@ def test_layer_norm(train_config: TrainConfig, elementwise_affine: bool, include
 
 
 def test_block_groups():
-    model_with_block_groups = Olmo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=3)).eval()
-    model_without_block_groups = Olmo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=1)).eval()
+    model_with_block_groups = OLMo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=3)).eval()
+    model_without_block_groups = OLMo(ModelConfig(d_model=128, n_heads=2, n_layers=9, block_group_size=1)).eval()
 
     # We should be able to load the state dict from one model into the other, and vice-versa.
     state_dict_to_load, og_keys_to_new_keys = model_with_block_groups._make_state_dict_compatible(