From 0953db14a7f88332d41fee926aff7bc348933d7c Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Sun, 25 Aug 2024 15:52:23 -0700 Subject: [PATCH 01/11] Clean-up unused files and fix test rot --- .gitignore | 1 + model/__init__.py | 0 model/data.py | 102 ------ model/model_tracker.py | 81 ----- model/model_updater.py | 105 ------ model/storage/__init__.py | 0 .../chain/chain_model_metadata_store.py | 167 ---------- model/storage/disk/__init__.py | 0 model/storage/disk/disk_model_store.py | 134 -------- model/storage/disk/utils.py | 117 ------- model/storage/hugging_face/__init__.py | 0 .../hugging_face/hugging_face_model_store.py | 91 ----- .../test_hugging_face_model_store.py | 125 ------- model/storage/local_model_store.py | 31 -- model/storage/model_metadata_store.py | 17 - model/storage/remote_model_store.py | 19 -- model/utils.py | 22 -- neurons/validator.py | 44 ++- tests/model/storage/disk/__init__.py | 0 .../storage/disk/test_disk_model_store.py | 293 ---------------- tests/model/storage/disk/test_utils.py | 272 --------------- .../storage/fake_model_metadata_store.py | 5 +- .../model/storage/fake_remote_model_store.py | 39 ++- tests/model/test_data.py | 33 -- tests/model/test_model_tracker.py | 173 ---------- tests/model/test_model_updater.py | 314 ------------------ tests/model/test_model_utils.py | 58 ---- tests/pretrain/test_dataset.py | 27 +- tests/pretrain/test_mining.py | 22 +- tests/utilities/__init__.py | 0 tests/utilities/test_miner_iterator.py | 73 ---- tests/utilities/test_perf_monitor.py | 44 --- tests/utilities/test_utils.py | 237 ------------- utilities/__init__.py | 0 utilities/miner_iterator.py | 61 ---- utilities/perf_monitor.py | 67 ---- utilities/utils.py | 204 ------------ 37 files changed, 81 insertions(+), 2897 deletions(-) delete mode 100644 model/__init__.py delete mode 100644 model/data.py delete mode 100644 model/model_tracker.py delete mode 100644 model/model_updater.py delete mode 100644 model/storage/__init__.py delete mode 100644 model/storage/chain/chain_model_metadata_store.py delete mode 100644 model/storage/disk/__init__.py delete mode 100644 model/storage/disk/disk_model_store.py delete mode 100644 model/storage/disk/utils.py delete mode 100644 model/storage/hugging_face/__init__.py delete mode 100644 model/storage/hugging_face/hugging_face_model_store.py delete mode 100644 model/storage/hugging_face/test_hugging_face_model_store.py delete mode 100644 model/storage/local_model_store.py delete mode 100644 model/storage/model_metadata_store.py delete mode 100644 model/storage/remote_model_store.py delete mode 100644 model/utils.py delete mode 100644 tests/model/storage/disk/__init__.py delete mode 100644 tests/model/storage/disk/test_disk_model_store.py delete mode 100644 tests/model/storage/disk/test_utils.py delete mode 100644 tests/model/test_data.py delete mode 100644 tests/model/test_model_tracker.py delete mode 100644 tests/model/test_model_updater.py delete mode 100644 tests/model/test_model_utils.py delete mode 100644 tests/utilities/__init__.py delete mode 100644 tests/utilities/test_miner_iterator.py delete mode 100644 tests/utilities/test_perf_monitor.py delete mode 100644 tests/utilities/test_utils.py delete mode 100644 utilities/__init__.py delete mode 100644 utilities/miner_iterator.py delete mode 100644 utilities/perf_monitor.py delete mode 100644 utilities/utils.py diff --git a/.gitignore b/.gitignore index de117ca..0847a9f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ .vscode/ test-models/ +model-store/ # Exclude the Miner's directory for saving the models. local-models/ diff --git a/model/__init__.py b/model/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/model/data.py b/model/data.py deleted file mode 100644 index 4d87f2d..0000000 --- a/model/data.py +++ /dev/null @@ -1,102 +0,0 @@ -import dataclasses -from enum import IntEnum -from typing import Any, ClassVar, Dict, Optional, Type -from transformers import PreTrainedModel -from pydantic import BaseModel, Field, PositiveInt - -# The maximum bytes for metadata on the chain. -MAX_METADATA_BYTES = 128 -# The length, in bytes, of a git commit hash. -GIT_COMMIT_LENGTH = 40 -# The length, in bytes, of a base64 encoded sha256 hash. -SHA256_BASE_64_LENGTH = 44 - - -class ModelId(BaseModel): - """Uniquely identifies a trained model""" - - # Makes the object "Immutable" once created. - class Config: - frozen = True - extra = "forbid" - - MAX_REPO_ID_LENGTH: ClassVar[int] = ( - MAX_METADATA_BYTES - GIT_COMMIT_LENGTH - SHA256_BASE_64_LENGTH - 3 # separators - ) - - namespace: str = Field( - description="Namespace where the model can be found. ex. Hugging Face username/org." - ) - name: str = Field(description="Name of the model.") - - # When handling a model locally the commit and hash are not necessary. - # Commit must be filled when trying to download from a remote store. - commit: Optional[str] = Field( - description="Commit of the model. May be empty if not yet committed." - ) - # Hash is filled automatically when uploading to or downloading from a remote store. - hash: Optional[str] = Field(description="Hash of the trained model.") - - def to_compressed_str(self) -> str: - """Returns a compressed string representation.""" - return f"{self.namespace}:{self.name}:{self.commit}:{self.hash}" - - @classmethod - def from_compressed_str(cls, cs: str) -> Type["ModelId"]: - """Returns an instance of this class from a compressed string representation""" - tokens = cs.split(":") - return cls( - namespace=tokens[0], - name=tokens[1], - commit=tokens[2] if tokens[2] != "None" else None, - hash=tokens[3] if tokens[3] != "None" else None, - ) - - -class Model(BaseModel): - """Represents a pre trained foundation model.""" - - class Config: - arbitrary_types_allowed = True - - id: ModelId = Field(description="Identifier for this model.") - # PreTrainedModel.base_model returns torch.nn.Module if needed. - pt_model: PreTrainedModel = Field(description="Pre trained model.") - - -class ModelMetadata(BaseModel): - id: ModelId = Field(description="Identifier for this trained model.") - block: PositiveInt = Field( - description="Block on which this model was claimed on the chain." - ) - - -class TokenizerIdentifier(IntEnum): - """Identifiers the tokenizer to use. This may mean different tokenizers or different implementations.""" - - DISTILGPT_2 = 1 - GPT_4_TIKTOKEN = 2 - - -@dataclasses.dataclass() -class ModelCriteria: - """Collection of criteria that are relevant for interacting with models in the subnet.""" - - # Sequence length used for inference. - sequence_length: int - - # Whether bfloat16 and flash attention optimizations should be used. - optimized: bool - - # Maximum model size in bytes. - max_model_bytes: int - - # Maximum model parameter size. - max_model_parameters: int - - # Allowed model types. - allowed_model_types: dict[type] - - # Tokenizer to use. - tokenizer_identifier: TokenizerIdentifier - diff --git a/model/model_tracker.py b/model/model_tracker.py deleted file mode 100644 index 642310f..0000000 --- a/model/model_tracker.py +++ /dev/null @@ -1,81 +0,0 @@ -import copy -import threading -from typing import Dict, List, Optional, Set -import pickle -import bittensor as bt - -from model.data import ModelMetadata - - -class ModelTracker: - """Tracks the current model for each miner. - - Thread safe. - """ - - def __init__( - self, - ): - # Create a dict from miner hotkey to model metadata. - self.miner_hotkey_to_model_metadata_dict = dict() - - # Make this class thread safe because it will be accessed by multiple threads. - # One for the downloading new models loop and one for the validating models loop. - self.lock = threading.RLock() - - def save_state(self, filepath): - """Save the current state to the provided filepath.""" - - # Open a writable binary file for pickle. - with self.lock: - with open(filepath, "wb") as f: - pickle.dump(self.miner_hotkey_to_model_metadata_dict, f) - - def load_state(self, filepath): - """Load the state from the provided filepath.""" - - # Open a readable binary file for pickle. - with open(filepath, "rb") as f: - self.miner_hotkey_to_model_metadata_dict = pickle.load(f) - - def get_miner_hotkey_to_model_metadata_dict(self) -> Dict[str, ModelMetadata]: - """Returns the mapping from miner hotkey to model metadata.""" - - # Return a copy to ensure outside code can't modify the scores. - with self.lock: - return copy.deepcopy(self.miner_hotkey_to_model_metadata_dict) - - def get_model_metadata_for_miner_hotkey( - self, hotkey: str - ) -> Optional[ModelMetadata]: - """Returns the model metadata for a given hotkey if any.""" - - with self.lock: - if hotkey in self.miner_hotkey_to_model_metadata_dict: - return self.miner_hotkey_to_model_metadata_dict[hotkey] - return None - - def on_hotkeys_updated(self, incoming_hotkeys: Set[str]): - """Notifies the tracker which hotkeys are currently being tracked on the metagraph.""" - - with self.lock: - existing_hotkeys = set(self.miner_hotkey_to_model_metadata_dict.keys()) - for hotkey in existing_hotkeys - incoming_hotkeys: - del self.miner_hotkey_to_model_metadata_dict[hotkey] - bt.logging.trace(f"Removed outdated hotkey: {hotkey} from ModelTracker") - - def on_miner_model_updated( - self, - hotkey: str, - model_metadata: ModelMetadata, - ) -> None: - """Notifies the tracker that a miner has had their associated model updated. - - Args: - hotkey (str): The miner's hotkey. - model_metadata (ModelMetadata): The latest model metadata of the miner. - """ - with self.lock: - self.miner_hotkey_to_model_metadata_dict[hotkey] = model_metadata - - bt.logging.trace(f"Updated Miner {hotkey}. ModelMetadata={model_metadata}.") diff --git a/model/model_updater.py b/model/model_updater.py deleted file mode 100644 index 196b83b..0000000 --- a/model/model_updater.py +++ /dev/null @@ -1,105 +0,0 @@ -import bittensor as bt -from typing import Optional -import constants -from model import utils -from model.data import ModelMetadata -from model.model_tracker import ModelTracker -from model.storage.local_model_store import LocalModelStore -from model.storage.model_metadata_store import ModelMetadataStore -from model.storage.remote_model_store import RemoteModelStore - - -class ModelUpdater: - """Checks if the currently tracked model for a hotkey matches what the miner committed to the chain.""" - - def __init__( - self, - metadata_store: ModelMetadataStore, - remote_store: RemoteModelStore, - local_store: LocalModelStore, - model_tracker: ModelTracker, - ): - self.metadata_store = metadata_store - self.remote_store = remote_store - self.local_store = local_store - self.model_tracker = model_tracker - - async def _get_metadata(self, hotkey: str) -> Optional[ModelMetadata]: - """Get metadata about a model by hotkey""" - return await self.metadata_store.retrieve_model_metadata(hotkey) - - async def sync_model(self, hotkey: str, force: bool = False) -> bool: - """Updates local model for a hotkey if out of sync and returns if it was updated. - - Args: - hotkey (str): The hotkey of the model to sync. - force (bool): Whether to force a sync for this model, even if it's chain metadata hasn't changed. - """ - # Get the metadata for the miner. - metadata = await self._get_metadata(hotkey) - - if not metadata: - bt.logging.trace( - f"No valid metadata found on the chain for hotkey {hotkey}" - ) - return False - - # Check what model id the model tracker currently has for this hotkey. - tracker_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey( - hotkey - ) - - # If we are not forcing a sync due to retrying a top model we can short-circuit if no change. - if not force and metadata == tracker_model_metadata: - return False - - # Get the local path based on the local store to download to (top level hotkey path) - path = self.local_store.get_path(hotkey) - - # Otherwise we need to download the new model based on the metadata. - try: - # Max size according to the block. - model_size_limit = utils.get_model_criteria(metadata.block).max_model_bytes - model = await self.remote_store.download_model( - metadata.id, path, model_size_limit - ) - except Exception as e: - bt.logging.trace( - f"Failed to download model for hotkey {hotkey} due to {e}." - ) - return False - - # Check that the hash of the downloaded content matches. - if model.id.hash != metadata.id.hash: - # If the hash does not match directly, also try it with the hotkey of the miner. - # This is allowed to help miners prevent same-block copiers. - hash_with_hotkey = utils.get_hash_of_two_strings(model.id.hash, hotkey) - if hash_with_hotkey != metadata.id.hash: - bt.logging.trace( - f"Sync for hotkey {hotkey} failed. Hash of content downloaded from hugging face {model.id.hash} " - + f"or the hash including the hotkey {hash_with_hotkey} do not match chain metadata {metadata}." - ) - return False - - # Check that the parameter count of the model is within allowed bounds. - parameter_size = sum(p.numel() for p in model.pt_model.parameters()) - parameter_limit = utils.get_model_criteria(metadata.block).max_model_parameters - if parameter_size > parameter_limit: - bt.logging.trace( - f"Sync for hotkey {hotkey} failed. Parameter size of the model {parameter_size} exceeded max size {parameter_limit} at block {metadata.block}." - ) - return False - - allowed_model_types = utils.get_model_criteria( - metadata.block - ).allowed_model_types - if type(model.pt_model) not in allowed_model_types: - bt.logging.trace( - f"Sync for hotkey {hotkey} failed. Model type {type(model.pt_model)} is not allowed at block {metadata.block}." - ) - return False - - # Update the tracker - self.model_tracker.on_miner_model_updated(hotkey, metadata) - - return True diff --git a/model/storage/__init__.py b/model/storage/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/model/storage/chain/chain_model_metadata_store.py b/model/storage/chain/chain_model_metadata_store.py deleted file mode 100644 index 19d8447..0000000 --- a/model/storage/chain/chain_model_metadata_store.py +++ /dev/null @@ -1,167 +0,0 @@ -import asyncio -import functools -import bittensor as bt -import os -from model.data import ModelId, ModelMetadata -import constants -from model.storage.model_metadata_store import ModelMetadataStore -from typing import Optional - -from utilities import utils - - -class ChainModelMetadataStore(ModelMetadataStore): - """Chain based implementation for storing and retrieving metadata about a model.""" - - def __init__( - self, - subtensor: bt.subtensor, - wallet: Optional[bt.wallet] = None, - subnet_uid: int = constants.SUBNET_UID, - ): - self.subtensor = subtensor - self.wallet = ( - wallet # Wallet is only needed to write to the chain, not to read. - ) - self.subnet_uid = subnet_uid - - async def store_model_metadata(self, hotkey: str, model_id: ModelId): - """Stores model metadata on this subnet for a specific wallet.""" - if self.wallet is None: - raise ValueError("No wallet available to write to the chain.") - - # Wrap calls to the subtensor in a subprocess with a timeout to handle potential hangs. - partial = functools.partial( - self.subtensor.commit, - self.wallet, - self.subnet_uid, - model_id.to_compressed_str(), - ) - utils.run_in_subprocess(partial, 60) - - async def retrieve_model_metadata(self, hotkey: str) -> Optional[ModelMetadata]: - """Retrieves model metadata on this subnet for specific hotkey""" - - # Wrap calls to the subtensor in a subprocess with a timeout to handle potential hangs. - partial = functools.partial( - bt.extrinsics.serving.get_metadata, self.subtensor, self.subnet_uid, hotkey - ) - - metadata = utils.run_in_subprocess(partial, 180) - - if not metadata: - return None - - commitment = metadata["info"]["fields"][0] - hex_data = commitment[list(commitment.keys())[0]][2:] - - chain_str = bytes.fromhex(hex_data).decode() - - model_id = None - - try: - model_id = ModelId.from_compressed_str(chain_str) - except: - # If the metadata format is not correct on the chain then we return None. - bt.logging.trace( - f"Failed to parse the metadata on the chain for hotkey {hotkey}." - ) - return None - - model_metadata = ModelMetadata(id=model_id, block=metadata["block"]) - - return model_metadata - - -# Can only commit data every ~20 minutes. -async def test_store_model_metadata(): - """Verifies that the ChainModelMetadataStore can store data on the chain.""" - model_id = ModelId( - namespace="TestPath", name="TestModel", hash="TestHash1", commit="1.0" - ) - - # Use a different subnet that does not leverage chain storage to avoid conflicts. - # TODO switch to a mocked version when it supports commits. - subtensor = bt.subtensor() - - # Uses .env configured wallet/hotkey/uid for the test. - coldkey = os.getenv("TEST_COLDKEY") - hotkey = os.getenv("TEST_HOTKEY") - net_uid = int(os.getenv("TEST_SUBNET_UID")) - - wallet = bt.wallet(name=coldkey, hotkey=hotkey) - - metadata_store = ChainModelMetadataStore( - subtensor=subtensor, wallet=wallet, subnet_uid=net_uid - ) - - # Store the metadata on chain. - await metadata_store.store_model_metadata(hotkey=hotkey, model_id=model_id) - - print(f"Finished storing {model_id} on the chain.") - - -async def test_retrieve_model_metadata(): - """Verifies that the ChainModelMetadataStore can retrieve data from the chain.""" - expected_model_id = ModelId( - namespace="TestPath", name="TestModel", hash="TestHash1", commit="1.0" - ) - - # Use a different subnet that does not leverage chain storage to avoid conflicts. - # TODO switch to a mocked version when it supports commits. - subtensor = bt.subtensor() - - # Uses .env configured hotkey/uid for the test. - net_uid = int(os.getenv("TEST_SUBNET_UID")) - hotkey_address = os.getenv("TEST_HOTKEY_ADDRESS") - - # Do not require a wallet for retrieving data. - metadata_store = ChainModelMetadataStore( - subtensor=subtensor, wallet=None, subnet_uid=net_uid - ) - - # Retrieve the metadata from the chain. - model_metadata = await metadata_store.retrieve_model_metadata(hotkey_address) - - print(f"Expecting matching model id: {expected_model_id == model_metadata.id}") - - -# Can only commit data every ~20 minutes. -async def test_roundtrip_model_metadata(): - """Verifies that the ChainModelMetadataStore can roundtrip data on the chain.""" - model_id = ModelId( - namespace="TestPath", name="TestModel", hash="TestHash1", commit="1.0" - ) - - # Use a different subnet that does not leverage chain storage to avoid conflicts. - # TODO switch to a mocked version when it supports commits. - subtensor = bt.subtensor() - - # Uses .env configured wallet/hotkey/uid for the test. - coldkey = os.getenv("TEST_COLDKEY") - hotkey = os.getenv("TEST_HOTKEY") - net_uid = int(os.getenv("TEST_SUBNET_UID")) - - wallet = bt.wallet(name=coldkey, hotkey=hotkey) - - metadata_store = ChainModelMetadataStore( - subtensor=subtensor, wallet=wallet, subnet_uid=net_uid - ) - - # Store the metadata on chain. - await metadata_store.store_model_metadata(hotkey=hotkey, model_id=model_id) - - # May need to use the underlying publish_metadata function with wait_for_inclusion: True to pass here. - # Otherwise it defaults to False and we only wait for finalization not necessarily inclusion. - - # Retrieve the metadata from the chain. - model_metadata = await metadata_store.retrieve_model_metadata(hotkey) - - print(f"Expecting matching metadata: {model_id == model_metadata.id}") - - -if __name__ == "__main__": - # Can only commit data every ~20 minutes. - # asyncio.run(test_roundtrip_model_metadata()) - # asyncio.run(test_store_model_metadata()) - asyncio.run(test_retrieve_model_metadata()) diff --git a/model/storage/disk/__init__.py b/model/storage/disk/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/model/storage/disk/disk_model_store.py b/model/storage/disk/disk_model_store.py deleted file mode 100644 index be86d70..0000000 --- a/model/storage/disk/disk_model_store.py +++ /dev/null @@ -1,134 +0,0 @@ -import trace -import traceback -import bittensor as bt -from typing import Dict - -import torch -from model.data import Model, ModelId -from model.storage.disk import utils -from model.storage.local_model_store import LocalModelStore -from transformers import AutoModelForCausalLM -from pathlib import Path - - -class DiskModelStore(LocalModelStore): - """Local storage based implementation for storing and retrieving a model on disk.""" - - def __init__(self, base_dir: str): - self.base_dir = base_dir - - def get_path(self, hotkey: str) -> str: - """Returns the path to where this store would locate this hotkey.""" - return utils.get_local_miner_dir(self.base_dir, hotkey) - - def store_model(self, hotkey: str, model: Model) -> ModelId: - """Stores a trained model locally.""" - - # Note that the revision argument here does not affect the directory path like with hugging face downloads. - model.pt_model.save_pretrained( - save_directory=utils.get_local_model_snapshot_dir( - self.base_dir, hotkey, model.id - ), - revision=model.id.commit, - safe_serialization=True, - ) - - # Return the same model id used as we do not edit the commit information. - return model.id - - def retrieve_model( - self, hotkey: str, model_id: ModelId, optimized: bool = False - ) -> Model: - """Retrieves a trained model locally. If optimized use bfloat16 and flash attention.""" - - model = None - if optimized: - model = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path=utils.get_local_model_snapshot_dir( - self.base_dir, hotkey, model_id - ), - revision=model_id.commit, - local_files_only=True, - use_safetensors=True, - torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2", - ) - else: - model = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path=utils.get_local_model_snapshot_dir( - self.base_dir, hotkey, model_id - ), - revision=model_id.commit, - local_files_only=True, - use_safetensors=True, - ) - - return Model(id=model_id, pt_model=model) - - def delete_unreferenced_models( - self, valid_models_by_hotkey: Dict[str, ModelId], grace_period_seconds: int - ): - """Check across all of local storage and delete unreferenced models out of grace period.""" - # Expected directory structure is as follows. - # self.base_dir/models/hotkey/models--namespace--name/snapshots/commit/config.json + other files. - - # Create a set of valid model paths up to where we expect to see the actual files. - valid_model_paths = set() - for hotkey, model_id in valid_models_by_hotkey.items(): - valid_model_paths.add( - utils.get_local_model_snapshot_dir(self.base_dir, hotkey, model_id) - ) - - # For each hotkey path on disk using listdir to go one level deep. - miners_dir = Path(utils.get_local_miners_dir(self.base_dir)) - hotkey_subfolder_names = [d.name for d in miners_dir.iterdir() if d.is_dir()] - - for hotkey in hotkey_subfolder_names: - try: - # Reconstruct the path from the hotkey - hotkey_path = utils.get_local_miner_dir(self.base_dir, hotkey) - - # If it is not in valid_hotkeys and out of grace period remove it. - if hotkey not in valid_models_by_hotkey: - deleted_hotkey = utils.remove_dir_out_of_grace( - hotkey_path, grace_period_seconds - ) - if deleted_hotkey: - bt.logging.trace( - f"Removed directory for unreferenced hotkey: {hotkey}." - ) - - else: - # Check all the models--namespace--name subfolder paths. - hotkey_dir = Path(hotkey_path) - model_subfolder_paths = [ - str(d) for d in hotkey_dir.iterdir() if d.is_dir() - ] - - # Check all the snapshots subfolder paths - for model_path in model_subfolder_paths: - model_dir = Path(model_path) - snapshot_subfolder_paths = [ - str(d) for d in model_dir.iterdir() if d.is_dir() - ] - - # Check all the commit paths. - for snapshot_path in snapshot_subfolder_paths: - snapshot_dir = Path(snapshot_path) - commit_subfolder_paths = [ - str(d) for d in snapshot_dir.iterdir() if d.is_dir() - ] - - # Reached the end. Check all the actual commit subfolders for the files. - for commit_path in commit_subfolder_paths: - if commit_path not in valid_model_paths: - deleted_model = utils.remove_dir_out_of_grace( - commit_path, grace_period_seconds - ) - if deleted_model: - bt.logging.trace( - f"Removing directory for unreferenced model at: {commit_path}." - ) - except Exception: - # Catch the exception so we continue with the rest of the cleanup. - bt.logging.warning(traceback.format_exc()) diff --git a/model/storage/disk/utils.py b/model/storage/disk/utils.py deleted file mode 100644 index ce62955..0000000 --- a/model/storage/disk/utils.py +++ /dev/null @@ -1,117 +0,0 @@ -import base64 -import datetime -import hashlib -import os -import shutil -import sys -from model.data import ModelId - - -def get_local_miners_dir(base_dir: str) -> str: - return os.path.join(base_dir, "models") - - -def get_local_miner_dir(base_dir: str, hotkey: str) -> str: - return os.path.join(get_local_miners_dir(base_dir), hotkey) - - -# Hugging face stores models under models--namespace--name/snapshots/commit when downloading. -def get_local_model_dir(base_dir: str, hotkey: str, model_id: ModelId) -> str: - return os.path.join( - get_local_miner_dir(base_dir, hotkey), - "models" + "--" + model_id.namespace + "--" + model_id.name, - ) - - -def get_local_model_snapshot_dir(base_dir: str, hotkey: str, model_id: ModelId) -> str: - return os.path.join( - get_local_model_dir(base_dir, hotkey, model_id), - "snapshots", - model_id.commit, - ) - - -def get_hf_download_path(local_path: str, model_id: ModelId) -> str: - return os.path.join( - local_path, - "models" + "--" + model_id.namespace + "--" + model_id.name, - "snapshots", - model_id.commit, - ) - - -def get_newest_datetime_under_path(path: str) -> datetime.datetime: - newest_filetime = 0 - - # Check to see if any file at any level was modified more recently than the current one. - for cur_path, dirnames, filenames in os.walk(path): - for filename in filenames: - try: - path = os.path.join(cur_path, filename) - mod_time = os.stat(path).st_mtime - if mod_time > newest_filetime: - newest_filetime = mod_time - except FileNotFoundError: - pass - - if newest_filetime == 0: - return datetime.datetime.min - - return datetime.datetime.fromtimestamp(newest_filetime) - - -def remove_dir_out_of_grace(path: str, grace_period_seconds: int) -> bool: - """Removes a dir if the last modified time is out of grace period secs. Returns if it was deleted.""" - last_modified = get_newest_datetime_under_path(path) - grace = datetime.timedelta(seconds=grace_period_seconds) - - if last_modified < datetime.datetime.now() - grace: - shutil.rmtree(path=path, ignore_errors=True) - return True - - return False - - -def realize_symlinks_in_directory(path: str) -> int: - """Realizes all symlinks in the given directory, moving the linked file to the location. Returns count removed.""" - realized_symlinks = 0 - - for cur_path, dirnames, filenames in os.walk(path): - for filename in filenames: - path = os.path.abspath(os.path.join(cur_path, filename)) - # Get path resolving symlinks if encountered - real_path = os.path.realpath(path) - # If different then move - if path != real_path: - realized_symlinks += 1 - shutil.move(real_path, path) - - return realized_symlinks - - -def get_hash_of_file(path: str) -> str: - blocksize = 64 * 1024 - file_hash = hashlib.sha256() - with open(path, "rb") as fp: - while True: - data = fp.read(blocksize) - if not data: - break - file_hash.update(data) - return base64.b64encode(file_hash.digest()).decode("utf-8") - - -def get_hash_of_directory(path: str) -> str: - dir_hash = hashlib.sha256() - - # Recursively walk everything under the directory for files. - for cur_path, dirnames, filenames in os.walk(path): - # Ensure we walk future directories in a consistent order. - dirnames.sort() - # Ensure we walk files in a consistent order. - for filename in sorted(filenames): - path = os.path.join(cur_path, filename) - file_hash = get_hash_of_file(path) - dir_hash.update(file_hash.encode()) - - return base64.b64encode(dir_hash.digest()).decode("utf-8") diff --git a/model/storage/hugging_face/__init__.py b/model/storage/hugging_face/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/model/storage/hugging_face/hugging_face_model_store.py b/model/storage/hugging_face/hugging_face_model_store.py deleted file mode 100644 index 1fd45d4..0000000 --- a/model/storage/hugging_face/hugging_face_model_store.py +++ /dev/null @@ -1,91 +0,0 @@ -import sys -import tempfile -import os -from huggingface_hub import HfApi -from model.data import Model, ModelId -from model.storage.disk import utils -from transformers import AutoModelForCausalLM - -from model.storage.remote_model_store import RemoteModelStore -import constants - - -class HuggingFaceModelStore(RemoteModelStore): - """Hugging Face based implementation for storing and retrieving a model.""" - - @classmethod - def assert_access_token_exists(cls) -> str: - """Asserts that the access token exists.""" - if not os.getenv("HF_ACCESS_TOKEN"): - raise ValueError("No Hugging Face access token found to write to the hub.") - return os.getenv("HF_ACCESS_TOKEN") - - async def upload_model(self, model: Model) -> ModelId: - """Uploads a trained model to Hugging Face.""" - token = HuggingFaceModelStore.assert_access_token_exists() - - # PreTrainedModel.save_pretrained only saves locally - commit_info = model.pt_model.push_to_hub( - repo_id=model.id.namespace + "/" + model.id.name, - token=token, - safe_serialization=True, - ) - - model_id_with_commit = ModelId( - namespace=model.id.namespace, - name=model.id.name, - hash=model.id.hash, - commit=commit_info.oid, - ) - - # TODO consider skipping the redownload if a hash is already provided. - # To get the hash we need to redownload it at a local tmp directory after which it can be deleted. - with tempfile.TemporaryDirectory() as temp_dir: - model_with_hash = await self.download_model(model_id_with_commit, temp_dir) - # Return a ModelId with both the correct commit and hash. - return model_with_hash.id - - async def download_model( - self, model_id: ModelId, local_path: str, model_size_limit: int = sys.maxsize - ) -> Model: - """Retrieves a trained model from Hugging Face.""" - if not model_id.commit: - raise ValueError("No Hugging Face commit id found to read from the hub.") - - repo_id = model_id.namespace + "/" + model_id.name - - # Check ModelInfo for the size of model.safetensors file before downloading. - api = HfApi() - model_info = api.model_info( - repo_id=repo_id, revision=model_id.commit, timeout=10, files_metadata=True - ) - size = sum(repo_file.size for repo_file in model_info.siblings) - if size > model_size_limit: - raise ValueError( - f"Hugging Face repo over maximum size limit. Size {size}. Limit {model_size_limit}." - ) - - # Transformers library can pick up a model based on the hugging face path (username/model) + rev. - model = AutoModelForCausalLM.from_pretrained( - pretrained_model_name_or_path=repo_id, - revision=model_id.commit, - cache_dir=local_path, - use_safetensors=True, - ) - - # Get the directory the model was stored to. - model_dir = utils.get_hf_download_path(local_path, model_id) - - # Realize all symlinks in that directory since Transformers library does not support avoiding symlinks. - utils.realize_symlinks_in_directory(model_dir) - - # Compute the hash of the downloaded model. - model_hash = utils.get_hash_of_directory(model_dir) - model_id_with_hash = ModelId( - namespace=model_id.namespace, - name=model_id.name, - commit=model_id.commit, - hash=model_hash, - ) - - return Model(id=model_id_with_hash, pt_model=model) diff --git a/model/storage/hugging_face/test_hugging_face_model_store.py b/model/storage/hugging_face/test_hugging_face_model_store.py deleted file mode 100644 index d530f68..0000000 --- a/model/storage/hugging_face/test_hugging_face_model_store.py +++ /dev/null @@ -1,125 +0,0 @@ -import asyncio -import os -from model.data import Model, ModelId -from model.storage.disk import utils -from model.storage.hugging_face.hugging_face_model_store import HuggingFaceModelStore - -from pretrain.model import get_model - - -async def test_roundtrip_model(): - """Verifies that the HuggingFaceModelStore can roundtrip a model in hugging face.""" - hf_name = os.getenv("HF_NAME") - model_id = ModelId( - namespace=hf_name, - name="TestModel", - ) - - pt_model = get_model() - - model = Model(id=model_id, pt_model=pt_model) - hf_model_store = HuggingFaceModelStore() - - # Store the model in hf getting back the id with commit and hash. - model.id = await hf_model_store.upload_model(model) - - # Retrieve the model from hf. - retrieved_model = await hf_model_store.download_model( - model_id=model.id, - local_path=utils.get_local_miner_dir("test-models", "hotkey0"), - ) - - # Check that they match. - print( - f"Finished the roundtrip and checking that the models match: {str(model.state_dict()) == str(retrieved_model.state_dict())}" - ) - - -async def test_retrieve_model(): - """Verifies that the HuggingFaceModelStore can retrieve a model.""" - model_id = ModelId( - namespace="pszemraj", - name="distilgpt2-HC3", - hash="TestHash1", - commit="6f9ad473a3793d0271df34a55882ad30846a6788", - ) - - hf_model_store = HuggingFaceModelStore() - - # Retrieve the model from hf (first run) or cache. - model = await hf_model_store.download_model( - model_id=model_id, - local_path=utils.get_local_miner_dir("test-models", "hotkey0"), - ) - - print(f"Finished retrieving the model with id: {model.id}") - - -async def test_retrieve_oversized_model(): - """Verifies that the HuggingFaceModelStore can raise an exception if the model is too big.""" - model_id = ModelId( - namespace="microsoft", - name="phi-2", - hash="TestHash1", - commit="d318676", - ) - - hf_model_store = HuggingFaceModelStore() - - try: - model = await hf_model_store.download_model( - model_id=model_id, - local_path=utils.get_local_miner_dir("test-models", "hotkey0"), - model_size_limit=5 * 1024 * 1024 * 1024, - ) - except ValueError as ve: - print(f"Caught expected exception for downloading too large of a model: {ve}") - - -async def test_retrieve_multiple_models_for_hotkey(): - """Verifies that the HuggingFaceModelStore can handle multiple models for the same hotkey.""" - model_id_1 = ModelId( - namespace="pszemraj", - name="distilgpt2-HC3", - hash="TestHash1", - commit="6f9ad473a3793d0271df34a55882ad30846a6788", - ) - - model_id_2 = ModelId( - namespace="FredZhang7", - name="distilgpt2-stable-diffusion-v2", - hash="TestHash1", - commit="f839bc9217d4bc3694e4c5285934b5e671012f85", - ) - - hf_model_store = HuggingFaceModelStore() - - # Retrieve the model from hf (first run) or cache. - model_1 = await hf_model_store.download_model( - model_id=model_id_1, - local_path=utils.get_local_miner_dir("test-models", "hotkey0"), - ) - - expected_hash_1 = "3+voQJtkt7UCBvrLILeTz0oUE6iusGnXrCPZ3Mv664o=" - print( - f"Check that model 1 hash matches the expected hash: {model_1.id.hash == expected_hash_1}" - ) - - model_2 = await hf_model_store.download_model( - model_id=model_id_2, - local_path=utils.get_local_miner_dir("test-models", "hotkey0"), - ) - expected_hash_2 = "ZgTmR9X6YlD+ADOvbojE0JXEmAiTN/ok+QlukGXF61E=" - - print( - f"Check that model 2 hash matches the expected hash: {model_2.id.hash == expected_hash_2}" - ) - - -if __name__ == "__main__": - asyncio.run(test_retrieve_model()) - # Test redownloading a model from cache. - asyncio.run(test_retrieve_model()) - asyncio.run(test_roundtrip_model()) - asyncio.run(test_retrieve_oversized_model()) - asyncio.run(test_retrieve_multiple_models_for_hotkey()) diff --git a/model/storage/local_model_store.py b/model/storage/local_model_store.py deleted file mode 100644 index b495603..0000000 --- a/model/storage/local_model_store.py +++ /dev/null @@ -1,31 +0,0 @@ -import abc -from typing import Dict -from model.data import Model, ModelId - - -class LocalModelStore(abc.ABC): - """An abstract base class for storing and retrieving a pre trained model locally.""" - - @abc.abstractmethod - def store_model(self, hotkey: str, model: Model) -> ModelId: - """Stores a trained model in the appropriate location based on implementation.""" - pass - - @abc.abstractmethod - def get_path(self, hotkey: str) -> str: - """Returns the path to the appropriate location based on implementation.""" - pass - - @abc.abstractmethod - def retrieve_model( - self, hotkey: str, model_id: ModelId, optimized: bool = False - ) -> Model: - """Retrieves a trained model from the appropriate location based on implementation.""" - pass - - @abc.abstractmethod - def delete_unreferenced_models( - self, valid_models_by_hotkey: Dict[str, ModelId], grace_period_seconds: int - ): - """Check across all of local storage and delete unreferenced models out of grace period.""" - pass diff --git a/model/storage/model_metadata_store.py b/model/storage/model_metadata_store.py deleted file mode 100644 index e795f46..0000000 --- a/model/storage/model_metadata_store.py +++ /dev/null @@ -1,17 +0,0 @@ -import abc -from typing import Optional -from model.data import ModelId, ModelMetadata - - -class ModelMetadataStore(abc.ABC): - """An abstract base class for storing and retrieving model metadata.""" - - @abc.abstractmethod - async def store_model_metadata(self, hotkey: str, model_id: ModelId): - """Stores model metadata on this subnet for a specific miner.""" - pass - - @abc.abstractmethod - async def retrieve_model_metadata(self, hotkey: str) -> Optional[ModelMetadata]: - """Retrieves model metadata + block information on this subnet for specific miner, if present""" - pass diff --git a/model/storage/remote_model_store.py b/model/storage/remote_model_store.py deleted file mode 100644 index e479dc7..0000000 --- a/model/storage/remote_model_store.py +++ /dev/null @@ -1,19 +0,0 @@ -import abc -import sys -from model.data import Model, ModelId - - -class RemoteModelStore(abc.ABC): - """An abstract base class for storing and retrieving a pre trained model.""" - - @abc.abstractmethod - async def upload_model(self, model: Model) -> ModelId: - """Uploads a trained model in the appropriate location based on implementation.""" - pass - - @abc.abstractmethod - async def download_model( - self, model_id: ModelId, local_path: str, model_size_limit: int = sys.maxsize - ) -> Model: - """Retrieves a trained model from the appropriate location and stores at the given path.""" - pass diff --git a/model/utils.py b/model/utils.py deleted file mode 100644 index 2085f78..0000000 --- a/model/utils.py +++ /dev/null @@ -1,22 +0,0 @@ -import constants -from model.data import ModelCriteria -import base64 -import hashlib - - -def get_model_criteria(block: int) -> ModelCriteria: - """Returns the model criteria at block.""" - criteria = None - for b, crit in constants.MODEL_CRITERIA_BY_BLOCK: - if block >= b: - criteria = crit - assert criteria is not None, f"No model criteria found for block {block}" - return criteria - - -def get_hash_of_two_strings(string1: str, string2: str) -> str: - """Hashes two strings together and returns the result.""" - - string_hash = hashlib.sha256((string1 + string2).encode()) - - return base64.b64encode(string_hash.digest()).decode("utf-8") diff --git a/neurons/validator.py b/neurons/validator.py index 615eb9b..1d6fe36 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -16,56 +16,50 @@ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -from collections import defaultdict +import asyncio import copy import datetime as dt import functools -import os import json import math +import multiprocessing +import os import pickle +import threading import time -import torch -import random -import asyncio +import traceback import typing +from collections import defaultdict + +import bittensor as bt +import torch import wandb -import constants +from huggingface_hub.utils import RepositoryNotFoundError +from rich.console import Console +from rich.table import Table from taoverse.metagraph import utils as metagraph_utils from taoverse.metagraph.metagraph_syncer import MetagraphSyncer +from taoverse.metagraph.miner_iterator import MinerIterator from taoverse.model import utils as model_utils from taoverse.model.competition import utils as competition_utils from taoverse.model.competition.competition_tracker import CompetitionTracker from taoverse.model.competition.data import Competition from taoverse.model.model_tracker import ModelTracker from taoverse.model.model_updater import MinerMisconfiguredError, ModelUpdater +from taoverse.model.storage.chain.chain_model_metadata_store import ( + ChainModelMetadataStore, +) from taoverse.model.storage.disk.disk_model_store import DiskModelStore from taoverse.model.storage.hugging_face.hugging_face_model_store import ( HuggingFaceModelStore, ) -from taoverse.model.storage.chain.chain_model_metadata_store import ( - ChainModelMetadataStore, -) -from taoverse.utilities.perf_monitor import PerfMonitor from taoverse.utilities import utils +from taoverse.utilities.perf_monitor import PerfMonitor -from model.data import TokenizerIdentifier - -from huggingface_hub.utils import RepositoryNotFoundError -from neurons import config -import traceback -import threading -import multiprocessing -from rich.table import Table -from rich.console import Console - -import bittensor as bt +import constants import pretrain as pt -from torch.utils.data import IterableDataset -from utilities.miner_iterator import MinerIterator - - from competitions.data import CompetitionId +from neurons import config os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/model/storage/disk/__init__.py b/tests/model/storage/disk/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/model/storage/disk/test_disk_model_store.py b/tests/model/storage/disk/test_disk_model_store.py deleted file mode 100644 index 69ab3a2..0000000 --- a/tests/model/storage/disk/test_disk_model_store.py +++ /dev/null @@ -1,293 +0,0 @@ -import os -import unittest - -from model.data import ModelId, Model -from model.storage.disk.disk_model_store import DiskModelStore -from pretrain.model import get_model - -import model.storage.disk.utils as utils - - -class TestDiskModelStore(unittest.TestCase): - def setUp(self): - self.base_dir = "test-models" - self.disk_store = DiskModelStore(self.base_dir) - - def tearDown(self): - self.disk_store.delete_unreferenced_models(dict(), 0) - - def test_get_path(self): - hotkey = "hotkey0" - - expected_path = utils.get_local_miner_dir(self.base_dir, hotkey) - actual_path = self.disk_store.get_path(hotkey) - - self.assertEqual(expected_path, actual_path) - - def test_store_and_retrieve_model(self): - hotkey = "hotkey0" - model_id = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash1", - commit="TestCommit", - ) - - pt_model = get_model() - - model = Model(id=model_id, pt_model=pt_model) - - # Store the model locally. - self.disk_store.store_model(hotkey, model) - - # Retrieve the model locally. - retrieved_model = self.disk_store.retrieve_model(hotkey, model_id) - - # Check that they match. - self.assertEqual(str(model), str(retrieved_model)) - - @unittest.skip( - "Skip this test by default as it requires flash-attn which requires a compatible gpu." - ) - def test_store_and_retrieve_optimized_model(self): - hotkey = "hotkey0" - model_id = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash1", - commit="TestCommit", - ) - - pt_model = get_model() - - model = Model(id=model_id, pt_model=pt_model) - - # Store the model locally. - self.disk_store.store_model(hotkey, model) - - # Retrieve the model locally. - retrieved_model = self.disk_store.retrieve_model( - hotkey, model_id, optimized=True - ) - - # Check that they match. - self.assertEqual(str(model), str(retrieved_model)) - - def test_delete_unreferenced_models(self): - hotkey = "hotkey0" - - # Make 2 model ids with different hashes / commits. - model_id_1 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash1", - commit="TestCommit1", - ) - model_id_2 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash2", - commit="TestCommit2", - ) - - pt_model = get_model() - - model_1 = Model(id=model_id_1, pt_model=pt_model) - model_2 = Model(id=model_id_2, pt_model=pt_model) - - # Store both models locally. - self.disk_store.store_model(hotkey, model_1) - self.disk_store.store_model(hotkey, model_2) - - # Create the mapping of hotkey to model_id with only the 2nd model. - valid_models_by_hotkey = dict() - valid_models_by_hotkey[hotkey] = model_id_2 - - # Clear the unreferenced models - self.disk_store.delete_unreferenced_models(valid_models_by_hotkey, 0) - - # Confirm that model 1 is deleted - with self.assertRaises(Exception): - self.disk_store.retrieve_model(hotkey, model_id_1) - - # Confirm that model 2 is still there - model_2_retrieved = self.disk_store.retrieve_model(hotkey, model_id_2) - self.assertEqual(str(model_2), str(model_2_retrieved)) - - def test_delete_unreferenced_models_removed_hotkey(self): - hotkey_1 = "hotkey1" - hotkey_2 = "hotkey2" - - # Make 2 model ids with different hashes / commits. - model_id_1 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash1", - commit="TestCommit1", - ) - model_id_2 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash2", - commit="TestCommit2", - ) - - pt_model = get_model() - - model_1 = Model(id=model_id_1, pt_model=pt_model) - model_2 = Model(id=model_id_2, pt_model=pt_model) - - # Store both models locally. - self.disk_store.store_model(hotkey_1, model_1) - self.disk_store.store_model(hotkey_2, model_2) - - # Create the mapping of hotkey to model_id with only the 2nd model. - valid_models_by_hotkey = dict() - valid_models_by_hotkey[hotkey_2] = model_id_2 - - # Clear the unreferenced models - self.disk_store.delete_unreferenced_models(valid_models_by_hotkey, 0) - - # Confirm that model 1 is deleted - with self.assertRaises(Exception): - self.disk_store.retrieve_model(hotkey_1, model_id_1) - - # Confirm that model 2 is still there - model_2_retrieved = self.disk_store.retrieve_model(hotkey_2, model_id_2) - self.assertEqual(str(model_2), str(model_2_retrieved)) - - def test_delete_unreferenced_models_in_grace(self): - hotkey = "hotkey0" - - # Make 2 model ids with different hashes / commits. - model_id_1 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash1", - commit="TestCommit1", - ) - model_id_2 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash2", - commit="TestCommit2", - ) - - pt_model = get_model() - - model_1 = Model(id=model_id_1, pt_model=pt_model) - model_2 = Model(id=model_id_2, pt_model=pt_model) - - # Store both models locally. - self.disk_store.store_model(hotkey, model_1) - self.disk_store.store_model(hotkey, model_2) - - # Create the mapping of hotkey to model_id with only the 2nd model. - valid_models_by_hotkey = dict() - valid_models_by_hotkey[hotkey] = model_id_2 - - # Clear the unreferenced models - self.disk_store.delete_unreferenced_models(valid_models_by_hotkey, 60) - - # Confirm that model 1 is still there. - model_1_retrieved = self.disk_store.retrieve_model(hotkey, model_id_1) - self.assertEqual(str(model_1), str(model_1_retrieved)) - - # Confirm that model 2 is still there - model_2_retrieved = self.disk_store.retrieve_model(hotkey, model_id_2) - self.assertEqual(str(model_2), str(model_2_retrieved)) - - def test_delete_unreferenced_models_removed_hotkey_in_grace(self): - hotkey_1 = "hotkey1" - hotkey_2 = "hotkey2" - - # Make 2 model ids with different hashes / commits. - model_id_1 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash1", - commit="TestCommit1", - ) - model_id_2 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash2", - commit="TestCommit2", - ) - - pt_model = get_model() - - model_1 = Model(id=model_id_1, pt_model=pt_model) - model_2 = Model(id=model_id_2, pt_model=pt_model) - - # Store both models locally. - self.disk_store.store_model(hotkey_1, model_1) - self.disk_store.store_model(hotkey_2, model_2) - - # Create the mapping of hotkey to model_id with only the 2nd model. - valid_models_by_hotkey = dict() - valid_models_by_hotkey[hotkey_2] = model_id_2 - - # Clear the unreferenced models - self.disk_store.delete_unreferenced_models(valid_models_by_hotkey, 60) - - # Confirm that model 1 is still there - model_1_retrieved = self.disk_store.retrieve_model(hotkey_1, model_id_1) - self.assertEqual(str(model_1), str(model_1_retrieved)) - - # Confirm that model 2 is still there - model_2_retrieved = self.disk_store.retrieve_model(hotkey_2, model_id_2) - self.assertEqual(str(model_2), str(model_2_retrieved)) - - def test_delete_unreferenced_models_and_unexpected_file(self): - hotkey = "hotkey0" - - # Make 2 model ids with different hashes / commits. - model_id_1 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash1", - commit="TestCommit1", - ) - model_id_2 = ModelId( - namespace="TestPath", - name="TestModel", - hash="TestHash2", - commit="TestCommit2", - ) - - pt_model = get_model() - - model_1 = Model(id=model_id_1, pt_model=pt_model) - model_2 = Model(id=model_id_2, pt_model=pt_model) - - # Store both models locally. - self.disk_store.store_model(hotkey, model_1) - self.disk_store.store_model(hotkey, model_2) - - # Also store a random file to the hotkey dir. - # If the is_dir() check is not correct then we will fail to rmtree this file with '[Errno 20] Not a directory.' - miners_dir = utils.get_local_miner_dir(self.base_dir, hotkey) - file_name = miners_dir + os.path.sep + "random.txt" - file = open(file_name, "w") - file.write("unexpected file.") - file.close() - - # Create the mapping of hotkey to model_id with only the 2nd model. - valid_models_by_hotkey = dict() - valid_models_by_hotkey[hotkey] = model_id_2 - - # Clear the unreferenced models - self.disk_store.delete_unreferenced_models(valid_models_by_hotkey, 0) - - # Confirm that model 1 is deleted - with self.assertRaises(Exception): - self.disk_store.retrieve_model(hotkey, model_id_1) - - # Confirm that model 2 is still there - model_2_retrieved = self.disk_store.retrieve_model(hotkey, model_id_2) - self.assertEqual(str(model_2), str(model_2_retrieved)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/model/storage/disk/test_utils.py b/tests/model/storage/disk/test_utils.py deleted file mode 100644 index 87903f8..0000000 --- a/tests/model/storage/disk/test_utils.py +++ /dev/null @@ -1,272 +0,0 @@ -import base64 -import datetime -import shutil -import time -import unittest -from model.data import ModelId -import model.storage.disk.utils as utils -import os - - -class TestUtils(unittest.TestCase): - def setUp(self): - self.base_dir = "test-models" - self.sep = os.path.sep - - def tearDown(self): - shutil.rmtree(path=self.base_dir, ignore_errors=True) - - def test_get_local_miners_dir(self): - miners_dir = utils.get_local_miners_dir(self.base_dir) - - expected_path = self.base_dir + self.sep + "models" - self.assertEqual(miners_dir, expected_path) - - def test_get_local_miner_dir(self): - hotkey = "test-hotkey" - - miner_dir = utils.get_local_miner_dir(self.base_dir, hotkey) - - expected_path = self.base_dir + self.sep + "models" + self.sep + hotkey - self.assertEqual(miner_dir, expected_path) - - def test_get_local_model_dir(self): - hotkey = "test-hotkey" - namespace = "test-namespace" - name = "test-name" - commit = "test-commit" - model_id = ModelId( - namespace=namespace, name=name, hash="test-hash", commit=commit - ) - - model_dir = utils.get_local_model_dir(self.base_dir, hotkey, model_id) - - expected_path = ( - self.base_dir - + self.sep - + "models" - + self.sep - + hotkey - + self.sep - + "models--" - + namespace - + "--" - + name - ) - self.assertEqual(model_dir, expected_path) - - def test_get_local_model_snapshot_dir(self): - hotkey = "test-hotkey" - namespace = "test-namespace" - name = "test-name" - commit = "test-commit" - model_id = ModelId( - namespace=namespace, name=name, hash="test-hash", commit=commit - ) - - model_dir = utils.get_local_model_snapshot_dir(self.base_dir, hotkey, model_id) - - expected_path = ( - self.base_dir - + self.sep - + "models" - + self.sep - + hotkey - + self.sep - + "models--" - + namespace - + "--" - + name - + self.sep - + "snapshots" - + self.sep - + commit - ) - self.assertEqual(model_dir, expected_path) - - def test_get_hf_download_path_dir(self): - hotkey = "test-hotkey" - namespace = "test-namespace" - name = "test-name" - commit = "test-commit" - model_id = ModelId( - namespace=namespace, name=name, hash="test-hash", commit=commit - ) - - hf_download_path_dir = utils.get_hf_download_path( - utils.get_local_miner_dir(self.base_dir, hotkey), model_id - ) - - expected_path = ( - self.base_dir - + self.sep - + "models" - + self.sep - + hotkey - + self.sep - + "models--" - + namespace - + "--" - + name - + self.sep - + "snapshots" - + self.sep - + commit - ) - self.assertEqual(hf_download_path_dir, expected_path) - - def test_get_newest_datetime_under_path(self): - file_name_1 = "test1.txt" - file_name_2 = "test2.txt" - path_1 = self.base_dir + os.path.sep + file_name_1 - path_2 = self.base_dir + os.path.sep + file_name_2 - - os.mkdir(self.base_dir) - file_1 = open(path_1, "w") - file_1.write("test text.") - file_1.close() - - time.sleep(1) - - file_2 = open(path_2, "w") - file_2.write("test text 2.") - file_2.close() - - # File 2 was written more recently. - last_modified_expected = datetime.datetime.fromtimestamp( - os.path.getmtime(path_2) - ) - - last_modified_actual = utils.get_newest_datetime_under_path(self.base_dir) - - self.assertEqual(last_modified_actual, last_modified_expected) - - def test_get_newest_datetime_under_path_empty(self): - last_modified_expected = datetime.datetime.min - - last_modified_actual = utils.get_newest_datetime_under_path(self.base_dir) - - self.assertEqual(last_modified_actual, last_modified_expected) - - def test_remove_dir_out_of_grace(self): - file_name = "test.txt" - path = self.base_dir + self.sep + file_name - - os.mkdir(self.base_dir) - file = open(path, "w") - file.write("test text.") - file.close() - - # Sleep to ensure we are out of grace. - time.sleep(1) - - self.assertTrue(os.path.exists(self.base_dir)) - deleted = utils.remove_dir_out_of_grace(self.base_dir, 0) - self.assertTrue(deleted) - self.assertFalse(os.path.exists(self.base_dir)) - - def test_remove_dir_out_of_grace_in_grace(self): - file_name = "test.txt" - path = self.base_dir + self.sep + file_name - - os.mkdir(self.base_dir) - file = open(path, "w") - file.write("test text.") - file.close() - - self.assertTrue(os.path.exists(self.base_dir)) - deleted = utils.remove_dir_out_of_grace(self.base_dir, 60) - self.assertFalse(deleted) - self.assertTrue(os.path.exists(self.base_dir)) - - def test_get_hash_of_file(self): - file_name = "test.txt" - path = self.base_dir + self.sep + file_name - - os.mkdir(self.base_dir) - file = open(path, "w") - file.write("test text.") - file.close() - - # Obtained by running openssl dgst -sha256 -binary test.txt | base64 - expected_file_hash = "tXNDvHVzqYIRiUx0rvK+M5+Lu4OLzhfPJH+gf7HvCeA=" - actual_file_hash = utils.get_hash_of_file(path) - - self.assertEqual(actual_file_hash, expected_file_hash) - - def test_get_hash_of_directory(self): - # Make two sub directories. - dir_1 = self.base_dir + self.sep + "dir1" - dir_2 = self.base_dir + self.sep + "dir2" - - # Write the same two files to both sub directories. - file_name_1 = "test1.txt" - file_name_2 = "test2.txt" - path_1_file_1 = dir_1 + os.path.sep + file_name_1 - path_1_file_2 = dir_1 + os.path.sep + file_name_2 - path_2_file_1 = dir_2 + os.path.sep + file_name_1 - path_2_file_2 = dir_2 + os.path.sep + file_name_2 - - path_2_file_2 = dir_2 + os.path.sep + file_name_2 - file_paths = [path_1_file_1, path_1_file_2, path_2_file_1, path_2_file_2] - - os.mkdir(self.base_dir) - os.mkdir(dir_1) - os.mkdir(dir_2) - - for file_path in file_paths: - file = open(file_path, "w") - file.write("test text.") - file.close() - - # Test that both sub directories have an equal hash. - dir_1_hash = utils.get_hash_of_directory(dir_1) - dir_2_hash = utils.get_hash_of_directory(dir_2) - self.assertEqual(dir_1_hash, dir_2_hash) - - # Test that the hash for the overall directory does not equal the sub directory. - base_dir_hash = utils.get_hash_of_directory(self.base_dir) - self.assertNotEqual(base_dir_hash, dir_1_hash) - - def test_realize_symlinks_in_directory(self): - end_file_dir = self.base_dir + self.sep + "end_files" - symlink_source_dir = self.base_dir + self.sep + "symlink" - - regular_file = end_file_dir + self.sep + "test_file.txt" - symlink_source = symlink_source_dir + self.sep + "symlink_source.txt" - symlink_dest = end_file_dir + self.sep + "symlink_end.txt" - - # Make a regular file - os.mkdir(self.base_dir) - os.mkdir(end_file_dir) - file = open(regular_file, "w") - file.write("test text.") - file.close() - - # Make a symlinked file - os.mkdir(symlink_source_dir) - file = open(symlink_source, "w") - file.write("symlink source test text.") - file.close() - - os.symlink(os.path.abspath(symlink_source), os.path.abspath(symlink_dest)) - - # Confirm we see 3 files - pre_file_count = 0 - for _, _, files in os.walk(self.base_dir): - pre_file_count += len(files) - self.assertEqual(pre_file_count, 3) - - realized_files = utils.realize_symlinks_in_directory(end_file_dir) - - # Confirm 1 file got realized and there are two total now. - self.assertEqual(realized_files, 1) - - post_file_count = 0 - for _, _, files in os.walk(self.base_dir): - post_file_count += len(files) - self.assertEqual(post_file_count, 2) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/model/storage/fake_model_metadata_store.py b/tests/model/storage/fake_model_metadata_store.py index 88c57d9..912fa55 100644 --- a/tests/model/storage/fake_model_metadata_store.py +++ b/tests/model/storage/fake_model_metadata_store.py @@ -1,7 +1,8 @@ from collections import defaultdict, deque from typing import List, Optional -from model.data import ModelId, ModelMetadata -from model.storage.model_metadata_store import ModelMetadataStore + +from taoverse.model.data import ModelId, ModelMetadata +from taoverse.model.storage.model_metadata_store import ModelMetadataStore class FakeModelMetadataStore(ModelMetadataStore): diff --git a/tests/model/storage/fake_remote_model_store.py b/tests/model/storage/fake_remote_model_store.py index 79207fb..566af6d 100644 --- a/tests/model/storage/fake_remote_model_store.py +++ b/tests/model/storage/fake_remote_model_store.py @@ -1,7 +1,10 @@ -import sys -from model.data import Model, ModelId -from model.storage.disk import utils -from model.storage.remote_model_store import RemoteModelStore +import uuid +from dataclasses import replace + +from taoverse.model.competition.data import Competition +from taoverse.model.data import Model, ModelId +from taoverse.model.storage.disk import utils +from taoverse.model.storage.remote_model_store import RemoteModelStore class FakeRemoteModelStore(RemoteModelStore): @@ -10,16 +13,27 @@ class FakeRemoteModelStore(RemoteModelStore): def __init__(self): self.remote_models = dict() - async def upload_model(self, model: Model) -> ModelId: + async def upload_model(self, model: Model, competition: Competition) -> ModelId: """Fake uploads a model.""" - # Use provided commit + hash rather than generating a new one. - self.remote_models[model.id] = model + model_id = model.id + # Generate a commit and hash, if one doesn't yet exist. + if model_id.commit is None: + commit = str(uuid.uuid4()) + model_id = replace(model_id, commit=commit) + + if model_id.hash is None: + hash = str(uuid.uuid4()) + model_id = replace(model_id, hash=hash) + + model = replace(model, id=model_id) - return model.id + self.remote_models[model_id] = model + + return model_id async def download_model( - self, model_id: ModelId, local_path: str, model_size_limit: int = sys.maxsize + self, model_id: ModelId, local_path: str, competition: Competition ) -> Model: """Retrieves a trained model from memory.""" @@ -29,10 +43,11 @@ async def download_model( split_string = local_path.split("/") # Store it at the local_path + dir = utils.get_local_model_snapshot_dir( + split_string[0], split_string[2], model_id + ) model.pt_model.save_pretrained( - save_directory=utils.get_local_model_snapshot_dir( - split_string[0], split_string[2], model_id - ), + save_directory=dir, safe_serialization=True, ) diff --git a/tests/model/test_data.py b/tests/model/test_data.py deleted file mode 100644 index 1945020..0000000 --- a/tests/model/test_data.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest -from model.data import ModelId -import model.storage.disk.utils as utils -import os - - -class TestData(unittest.TestCase): - def test_model_id_compressed_string(self): - model_id = ModelId( - namespace="test_model", - name="test_name", - commit="test_commit", - hash="test_hash", - ) - - roundtrip_model_id = ModelId.from_compressed_str(model_id.to_compressed_str()) - - self.assertEqual(model_id, roundtrip_model_id) - - def test_model_id_compressed_string_no_commit(self): - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - ) - - roundtrip_model_id = ModelId.from_compressed_str(model_id.to_compressed_str()) - - self.assertEqual(model_id, roundtrip_model_id) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/model/test_model_tracker.py b/tests/model/test_model_tracker.py deleted file mode 100644 index 38a1563..0000000 --- a/tests/model/test_model_tracker.py +++ /dev/null @@ -1,173 +0,0 @@ -import os -import unittest -from model.data import ModelId, ModelMetadata - -from model.model_tracker import ModelTracker - - -class TestModelTracker(unittest.TestCase): - def setUp(self): - self.model_tracker = ModelTracker() - - def test_roundtrip_state(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - commit="test_commit", - hash="test_hash", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - state_path = ".test_tracker_state.pickle" - self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - self.model_tracker.save_state(state_path) - - new_tracker = ModelTracker() - new_tracker.load_state(state_path) - - os.remove(state_path) - - self.assertEqual( - self.model_tracker.miner_hotkey_to_model_metadata_dict, - new_tracker.miner_hotkey_to_model_metadata_dict, - ) - - def test_on_miner_model_updated_add(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - commit="test_commit", - hash="test_hash", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - - self.assertTrue( - hotkey in self.model_tracker.miner_hotkey_to_model_metadata_dict - ) - self.assertEqual( - model_metadata, - self.model_tracker.miner_hotkey_to_model_metadata_dict[hotkey], - ) - - def test_on_miner_model_updated_update(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - commit="test_commit", - hash="test_hash", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - new_model_id = ModelId( - namespace="test_model2", - name="test_name2", - commit="test_commit2", - hash="test_hash2", - ) - new_model_metadata = ModelMetadata(id=new_model_id, block=2) - - self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - self.model_tracker.on_miner_model_updated(hotkey, new_model_metadata) - - self.assertTrue( - hotkey in self.model_tracker.miner_hotkey_to_model_metadata_dict - ) - self.assertEqual( - new_model_metadata, - self.model_tracker.miner_hotkey_to_model_metadata_dict[hotkey], - ) - - def test_get_model_metadata_for_miner_hotkey(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - commit="test_commit", - hash="test_hash", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - returned_model_metadata = ( - self.model_tracker.get_model_metadata_for_miner_hotkey(hotkey) - ) - - self.assertEqual(model_metadata, returned_model_metadata) - - def test_get_model_metadata_for_miner_hotkey_optional(self): - hotkey = "test_hotkey" - - returned_model_id = self.model_tracker.get_model_metadata_for_miner_hotkey( - hotkey - ) - - self.assertIsNone(returned_model_id) - - def test_get_miner_hotkey_to_model_metadata_dict(self): - hotkey_1 = "test_hotkey" - model_id_1 = ModelId( - namespace="test_model", - name="test_name", - commit="test_commit", - hash="test_hash", - ) - model_metadata_1 = ModelMetadata(id=model_id_1, block=1) - - hotkey_2 = "test_hotkey2" - model_id_2 = ModelId( - namespace="test_model2", - name="test_name2", - commit="test_commit2", - hash="test_hash2", - ) - model_metadata_2 = ModelMetadata(id=model_id_2, block=2) - - self.model_tracker.on_miner_model_updated(hotkey_1, model_metadata_1) - self.model_tracker.on_miner_model_updated(hotkey_2, model_metadata_2) - - hotkey_to_model_metadata = ( - self.model_tracker.get_miner_hotkey_to_model_metadata_dict() - ) - - self.assertEqual(len(hotkey_to_model_metadata), 2) - self.assertEqual(hotkey_to_model_metadata[hotkey_1], model_metadata_1) - self.assertEqual(hotkey_to_model_metadata[hotkey_2], model_metadata_2) - - def test_on_hotkeys_updated_extra_ignored(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - commit="test_commit", - hash="test_hash", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - self.model_tracker.on_hotkeys_updated(set([hotkey, "extra_hotkey"])) - - self.assertEqual(len(self.model_tracker.miner_hotkey_to_model_metadata_dict), 1) - - def test_on_hotkeys_updated_missing_removed(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - commit="test_commit", - hash="test_hash", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - self.model_tracker.on_hotkeys_updated(set(["extra_hotkey"])) - - self.assertEqual(len(self.model_tracker.miner_hotkey_to_model_metadata_dict), 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/model/test_model_updater.py b/tests/model/test_model_updater.py deleted file mode 100644 index 2b82e73..0000000 --- a/tests/model/test_model_updater.py +++ /dev/null @@ -1,314 +0,0 @@ -import asyncio -import unittest -from model import utils -from model.data import Model, ModelId, ModelMetadata -from model.model_tracker import ModelTracker - -from model.model_updater import ModelUpdater -from model.storage.disk.disk_model_store import DiskModelStore -from tests.model.storage.fake_model_metadata_store import FakeModelMetadataStore -from tests.model.storage.fake_remote_model_store import FakeRemoteModelStore -from transformers import GPT2Config, GPT2LMHeadModel, PreTrainedModel - - -class TestModelUpdater(unittest.TestCase): - def setUp(self): - self.model_tracker = ModelTracker() - self.local_store = DiskModelStore("test-models") - self.remote_store = FakeRemoteModelStore() - self.metadata_store = FakeModelMetadataStore() - self.model_updater = ModelUpdater( - metadata_store=self.metadata_store, - remote_store=self.remote_store, - local_store=self.local_store, - model_tracker=self.model_tracker, - ) - - def tearDown(self): - self.local_store.delete_unreferenced_models(dict(), 0) - - def _get_small_model(self) -> PreTrainedModel: - """Gets a small model that works with even the earliest block.""" - config = GPT2Config( - n_head=10, - n_layer=12, - n_embd=760, - ) - return GPT2LMHeadModel(config) - - def test_get_metadata(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - commit="test_commit", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - - metadata = asyncio.run(self.model_updater._get_metadata(hotkey)) - - self.assertEqual(metadata.id, model_id) - self.assertIsNotNone(metadata.block) - - def test_sync_model_bad_metadata(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - commit="bad_commit", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - # Setup the metadata with a commit that doesn't exist in the remote store. - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - - # Check the model fails to sync but that it doesn't throw an exception. - self.assertFalse(asyncio.run(self.model_updater.sync_model(hotkey))) - - def test_sync_model_same_metadata(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - commit="test_commit", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - pt_model = self._get_small_model() - - model = Model(id=model_id, pt_model=pt_model) - - # Setup the metadata, local, and model_tracker to match. - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - self.local_store.store_model(hotkey, model) - - self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - - asyncio.run(self.model_updater.sync_model(hotkey)) - - # Tracker information did not change. - self.assertEqual( - self.model_tracker.get_model_metadata_for_miner_hotkey(hotkey), - model_metadata, - ) - - def test_sync_model_same_metadata_force(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - commit="test_commit", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - pt_model = self._get_small_model() - - model = Model(id=model_id, pt_model=pt_model) - - # Setup the metadata, local, and model_tracker to match. - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - self.local_store.store_model(hotkey, model) - # Also setup remote store for redownload. - asyncio.run(self.remote_store.upload_model(model)) - - self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - - updated = asyncio.run(self.model_updater.sync_model(hotkey, force=True)) - - # Tracker information did not change. - self.assertEqual( - self.model_tracker.get_model_metadata_for_miner_hotkey(hotkey), - model_metadata, - ) - - # We did return updated from the sync_model. - self.assertTrue(updated) - - def test_sync_model_new_metadata(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - commit="test_commit", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - pt_model = self._get_small_model() - - model = Model(id=model_id, pt_model=pt_model) - - # Setup the metadata and remote store but not local or the model_tracker. - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - asyncio.run(self.remote_store.upload_model(model)) - - self.assertIsNone( - self.model_tracker.get_model_metadata_for_miner_hotkey(hotkey) - ) - - # Our local store raises an exception from the Transformers.from_pretrained method if not found. - with self.assertRaises(Exception): - self.local_store.retrieve_model(hotkey, model_id) - - asyncio.run(self.model_updater.sync_model(hotkey)) - - self.assertEqual( - self.model_tracker.get_model_metadata_for_miner_hotkey(hotkey), - model_metadata, - ) - self.assertEqual( - str(self.local_store.retrieve_model(hotkey, model_id)), str(model) - ) - - def test_sync_model_hotkey_hash(self): - hotkey = "test_hotkey" - model_hash = "test_hash" - model_id_chain = ModelId( - namespace="test_model", - name="test_name", - hash=utils.get_hash_of_two_strings(model_hash, hotkey), - commit="test_commit", - ) - model_metadata = ModelMetadata(id=model_id_chain, block=1) - - model_id = ModelId( - namespace="test_model", - name="test_name", - hash=model_hash, - commit="test_commit", - ) - - pt_model = self._get_small_model() - - model = Model(id=model_id, pt_model=pt_model) - - # Setup the metadata and remote store and but not local or the model tracker. - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - self.remote_store.inject_mismatched_model(model_id_chain, model) - - # Assert that we do update since the model_updater retries with the hotkey hash as well. - updated = asyncio.run(self.model_updater.sync_model(hotkey)) - self.assertTrue(updated) - - def test_sync_model_bad_hash(self): - hotkey = "test_hotkey" - model_id_chain = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - commit="test_commit", - ) - model_metadata = ModelMetadata(id=model_id_chain, block=1) - - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="bad_hash", - commit="test_commit", - ) - - pt_model = self._get_small_model() - - model = Model(id=model_id, pt_model=pt_model) - - # Setup the metadata and remote store and but not local or the model tracker. - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - self.remote_store.inject_mismatched_model(model_id_chain, model) - - # Assert we do not update due to the hash mismatch between the model in remote store and the metadata on chain. - updated = asyncio.run(self.model_updater.sync_model(hotkey)) - self.assertFalse(updated) - - def test_sync_model_over_max_parameters(self): - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - commit="test_commit", - ) - model_metadata = ModelMetadata(id=model_id, block=1) - - config = GPT2Config( - n_head=10, - n_layer=25, # Increase layer by enough to go over max parameter size. - n_embd=760, - ) - pt_model = GPT2LMHeadModel(config) - - model = Model(id=model_id, pt_model=pt_model) - - # Setup the metadata and remote store but not local or the model_tracker. - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - asyncio.run(self.remote_store.upload_model(model)) - - # Assert we do not update due to exceeding the maximum allowed parameter size. - updated = asyncio.run(self.model_updater.sync_model(hotkey)) - self.assertFalse(updated) - - def test_sync_model_uses_next_model_limit(self): - - # Create a model larger than the limit prior to block 2_405_920. - pt_model = GPT2LMHeadModel( - GPT2Config( - n_head=50, - n_layer=25, - n_embd=750, - ) - ) - - hotkey = "test_hotkey" - model_id = ModelId( - namespace="test_model", - name="test_name", - hash="test_hash", - commit="test_commit", - ) - - # Upload the large model before the block that uses the new limit. - model_metadata = ModelMetadata(id=model_id, block=2_405_919) - - model = Model(id=model_id, pt_model=pt_model) - - # Upload the model metadata and model. - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - asyncio.run(self.remote_store.upload_model(model)) - - # Assert we do not update due to exceeding the maximum allowed parameter size. - self.assertFalse(asyncio.run(self.model_updater.sync_model(hotkey))) - - # Upload the model again, this time after the block that allows this size of model. - model_metadata = ModelMetadata(id=model_id, block=2_405_920) - asyncio.run( - self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) - ) - - self.assertTrue(asyncio.run(self.model_updater.sync_model(hotkey))) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/model/test_model_utils.py b/tests/model/test_model_utils.py deleted file mode 100644 index 1daa672..0000000 --- a/tests/model/test_model_utils.py +++ /dev/null @@ -1,58 +0,0 @@ -import unittest -from model.utils import get_model_criteria, get_hash_of_two_strings -from constants import BLOCK_7B, ALLOWED_MODEL_TYPES_1, ALLOWED_MODEL_TYPES_2 -from model.data import ModelCriteria, TokenizerIdentifier - - -class TestModelUtils(unittest.TestCase): - MODEL_CRITERIA_186M = ModelCriteria( - sequence_length=1024, - optimized=False, - max_model_bytes=5 * 1024 * 1024 * 1024, - max_model_parameters=186_000_000, - allowed_model_types=ALLOWED_MODEL_TYPES_1, - tokenizer_identifier=TokenizerIdentifier.DISTILGPT_2, - ) - MODEL_CRITERIA_772M = ModelCriteria( - sequence_length=1024, - optimized=False, - max_model_bytes=5 * 1024 * 1024 * 1024, - max_model_parameters=772_000_000, - allowed_model_types=ALLOWED_MODEL_TYPES_1, - tokenizer_identifier=TokenizerIdentifier.DISTILGPT_2, - ) - MODEL_CRITERIA_7B = ModelCriteria( - sequence_length=4096, - optimized=True, - max_model_bytes=15 * 1024 * 1024 * 1024, - max_model_parameters=6_900_000_000, - allowed_model_types=ALLOWED_MODEL_TYPES_2, - tokenizer_identifier=TokenizerIdentifier.GPT_4_TIKTOKEN, - ) - - model_criteria_cases = [ - (0, MODEL_CRITERIA_186M), - (2_405_919, MODEL_CRITERIA_186M), - (2_405_920, MODEL_CRITERIA_772M), - (2_605_920, MODEL_CRITERIA_772M), - (BLOCK_7B - 1, MODEL_CRITERIA_772M), - (BLOCK_7B, MODEL_CRITERIA_7B), - (BLOCK_7B + 1, MODEL_CRITERIA_7B), - ] - - def test_get_model_criteria(self): - for block, expected_criteria in self.model_criteria_cases: - with self.subTest(block=block, expected_criteria=expected_criteria): - assert get_model_criteria(block) == expected_criteria - - def test_get_hash_of_two_strings(self): - string1 = "hello" - string2 = "world" - - result = get_hash_of_two_strings(string1, string2) - - self.assertEqual(result, "k2oYXKqiZrucvpgengXLeM1zKwsygOuURBK7b4+PB68=") - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/pretrain/test_dataset.py b/tests/pretrain/test_dataset.py index a49250b..0013239 100644 --- a/tests/pretrain/test_dataset.py +++ b/tests/pretrain/test_dataset.py @@ -2,41 +2,42 @@ import numpy as np +from competitions.data import CompetitionId +from constants import MODEL_CONSTRAINTS_BY_COMPETITION_ID import pretrain as pt from neurons import config # Get the config config = config.validator_config() + def test_FineWeb_loader_page_copy(): """ - Test that pages can be correctly copied from one FineWeb dataloader + Test that pages can be correctly copied from one FineWeb dataloader to another """ # Some test params NUM_PAGES = 20 - + # Load a tokenizer - tokenizer = pt.model.get_tokenizer(cache_dir=config.model_dir) + tokenizer = pt.model.load_tokenizer( + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL], + cache_dir=config.model_dir, + ) # First dataloader dataloader_1 = pt.dataset.SubsetFineWebEdu2Loader( - batch_size=4, - sequence_length=4092, - num_pages=NUM_PAGES, - tokenizer=tokenizer) + batch_size=4, sequence_length=4092, num_pages=NUM_PAGES, tokenizer=tokenizer + ) # Assert that the number of pages loaded successfully are the one required assert len(dataloader_1.pages) == NUM_PAGES - # Now create a second loader without automatic page loading dataloader_2 = pt.dataset.SubsetFineWebEdu2Loader( - batch_size=4, - sequence_length=4092, - num_pages=None, - tokenizer=tokenizer) - + batch_size=4, sequence_length=4092, num_pages=None, tokenizer=tokenizer + ) + # Copy pages from the first dataloader dataloader_2.fetch_data_for_pages(pages=dataloader_1.pages) diff --git a/tests/pretrain/test_mining.py b/tests/pretrain/test_mining.py index 25295d0..e2a8318 100644 --- a/tests/pretrain/test_mining.py +++ b/tests/pretrain/test_mining.py @@ -1,11 +1,13 @@ import asyncio import os import shutil +import unittest from unittest import mock + import bittensor as bt -import unittest +from taoverse.model.data import Model, ModelId -from model.data import Model, ModelId +from competitions.data import CompetitionId import pretrain as pt from pretrain.model import get_model from tests.model.storage.fake_model_metadata_store import FakeModelMetadataStore @@ -34,7 +36,7 @@ def test_model_to_disk_roundtrip(self): """Tests that saving a model to disk and loading it gets the same model.""" pt.mining.save(model=self.tiny_model, model_dir=self.model_dir) - model = pt.mining.load_local_model(model_dir=self.model_dir) + model = pt.mining.load_local_model(model_dir=self.model_dir, kwargs={}) assert_model_equality(self, self.tiny_model, model) @@ -43,6 +45,7 @@ def _test_push(self, min_expected_block: int = 1): pt.mining.push( model=self.tiny_model, wallet=self.wallet, + competition_id=CompetitionId.B7_MODEL, repo="namespace/name", retry_delay_secs=1, metadata_store=self.metadata_store, @@ -59,7 +62,12 @@ def _test_push(self, min_expected_block: int = 1): self.metadata_store.retrieve_model_metadata(self.wallet.hotkey.ss58_address) ) self.assertGreaterEqual(model_metadata.block, min_expected_block) - self.assertEqual(model_metadata.id, model.id) + + # Check certain properties of the model metadata. + self.assertEqual(model_metadata.id.commit, model.id.commit) + self.assertEqual(model_metadata.id.name, model.id.name) + self.assertEqual(model_metadata.id.namespace, model.id.namespace) + self.assertEqual(model_metadata.id.competition_id, model.id.competition_id) self.metadata_store.reset() self.remote_store.reset() @@ -106,7 +114,11 @@ async def test_get_repo(self): metagraph.hotkeys.return_value = [hotkey] model_id = ModelId( - namespace="namespace", name="name", hash="hash", commit="commit" + namespace="namespace", + name="name", + hash="hash", + commit="commit", + competition_id=0, ) self.metadata_store.store_model_metadata(hotkey, model_id) diff --git a/tests/utilities/__init__.py b/tests/utilities/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/utilities/test_miner_iterator.py b/tests/utilities/test_miner_iterator.py deleted file mode 100644 index 3bd50d5..0000000 --- a/tests/utilities/test_miner_iterator.py +++ /dev/null @@ -1,73 +0,0 @@ -from utilities.miner_iterator import MinerIterator -import unittest - - -class TestMinerIterator(unittest.TestCase): - def test_miner_uids_are_sorted(self): - """Creates a MinerIterator with unsorted miner UIDs and verifies that the miner UIDs are sorted.""" - uids = [2, 5, 1, 0] - iterator = MinerIterator(uids) - - # The iterator starts at a random position. Move it until we're pointing to 0. - while iterator.peek() != 0: - next(iterator) - - # Now verify the UIDs are iterated in sorted order. - iterated_uids = [next(iterator) for _ in range(len(uids))] - self.assertEqual(iterated_uids, sorted(uids)) - - def test_iterator_is_infinite(self): - """Creates a MinerIterator and verifies calling it more times than the number of miner UIDs cycles the UIDs.""" - uids = [3, 2, 1] - expected = [1, 2, 3] * 10 - iterator = MinerIterator(uids) - iterated_uids = [next(iterator) for _ in range(30)] - self.assertEqual(sorted(iterated_uids), sorted(expected)) - - def test_peek(self): - """Creates a MinerIterator and verifies that peek returns the next UID without advancing the iterator.""" - uids = [1, 2, 3] - iterator = MinerIterator(uids) - - peeked = iterator.peek() - self.assertEqual(peeked, iterator.peek()) - self.assertEqual(peeked, next(iterator)) - self.assertNotEqual(peeked, iterator.peek()) - - def test_set_miner_uids(self): - """Verifies the iterator position is maintained when the miner UIDs are updated.""" - initial_miner_uids = [1, 2, 3, 4, 5] - iterator = MinerIterator(initial_miner_uids) - - # Advance the iterator so it should now point to 3 - # The iterator starts at a random position. Advance it until it returns 2. - while next(iterator) != 2: - pass - - iterator.set_miner_uids([1, 4, 6]) - - # Verify the iterator picks up from the next UID greater than or equal to 3. - self.assertEqual(next(iterator), 4) - self.assertEqual(next(iterator), 6) - self.assertEqual(next(iterator), 1) - - def test_set_miner_uids_edge_case(self): - """Verifies the iterator position is reset when the miner UIDs are updated and the current position is no longer valid.""" - # Create a MinerIterator with initial miner UIDs - initial_miner_uids = [1, 2, 3, 4, 5] - iterator = MinerIterator(initial_miner_uids) - - # Advance the iterator so it should now point to 5 - while iterator.peek() != 5: - next(iterator) - - iterator.set_miner_uids([1, 2, 3, 4]) - - self.assertEqual(next(iterator), 1) - self.assertEqual(next(iterator), 2) - self.assertEqual(next(iterator), 3) - self.assertEqual(next(iterator), 4) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/utilities/test_perf_monitor.py b/tests/utilities/test_perf_monitor.py deleted file mode 100644 index fb81282..0000000 --- a/tests/utilities/test_perf_monitor.py +++ /dev/null @@ -1,44 +0,0 @@ -import time -import unittest - -from utilities.perf_monitor import PerfMonitor - - -class TestPerfMonitor(unittest.TestCase): - def test_format_duration(self): - """Performs basic validation of the _format_duration method.""" - - monitor = PerfMonitor("test") - - self.assertEqual(monitor._format_duration(1), "1.00 ns") - self.assertEqual(monitor._format_duration(2500), "2.50 μs") - self.assertEqual(monitor._format_duration(3000000), "3.00 ms") - self.assertEqual(monitor._format_duration(1230000000), "1.23 s") - self.assertEqual(monitor._format_duration(120000000000), "2.00 min") - - def test_perf_monitor(self): - """Performs basic validation of the PerfTracker and its output_str.""" - - tracker = PerfMonitor("TestOfTime") - - self.assertRegex(tracker.summary_str(), "TestOfTime performance: N=0") - - with tracker.sample(): - time.sleep(1) - - self.assertRegex( - tracker.summary_str(), - r"TestOfTime performance: N=1 \| Min=1.[0-9]{2} s \| Max=1.[0-9]{2} s \| Median=1.[0-9]{2} s \| P90=1.[0-9]{2} s", - ) - - with tracker.sample(): - time.sleep(4) - - self.assertRegex( - tracker.summary_str(), - r"TestOfTime performance: N=2 \| Min=1.[0-9]{2} s \| Max=4.[0-9]{2} s \| Median=2.[0-9]{2} s \| P90=3.[0-9]{2} s", - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/utilities/test_utils.py b/tests/utilities/test_utils.py deleted file mode 100644 index 7f35fb6..0000000 --- a/tests/utilities/test_utils.py +++ /dev/null @@ -1,237 +0,0 @@ -import functools -import os -from tempfile import NamedTemporaryFile, TemporaryDirectory -import time -from typing import List, Tuple -import unittest -from unittest import mock -import bittensor as bt - -import torch -import constants - -from utilities.utils import run_in_subprocess -from utilities import utils - - -class TestUtils(unittest.TestCase): - def test_run_in_subprocess(self): - def test_func(a: int, b: int): - return a + b - - partial = functools.partial(test_func, 1, 2) - - result = run_in_subprocess(func=partial, ttl=5) - self.assertEqual(3, result) - - def test_run_in_subprocess_timeout(self): - def test_func(a: int, b: int): - time.sleep(3) - return a + b - - partial = functools.partial(test_func, 1, 2) - - with self.assertRaises(TimeoutError): - result = run_in_subprocess(func=partial, ttl=1) - - def test_run_in_subprocess_no_return(self): - def test_func(a: int, b: int): - pass - - partial = functools.partial(test_func, 1, 2) - - result = run_in_subprocess(func=partial, ttl=5) - self.assertIsNone(result) - - def test_run_in_subprocess_tuple_return(self): - def test_func(a: int, b: int): - return a, b - - partial = functools.partial(test_func, 1, 2) - - result = run_in_subprocess(func=partial, ttl=5) - self.assertEqual((1, 2), result) - - def test_run_in_subprocess_exception(self): - def test_func(a: int, b: int): - raise ValueError() - - partial = functools.partial(test_func, 1, 2) - - with self.assertRaises(ValueError): - result = run_in_subprocess(func=partial, ttl=5) - - def test_validate_hf_repo_id_too_long(self): - with self.assertRaises(ValueError) as ve: - # Max allowed length is 41 characters - utils.validate_hf_repo_id("my-org/" + "a" * 40) - - self.assertRegex( - str(ve.exception), - "Hugging Face repo id must be between 3 and 41 characters", - ) - - def test_validate_hf_repo_id_incorrect_format(self): - with self.assertRaises(ValueError) as ve: - utils.validate_hf_repo_id("my-repo-name-without-a-namespace") - - self.assertRegex( - str(ve.exception), "must be in the format /" - ) - - def test_validate_hf_repo_id_valid(self): - namespace, name = utils.validate_hf_repo_id("my-org/my-repo-name") - self.assertEqual("my-org", namespace) - self.assertEqual("my-repo-name", name) - - def test_save_and_load_version(self): - version = constants.__spec_version__ - with NamedTemporaryFile() as f: - self.assertIsNone(utils.get_version(f.name)) - - utils.save_version(f.name, version) - self.assertEqual(utils.get_version(f.name), version) - - def test_move_if_exists_does_not_move_dst_exists(self): - with NamedTemporaryFile(mode="w+") as f: - f.write("test") - f.flush() - - with NamedTemporaryFile() as f2: - # Destination file exists. Should not move. - self.assertFalse(utils.move_file_if_exists(f.name, f2.name)) - self.assertEqual(b"", f2.read()) - f.seek(0) - self.assertEqual(f.read(), "test") - - def test_move_if_exists_does_not_move_src_missing(self): - with NamedTemporaryFile(mode="w+") as f: - f.write("test") - f.flush() - - self.assertFalse(utils.move_file_if_exists("no_file", f.name)) - - def test_move_if_exists(self): - with TemporaryDirectory() as d: - with open(os.path.join(d, "src"), "w") as f: - f.write("test") - f.flush() - - dst = os.path.join(d, "dst") - - self.assertTrue(utils.move_file_if_exists(f.name, dst)) - self.assertFalse(os.path.exists(f.name)) - self.assertTrue(os.path.exists(dst)) - self.assertEqual(open(dst, "rb").read(), b"test") - - def test_get_top_valis(self): - # Create a metagraph with 10 neurons of varying stake with the top 4 having a validator permit. - mock_metagraph = mock.MagicMock() - mock_metagraph.S = torch.tensor( - [0, 1, 2, 300, 4, 5, 600, 7, 8, 9], dtype=torch.float32 - ) - mock_metagraph.validator_permit = torch.tensor( - [ - False, - False, - False, - True, - False, - False, - True, - False, - True, - True, - ], - dtype=torch.bool, - ) - - # Check top 3. - self.assertEqual(utils.get_top_valis(mock_metagraph, 3), [6, 3, 9]) - - # Check N > valis in the metagraph. - self.assertEqual(utils.get_top_valis(mock_metagraph, 6), [6, 3, 9, 8]) - - def _create_metagraph(self): - """Returns a mocked metagraph with 2 miners and 2 valis.""" - mock_metagraph = mock.MagicMock() - stakes = torch.tensor([0, 200, 2, 30], dtype=torch.float32) - mock_metagraph.S = stakes - mock_metagraph.validator_permit = stakes >= 30 - return mock_metagraph - - def _neuron_info_with_weights( - self, uid: int, weights: List[Tuple[int, float]] - ) -> bt.NeuronInfo: - return bt.NeuronInfo( - uid=uid, - netuid=0, - active=0, - stake=bt.Balance.from_rao(0), - stake_dict={}, - total_stake=bt.Balance.from_rao(0), - rank=0, - emission=0, - incentive=0, - consensus=0, - trust=0, - validator_trust=0, - dividends=0, - last_update=0, - validator_permit=False, - weights=weights, - bonds=[], - prometheus_info=None, - axon_info=None, - is_null=True, - coldkey="000000000000000000000000000000000000000000000000", - hotkey="000000000000000000000000000000000000000000000000", - pruning_score=0, - ) - - def test_list_top_miners_deduplicated(self): - """Tests list_top_miners, when validators agree on the top miner.""" - metagraph = self._create_metagraph() - - # Set validator weights such that they agree on miner 0 as the top miner. - metagraph.neurons = [ - self._neuron_info_with_weights(uid=0, weights=[]), - self._neuron_info_with_weights(uid=1, weights=[(0, 1)]), - self._neuron_info_with_weights(uid=2, weights=[]), - self._neuron_info_with_weights(uid=3, weights=[(0, 1)]), - ] - - # Verify the miner UID is deduped. - self.assertSequenceEqual(utils.list_top_miners(metagraph), [0]) - - def test_list_top_miners_multiple_miners(self): - """Tests list_top_miners, when validators disagree on the top miner.""" - metagraph = self._create_metagraph() - - metagraph.neurons = [ - self._neuron_info_with_weights(uid=0, weights=[]), - self._neuron_info_with_weights(uid=1, weights=[(0, 1)]), - self._neuron_info_with_weights(uid=2, weights=[]), - self._neuron_info_with_weights(uid=3, weights=[(2, 1)]), - ] - top_miners = utils.list_top_miners(metagraph) - self.assertEqual(len(top_miners), 2) - self.assertEqual(set(top_miners), set([0, 2])) - - def test_list_top_miners_multiple_weights_set(self): - """Tests list_top_miners, when validators assign multiple weights""" - metagraph = self._create_metagraph() - - # Have vali 1 set multiple weights, ensuring it assigns more than - # 50% relative weight to UID 0. - metagraph.neurons = [ - self._neuron_info_with_weights(uid=0, weights=[]), - self._neuron_info_with_weights(uid=1, weights=[(0, 1), (1, 0.1), (2, 0.5)]), - self._neuron_info_with_weights(uid=2, weights=[]), - self._neuron_info_with_weights(uid=3, weights=[]), - ] - self.assertEqual(utils.list_top_miners(metagraph), [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/utilities/__init__.py b/utilities/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/utilities/miner_iterator.py b/utilities/miner_iterator.py deleted file mode 100644 index 12b8a27..0000000 --- a/utilities/miner_iterator.py +++ /dev/null @@ -1,61 +0,0 @@ -import bisect -import copy -import threading -from typing import List - -import random - - -class MinerIterator: - """A thread safe infinite iterator to cyclically enumerate the current set of miner UIDs. - - Why? To perform miner evaluations, the validator will enumerate through the miners in order to help ensure - each miner is evaluated at least once per epoch. - """ - - def __init__(self, miner_uids: List[int]): - self.miner_uids = sorted(copy.deepcopy(miner_uids)) - # Start the index at a random position. This helps ensure that miners with high UIDs aren't penalized if - # the validator restarts frequently. - self.index = random.randint(0, len(self.miner_uids) - 1) - self.lock = threading.Lock() - - def __iter__(self): - return self - - def __next__(self) -> int: - with self.lock: - if len(self.miner_uids) == 0: - # This iterator should be infinite. If there are no miner UIDs, raise an error. - raise IndexError("No miner UIDs.") - - uid = self.miner_uids[self.index] - self.index += 1 - if self.index >= len(self.miner_uids): - self.index = 0 - return uid - - def peek(self) -> int: - """Returns the next miner UID without advancing the iterator.""" - with self.lock: - if len(self.miner_uids) == 0: - # This iterator should be infinite. If there are no miner UIDs, raise an error. - raise IndexError("No miner UIDs.") - - return self.miner_uids[self.index] - - def set_miner_uids(self, miner_uids: List[int]): - """Updates the miner UIDs to iterate. - - The iterator will be updated to the first miner uid that is greater than or equal to UID that would be next - returned by the iterator. This helps ensure that frequent updates to the miner_uids does not cause too much - churn in the sequence of UIDs returned by the iterator. - """ - sorted_uids = sorted(copy.deepcopy(miner_uids)) - with self.lock: - next_uid = self.miner_uids[self.index] - new_index = bisect.bisect_left(sorted_uids, next_uid) - if new_index >= len(sorted_uids): - new_index = 0 - self.index = new_index - self.miner_uids = sorted_uids diff --git a/utilities/perf_monitor.py b/utilities/perf_monitor.py deleted file mode 100644 index 1466e39..0000000 --- a/utilities/perf_monitor.py +++ /dev/null @@ -1,67 +0,0 @@ -import time -import numpy as np - - -class PerfSample: - def __init__(self, perf_tracker): - self.perf_tracker = perf_tracker - self.start_time = None - - def __enter__(self): - self.start_time = time.monotonic_ns() - return self - - def __exit__(self, exception_type, exception_value, exception_traceback): - duration = time.monotonic_ns() - self.start_time - self.perf_tracker.samples.append(duration) - - -class PerfMonitor: - """PerfMonitor is a context manager that tracks the performance of a block of code by taking several samples. - - Example: - tracker = PerfMonitor("MyOperation") - for _ in range(10): - with tracker.sample(): - // Do something - - print(tracker.summary_str()) - """ - - def __init__(self, name): - self.name = name - self.samples = [] - - def sample(self) -> PerfSample: - """Returns a context manager that will record the duration of the block it wraps.""" - return PerfSample(self) - - def summary_str(self) -> str: - """Returns a string summarizing the performance of the tracked operation.""" - if not self.samples: - return f"{self.name} performance: N=0" - - durations_ns = np.array(self.samples) - - return ( - f"{self.name} performance: N={len(durations_ns)} | " - + f"Min={self._format_duration(np.min(durations_ns))} | " - + f"Max={self._format_duration(np.max(durations_ns))} | " - + f"Median={self._format_duration(np.median(durations_ns))} | " - + f"P90={self._format_duration(np.percentile(durations_ns, 90))}" - ) - - def _format_duration(self, duration_ns: int) -> str: - units = [ - ("ns", 1), - ("μs", 1000), - ("ms", 1000_000), - ("s", 1000_000_000), - ("min", 60 * 1000_000_000), - ] - - for unit, divisor in reversed(units): - if duration_ns >= divisor: - return f"{duration_ns/divisor:.2f} {unit}" - - return f"{duration_ns:.2f} ns" diff --git a/utilities/utils.py b/utilities/utils.py deleted file mode 100644 index db85ca1..0000000 --- a/utilities/utils.py +++ /dev/null @@ -1,204 +0,0 @@ -import functools -import multiprocessing -import os -import time -from typing import Any, List, Optional, Tuple -import bittensor as bt - -from model.data import ModelId, ModelMetadata - - -def assert_registered(wallet: bt.wallet, metagraph: bt.metagraph) -> int: - """Asserts the wallet is a registered miner and returns the miner's UID. - - Raises: - ValueError: If the wallet is not registered. - """ - if wallet.hotkey.ss58_address not in metagraph.hotkeys: - raise ValueError( - f"You are not registered. \nUse: \n`btcli s register --netuid {metagraph.netuid}` to register via burn \n or btcli s pow_register --netuid {metagraph.netuid} to register with a proof of work" - ) - uid = metagraph.hotkeys.index(wallet.hotkey.ss58_address) - bt.logging.success( - f"You are registered with address: {wallet.hotkey.ss58_address} and uid: {uid}" - ) - - return uid - - -def validate_hf_repo_id(repo_id: str) -> Tuple[str, str]: - """Verifies a Hugging Face repo id is valid and returns it split into namespace and name. - - Raises: - ValueError: If the repo id is invalid. - """ - - if not repo_id: - raise ValueError("Hugging Face repo id cannot be empty.") - - if not 3 < len(repo_id) <= ModelId.MAX_REPO_ID_LENGTH: - raise ValueError( - f"Hugging Face repo id must be between 3 and {ModelId.MAX_REPO_ID_LENGTH} characters. Got={repo_id}" - ) - - parts = repo_id.split("/") - if len(parts) != 2: - raise ValueError( - f"Hugging Face repo id must be in the format /. Got={repo_id}" - ) - - return parts[0], parts[1] - - -def get_hf_url(model_metadata: ModelMetadata) -> str: - """Returns the URL to the Hugging Face repo for the provided model metadata.""" - return f"https://huggingface.co/{model_metadata.id.namespace}/{model_metadata.id.name}/tree/{model_metadata.id.commit}" - - -def _wrapped_func(func: functools.partial, queue: multiprocessing.Queue): - try: - result = func() - queue.put(result) - except (Exception, BaseException) as e: - # Catch exceptions here to add them to the queue. - queue.put(e) - - -def run_in_subprocess(func: functools.partial, ttl: int, mode="fork") -> Any: - """Runs the provided function on a subprocess with 'ttl' seconds to complete. - - Args: - func (functools.partial): Function to be run. - ttl (int): How long to try for in seconds. - - Returns: - Any: The value returned by 'func' - """ - ctx = multiprocessing.get_context(mode) - queue = ctx.Queue() - process = ctx.Process(target=_wrapped_func, args=[func, queue]) - - process.start() - - process.join(timeout=ttl) - - if process.is_alive(): - process.terminate() - process.join() - raise TimeoutError(f"Failed to {func.func.__name__} after {ttl} seconds") - - # Raises an error if the queue is empty. This is fine. It means our subprocess timed out. - result = queue.get(block=False) - - # If we put an exception on the queue then raise instead of returning. - if isinstance(result, Exception): - raise result - if isinstance(result, BaseException): - raise Exception(f"BaseException raised in subprocess: {str(result)}") - - return result - - -def get_version(filepath: str) -> Optional[int]: - """Loads a version from the provided filepath or None if the file does not exist. - - Args: - filepath (str): Path to the version file.""" - if os.path.exists(filepath): - with open(filepath, "r") as f: - line = f.readline() - if line: - return int(line) - return None - return None - - -def save_version(filepath: str, version: int): - """Saves a version to the provided filepath.""" - os.makedirs(os.path.dirname(filepath), exist_ok=True) - with open(filepath, "w") as f: - f.write(str(version)) - - -def move_file_if_exists(src: str, dst: str) -> bool: - """Moves a file from src to dst if it exists. - - Returns: - bool: True if the file was moved, False otherwise. - """ - if os.path.exists(src) and not os.path.exists(dst): - os.makedirs(os.path.dirname(dst), exist_ok=True) - os.replace(src, dst) - return True - return False - - -def list_top_miners(metagraph: bt.metagraph) -> List[int]: - """Returns the list of top miners, chosen based on weights set on the largest valis. - - Args: - metagraph (bt.metagraph): Metagraph to use. Must not be lite. - """ - - top_miners = set() - - # Find the top 10 valis by stake. - valis_by_stake = get_top_valis(metagraph, 10) - - # For each, find the miner that has more than 50% of the weights. - for uid in valis_by_stake: - # Weights is a list of (uid, weight) pairs - weights: List[Tuple[int, float]] = metagraph.neurons[uid].weights - total_weight = sum(weight for _, weight in weights) - - # Only look for miners with at least half the weight from this vali - threshold = total_weight / 2.0 - for uid, weight in weights: - if weight > threshold: - top_miners.add(uid) - # Break now because only 1 miner can have more than half the weight. - break - - return list(top_miners) - - -def get_top_valis(metagraph: bt.metagraph, n: int) -> List[int]: - """Returns the N top validators, ordered by stake descending. - - Returns: - List[int]: Ordered list of UIDs of the top N validators, or all validators if N is greater than the number of validators. - """ - valis = [] - for uid, stake in enumerate(metagraph.S): - # Use vPermit to check for validators rather than vTrust because we'd rather - # cast a wide net in the case that vTrust is 0 due to an unhealthy state of the - # subnet. - if metagraph.validator_permit[uid]: - valis.append((stake, uid)) - - return [uid for _, uid in sorted(valis, reverse=True)[:n]] - - -def run_with_retry(func, max_retries=3, delay_seconds=1, single_try_timeout=30): - """ - Retry a function with constant backoff. - - Parameters: - - func: The function to be retried. - - max_retries: Maximum number of retry attempts (default is 3). - - delay_seconds: Initial delay between retries in seconds (default is 1). - - Returns: - - The result of the successful function execution. - - Raises the exception from the last attempt if all attempts fail. - """ - for attempt in range(1, max_retries + 1): - try: - return func() - except Exception as e: - if attempt == max_retries: - # If it's the last attempt, raise the exception - raise e - # Wait before the next retry. - time.sleep(delay_seconds) - raise Exception("Unexpected state: Ran with retry but didn't hit a terminal state") From 2dec6ee82962e216eae61026b8a5952159cb2d5f Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Sun, 25 Aug 2024 16:34:38 -0700 Subject: [PATCH 02/11] Add max_size requirements to the competitions --- constants/__init__.py | 3 +++ requirements.txt | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/constants/__init__.py b/constants/__init__.py index 2838c56..535fbc6 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -105,6 +105,7 @@ tokenizer="distilgpt2", eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), + max_bytes=5 * 1024 * 1024 * 1024, ), CompetitionId.B7_MODEL: ModelConstraints( max_model_parameter_size=6_900_000_000, @@ -118,6 +119,7 @@ }, eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), + max_bytes=15 * 1024 * 1024 * 1024, ), CompetitionId.B3_MODEL: ModelConstraints( max_model_parameter_size=3_400_000_000, @@ -131,6 +133,7 @@ }, eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), + max_bytes=15 * 1024 * 1024 * 1024, ), } diff --git a/requirements.txt b/requirements.txt index dcfee97..d62cbdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ transformers==4.44.1 wandb datasets flash-attn -taoverse==1.0.2 +taoverse==1.0.4 From b9758aaaea20c28c3761a5fd4934ed899b0f7d3c Mon Sep 17 00:00:00 2001 From: Alan Aboudib Date: Mon, 26 Aug 2024 08:40:00 +0000 Subject: [PATCH 03/11] Added draft 14B competition schedule --- competitions/data.py | 2 ++ constants/__init__.py | 46 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/competitions/data.py b/competitions/data.py index d5bad71..10a9e43 100644 --- a/competitions/data.py +++ b/competitions/data.py @@ -12,6 +12,8 @@ class CompetitionId(IntEnum): B7_MODEL_LOWER_EPSILON = 3 + B14_MODEL = 4 + # Overwrite the default __repr__, which doesn't work with # bt.logging for some unknown reason. def __repr__(self) -> str: diff --git a/constants/__init__.py b/constants/__init__.py index eb321f0..7f0d162 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -60,7 +60,7 @@ WEIGHT_SYNC_VALI_MIN_STAKE = 200_000 # Starting block for 3B, 7B* (epsilon experiment) and sample unpacking -BLOCK_3B_7BSTAR_UNPACK = 3_601_190 +BLOCK_3B_7BSTAR_UNPACK = 3_601#_190 # Minimum percent of weight on a vali for a miner to be considered a top miner. # Since there can be multiple competitions at different reward percentages we can't just check biggest. @@ -98,6 +98,7 @@ CompetitionId.M772_MODEL: pt.dataset.SubsetFalconLoader, CompetitionId.B3_MODEL: pt.dataset.SubsetFalconLoader, CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader, + CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader, } # Defined model constraints by competition id to ensure they are constant across blocks. @@ -134,6 +135,19 @@ }, eval_block_delay=0, ), + CompetitionId.B14_MODEL: ModelConstraints( + max_model_parameter_size=13_900_000_000, + min_model_parameter_size=13_000_000_000, + sequence_length=4096, + allowed_architectures=ALLOWED_MODEL_TYPES_2, + tokenizer="Xenova/gpt-4", + kwargs={ + "torch_dtype": torch.bfloat16, + "attn_implementation": "flash_attention_2", + }, + eval_block_delay=0, + ), + } @@ -150,7 +164,7 @@ ], ), ( - 3_565_190, + 3_565,#_190, [ Competition( CompetitionId.M772_MODEL, @@ -185,6 +199,34 @@ ], ), + ( + 1_000_000, + [ + Competition( + CompetitionId.M772_MODEL, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.M772_MODEL], + 0.14, + ), + Competition( + CompetitionId.B3_MODEL, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B3_MODEL], + 0.29, + ), + Competition( + CompetitionId.B7_MODEL, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL], + 0.07, + ), + Competition( + CompetitionId.B14_MODEL, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL], + 0.5, + ) + + + ], + ), + ] for block_and_competitions in COMPETITION_SCHEDULE_BY_BLOCK: From eb818398923d398c98991faae659201e1b699f85 Mon Sep 17 00:00:00 2001 From: Alan Aboudib Date: Mon, 26 Aug 2024 08:40:00 +0000 Subject: [PATCH 04/11] Added draft 14B competition schedule --- competitions/data.py | 2 ++ constants/__init__.py | 46 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/competitions/data.py b/competitions/data.py index d5bad71..10a9e43 100644 --- a/competitions/data.py +++ b/competitions/data.py @@ -12,6 +12,8 @@ class CompetitionId(IntEnum): B7_MODEL_LOWER_EPSILON = 3 + B14_MODEL = 4 + # Overwrite the default __repr__, which doesn't work with # bt.logging for some unknown reason. def __repr__(self) -> str: diff --git a/constants/__init__.py b/constants/__init__.py index 2838c56..acf5f36 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -59,7 +59,7 @@ WEIGHT_SYNC_VALI_MIN_STAKE = 200_000 # Starting block for 3B, 7B* (epsilon experiment) and sample unpacking -BLOCK_3B_7BSTAR_UNPACK = 3_601_190 +BLOCK_3B_7BSTAR_UNPACK = 3_601#_190 # Minimum percent of weight on a vali for a miner to be considered a top miner. # Since there can be multiple competitions at different reward percentages we can't just check biggest. @@ -93,6 +93,7 @@ CompetitionId.M772_MODEL: pt.dataset.SubsetFalconLoader, CompetitionId.B3_MODEL: pt.dataset.SubsetFalconLoader, CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader, + CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader, } # Defined model constraints by competition id to ensure they are constant across blocks. @@ -132,6 +133,19 @@ eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), ), + CompetitionId.B14_MODEL: ModelConstraints( + max_model_parameter_size=13_900_000_000, + min_model_parameter_size=13_000_000_000, + sequence_length=4096, + allowed_architectures=ALLOWED_MODEL_TYPES_2, + tokenizer="Xenova/gpt-4", + kwargs={ + "torch_dtype": torch.bfloat16, + "attn_implementation": "flash_attention_2", + }, + eval_block_delay=0, + ), + } @@ -148,7 +162,7 @@ ], ), ( - 3_565_190, + 3_565,#_190, [ Competition( CompetitionId.M772_MODEL, @@ -182,6 +196,34 @@ ), ], ), + ( + 1_000_000, + [ + Competition( + CompetitionId.M772_MODEL, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.M772_MODEL], + 0.14, + ), + Competition( + CompetitionId.B3_MODEL, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B3_MODEL], + 0.29, + ), + Competition( + CompetitionId.B7_MODEL, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL], + 0.07, + ), + Competition( + CompetitionId.B14_MODEL, + MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL], + 0.5, + ) + + + ], + ), + ] for block_and_competitions in COMPETITION_SCHEDULE_BY_BLOCK: From e72744f56ebc1389bc427c1a1bb7253a45b6b442 Mon Sep 17 00:00:00 2001 From: cryptal-mc Date: Tue, 27 Aug 2024 13:47:23 +0000 Subject: [PATCH 05/11] Added 14B competition schedule --- constants/__init__.py | 47 +++++++++---------------------------------- neurons/validator.py | 2 ++ 2 files changed, 12 insertions(+), 37 deletions(-) diff --git a/constants/__init__.py b/constants/__init__.py index 617557a..3b9cd50 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -34,7 +34,7 @@ # --------------------------------- # Release -__version__ = "4.1.2" +__version__ = "4.2.0" # Validator schema version __validator_version__ = "3.0.0" @@ -59,7 +59,7 @@ WEIGHT_SYNC_VALI_MIN_STAKE = 200_000 # Starting block for 3B, 7B* (epsilon experiment) and sample unpacking -BLOCK_3B_7BSTAR_UNPACK = 3_601#_190 +BLOCK_3B_7BSTAR_UNPACK = 3_601_190 # Minimum percent of weight on a vali for a miner to be considered a top miner. # Since there can be multiple competitions at different reward percentages we can't just check biggest. @@ -93,7 +93,7 @@ CompetitionId.M772_MODEL: pt.dataset.SubsetFalconLoader, CompetitionId.B3_MODEL: pt.dataset.SubsetFalconLoader, CompetitionId.B7_MODEL: pt.dataset.SubsetFineWebEdu2Loader, - CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader, + CompetitionId.B14_MODEL: pt.dataset.SubsetFineWebEdu2Loader, } # Defined model constraints by competition id to ensure they are constant across blocks. @@ -135,7 +135,7 @@ ), CompetitionId.B14_MODEL: ModelConstraints( max_model_parameter_size=13_900_000_000, - min_model_parameter_size=13_000_000_000, + min_model_parameter_size=13_700_000_000, sequence_length=4096, allowed_architectures=ALLOWED_MODEL_TYPES_2, tokenizer="Xenova/gpt-4", @@ -144,8 +144,9 @@ "attn_implementation": "flash_attention_2", }, eval_block_delay=0, + epsilon_func=FixedEpsilon(0.005), ), - + } @@ -162,7 +163,7 @@ ], ), ( - 3_565,#_190, + 3_565_190, [ Competition( CompetitionId.M772_MODEL, @@ -197,7 +198,7 @@ ], ), ( - 1_000_000, + 3_750_683, [ Competition( CompetitionId.M772_MODEL, @@ -212,45 +213,17 @@ Competition( CompetitionId.B7_MODEL, MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL], - 0.07, + 0.15, ), Competition( CompetitionId.B14_MODEL, MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL], - 0.5, + 0,42, ) - ], ), - ( - 1_000_000, - [ - Competition( - CompetitionId.M772_MODEL, - MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.M772_MODEL], - 0.14, - ), - Competition( - CompetitionId.B3_MODEL, - MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B3_MODEL], - 0.29, - ), - Competition( - CompetitionId.B7_MODEL, - MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B7_MODEL], - 0.07, - ), - Competition( - CompetitionId.B14_MODEL, - MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL], - 0.5, - ) - - ], - ), - ] for block_and_competitions in COMPETITION_SCHEDULE_BY_BLOCK: diff --git a/neurons/validator.py b/neurons/validator.py index 615eb9b..678f19b 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -748,6 +748,7 @@ async def run_step(self): batches = list(dataloader) bt.logging.debug(f"Number of validation batches is {len(batches)}") + bt.logging.debug(f"Batch size is {len(batches[0])}") # This is useful for logging to wandb pages = dataloader.get_page_names() @@ -814,6 +815,7 @@ async def run_step(self): ttl=400, mode="spawn", ) + del model_i except Exception as e: bt.logging.error( From 6f742bc2889cdea846de6a5647a6d26d2750cc80 Mon Sep 17 00:00:00 2001 From: cryptal-mc Date: Tue, 27 Aug 2024 13:51:16 +0000 Subject: [PATCH 06/11] Reverted dependency to Taoverse 1.0.2 --- constants/__init__.py | 3 --- requirements.txt | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/constants/__init__.py b/constants/__init__.py index 05be534..3b9cd50 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -106,7 +106,6 @@ tokenizer="distilgpt2", eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), - max_bytes=5 * 1024 * 1024 * 1024, ), CompetitionId.B7_MODEL: ModelConstraints( max_model_parameter_size=6_900_000_000, @@ -120,7 +119,6 @@ }, eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), - max_bytes=15 * 1024 * 1024 * 1024, ), CompetitionId.B3_MODEL: ModelConstraints( max_model_parameter_size=3_400_000_000, @@ -134,7 +132,6 @@ }, eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), - max_bytes=15 * 1024 * 1024 * 1024, ), CompetitionId.B14_MODEL: ModelConstraints( max_model_parameter_size=13_900_000_000, diff --git a/requirements.txt b/requirements.txt index d62cbdd..dcfee97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ transformers==4.44.1 wandb datasets flash-attn -taoverse==1.0.4 +taoverse==1.0.2 From a07f6ca42f6f343db81458855976fea1325a079c Mon Sep 17 00:00:00 2001 From: cryptal-mc Date: Tue, 27 Aug 2024 15:04:38 +0000 Subject: [PATCH 07/11] Added epsilon experiment deactivation block --- constants/__init__.py | 1 + neurons/validator.py | 1 + 2 files changed, 2 insertions(+) diff --git a/constants/__init__.py b/constants/__init__.py index 535fbc6..5b372f3 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -219,6 +219,7 @@ pages_per_eval_pack = 18 timestamp_epsilon_experiment_start_block = BLOCK_3B_7BSTAR_UNPACK +timestamp_epsilon_experiment_end_block = 3_743_483 timestamp_epsilon_experiment = 0.001 timestamp_epsilon_experiment_weight_percent = 0.123 diff --git a/neurons/validator.py b/neurons/validator.py index 1d6fe36..966486e 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -839,6 +839,7 @@ async def run_step(self): if ( competition.id == CompetitionId.B7_MODEL and cur_block >= constants.timestamp_epsilon_experiment_start_block + and cur_block < constants.timestamp_epsilon_experiment_end_block ): wins_epsilon_experiment, win_rate_epsilon_experiment = ( pt.validation.compute_wins( From c90dcb83c4e5fdbf9ae233bd3b209203a49f880d Mon Sep 17 00:00:00 2001 From: cryptal-mc Date: Tue, 27 Aug 2024 15:29:29 +0000 Subject: [PATCH 08/11] Updated deactivation block to the same as 14B --- constants/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constants/__init__.py b/constants/__init__.py index 5b372f3..7085c0c 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -219,7 +219,7 @@ pages_per_eval_pack = 18 timestamp_epsilon_experiment_start_block = BLOCK_3B_7BSTAR_UNPACK -timestamp_epsilon_experiment_end_block = 3_743_483 +timestamp_epsilon_experiment_end_block = 3_750_683 timestamp_epsilon_experiment = 0.001 timestamp_epsilon_experiment_weight_percent = 0.123 From 4918fa5e338cbc4a37dcded5afb9a8f9d95cea77 Mon Sep 17 00:00:00 2001 From: cryptal-mc Date: Tue, 27 Aug 2024 16:04:02 +0000 Subject: [PATCH 09/11] Bumped validator version and taoverse dependency --- constants/__init__.py | 8 ++++++-- requirements.txt | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/constants/__init__.py b/constants/__init__.py index 3b9cd50..0bfc8ff 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -37,7 +37,7 @@ __version__ = "4.2.0" # Validator schema version -__validator_version__ = "3.0.0" +__validator_version__ = "3.1.0" version_split = __validator_version__.split(".") __spec_version__ = ( (1000 * int(version_split[0])) @@ -106,6 +106,7 @@ tokenizer="distilgpt2", eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), + max_bytes=5 * 1024 * 1024 * 1024, ), CompetitionId.B7_MODEL: ModelConstraints( max_model_parameter_size=6_900_000_000, @@ -119,6 +120,7 @@ }, eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), + max_bytes=15 * 1024 * 1024 * 1024, ), CompetitionId.B3_MODEL: ModelConstraints( max_model_parameter_size=3_400_000_000, @@ -132,6 +134,7 @@ }, eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), + max_bytes=15 * 1024 * 1024 * 1024, ), CompetitionId.B14_MODEL: ModelConstraints( max_model_parameter_size=13_900_000_000, @@ -145,6 +148,7 @@ }, eval_block_delay=0, epsilon_func=FixedEpsilon(0.005), + max_bytes=29 * 1024 * 1024 * 1024, ), } @@ -218,7 +222,7 @@ Competition( CompetitionId.B14_MODEL, MODEL_CONSTRAINTS_BY_COMPETITION_ID[CompetitionId.B14_MODEL], - 0,42, + 0.42, ) ], diff --git a/requirements.txt b/requirements.txt index dcfee97..d62cbdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ transformers==4.44.1 wandb datasets flash-attn -taoverse==1.0.2 +taoverse==1.0.4 From 7fa379957fecda4dbfbaf5a5c187667dcdfa9a5c Mon Sep 17 00:00:00 2001 From: cryptal-mc Date: Tue, 27 Aug 2024 17:13:15 +0000 Subject: [PATCH 10/11] Updated leaderboard links and validator sys requirements --- README.md | 4 ++-- docs/validator.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 35eeca0..1ee9e92 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ --- -[Leaderboard](https://huggingface.co/spaces/macrocosm-os/pretraining-leaderboard) • [Discord](https://discord.gg/bittensor) • [Network](https://taostats.io/subnets/netuid-9/) • [Research](https://bittensor.com/whitepaper) +[Leaderboard](https://www.macrocosmos.ai/sn9) • [Discord](https://discord.gg/bittensor) • [Network](https://taostats.io/subnets/netuid-9/) • [Research](https://bittensor.com/whitepaper) --- @@ -43,7 +43,7 @@ Miners within this subnet are evaluated based on the number of times the model t TL;DR: 1. [Chat](https://discord.gg/bittensor) -2. [Leaderboard](https://huggingface.co/spaces/macrocosm-os/pretraining-leaderboard) +2. [Leaderboard](https://www.macrocosmos.ai/sn9) This repo's main conversation is carried out in the Bittensor [Discord](https://discord.gg/bittensor). Visit the 'pretraining' channel to ask questions and get real time feedback. You can view the ongoing running of the incentive mechanism, the best miners (see 'incentive'), the most in consensus validators (see 'vtrust') using this [taostats link](https://taostats.io/subnets/netuid-9/). The table shows all 256 participant UIDs with corresponding YC stats and earnings. diff --git a/docs/validator.md b/docs/validator.md index 2e490f1..739bdb6 100644 --- a/docs/validator.md +++ b/docs/validator.md @@ -51,7 +51,7 @@ It is important to note that this affects the game theoretics of the incentive l # System Requirements -Validators will need enough disk space to store the models of miners being evaluated. Each model has a max size by block defined in [constants/__init__.py](https://github.com/macrocosm-os/pretraining/blob/main/constants/__init__.py#L57) and the validator has cleanup logic to remove old models. It is recommended to have at least 1 TB of disk space. +Validators will need enough disk space to store the models of miners being evaluated. Each model has a max size by block defined in [constants/__init__.py](https://github.com/macrocosm-os/pretraining/blob/main/constants/__init__.py#L57) and the validator has cleanup logic to remove old models. It is recommended to have at least 2 TB of disk space and 80GB for systerm memory. Validators will need enough processing power to evaluate their model. As of Apr 1st, 2024 it is required to have a GPU that supports [flash attention 2](https://github.com/Dao-AILab/flash-attention) with atleast 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations. From 824c9d4c6149cbcfe5d2ad122abae10a2029da48 Mon Sep 17 00:00:00 2001 From: cryptal-mc Date: Tue, 27 Aug 2024 17:15:23 +0000 Subject: [PATCH 11/11] typo fix --- docs/validator.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/validator.md b/docs/validator.md index 739bdb6..2df3457 100644 --- a/docs/validator.md +++ b/docs/validator.md @@ -51,7 +51,7 @@ It is important to note that this affects the game theoretics of the incentive l # System Requirements -Validators will need enough disk space to store the models of miners being evaluated. Each model has a max size by block defined in [constants/__init__.py](https://github.com/macrocosm-os/pretraining/blob/main/constants/__init__.py#L57) and the validator has cleanup logic to remove old models. It is recommended to have at least 2 TB of disk space and 80GB for systerm memory. +Validators will need enough disk space to store the models of miners being evaluated. Each model has a max size by block defined in [constants/__init__.py](https://github.com/macrocosm-os/pretraining/blob/main/constants/__init__.py#L57) and the validator has cleanup logic to remove old models. It is recommended to have at least 2 TB of disk space and 80GB of system memory. Validators will need enough processing power to evaluate their model. As of Apr 1st, 2024 it is required to have a GPU that supports [flash attention 2](https://github.com/Dao-AILab/flash-attention) with atleast 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations.