Added sana models (#387)
dsingal0 authored Dec 7, 2024
1 parent f47871d commit 363e5a7
Showing 507 changed files with 132,678 additions and 183 deletions.
7 changes: 4 additions & 3 deletions .pre-commit-config.yaml
@@ -1,7 +1,7 @@
fail_fast: true
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v5.0.0
hooks:
- id: check-added-large-files
args: ["--maxkb=500"]
@@ -13,15 +13,16 @@ repos:
- id: check-merge-conflict
- id: check-symlinks
- id: debug-statements

- repo: https://github.com/psf/black
rev: 22.10.0
rev: 24.10.0
hooks:
- id: black
# It is recommended to specify the latest version of Python
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.11
language_version: python3
- repo: local
hooks:
- id: isort
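Most of the Python-only hunks in the rest of this diff are mechanical reformatting that follows from the Black bump above (22.10.0 → 24.10.0), the kind of churn typically produced by re-running `pre-commit run --all-files` after such an upgrade. Below is a minimal before/after sketch of the most common pattern, using a hypothetical `settings` dict rather than any file from this repository:

```python
settings = {}  # hypothetical dict standing in for os.environ / hypar in the hunks below

# Shape produced by Black 22.x when a trailing comment pushes the line over
# the length limit: the subscript target is split across lines.
settings[
    "worker_method"
] = "spawn"  # for multiprocessing to work with CUDA

# Shape produced by Black 24.x: the assigned value and its comment are
# wrapped in parentheses instead, which is what most hunks below show.
settings["worker_method"] = (
    "spawn"  # for multiprocessing to work with CUDA
)
```

Both forms assign the same value; only the layout changes, so these hunks carry no behavioral risk.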
1 change: 1 addition & 0 deletions 02-llm/model/model.py
@@ -12,6 +12,7 @@
# We use the 7B version of the Mistral model.
CHECKPOINT = "mistralai/Mistral-7B-v0.1"


# # Define the `Model` class and load function
#
# In the `load` function of the Truss, we implement logic involved in
1 change: 1 addition & 0 deletions 04-image-generation/model/model.py
@@ -17,6 +17,7 @@
# The following line is needed to enable TF32 on NVIDIA GPUs
torch.backends.cuda.matmul.allow_tf32 = True


# # Define the `Model` class and load function
#
# In the `load` function of the Truss, we implement logic involved in
1 change: 1 addition & 0 deletions clip/model/model.py
@@ -19,6 +19,7 @@
# This is the CLIP model from Hugging Face that we will use for this example.
CHECKPOINT = "openai/clip-vit-base-patch32"


# # Define the Truss
#
# In the `load` method, we load in the pretrained CLIP model from the
12 changes: 6 additions & 6 deletions dis-segmentation/model/helpers.py
@@ -122,14 +122,14 @@ def create_hyper_parameters():
hypar = {} # paramters for inferencing
hypar["model_path"] = "./saved_models" ## load trained weights from this path
hypar["restore_model"] = "isnet.pth" ## name of the to-be-loaded weights
hypar[
"interm_sup"
] = False ## indicate if activate intermediate feature supervision
hypar["interm_sup"] = (
False ## indicate if activate intermediate feature supervision
)

## choose floating point accuracy --
hypar[
"model_digit"
] = "full" ## indicates "half" or "full" accuracy of float number
hypar["model_digit"] = (
"full" ## indicates "half" or "full" accuracy of float number
)
hypar["seed"] = 0

hypar["cache_size"] = [
9 changes: 4 additions & 5 deletions gemma/gemma-2-27b-it-vllm/model/model.py
@@ -4,16 +4,15 @@
import uuid

from transformers import AutoTokenizer
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

from vllm import SamplingParams

os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ[
"VLLM_WORKER_MULTIPROC_METHOD"
] = "spawn" # for multiprocessing to work with CUDA
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = (
"spawn" # for multiprocessing to work with CUDA
)
logger = logging.getLogger(__name__)


8 changes: 5 additions & 3 deletions gemma/gemma-7b-instruct-trtllm/model/model.py
@@ -39,9 +39,11 @@ def load(self):

self.triton_server.create_model_repository(
truss_data_dir=self._data_dir,
engine_repository_path=build_config.engine_repository
if not build_config.requires_build
else None,
engine_repository_path=(
build_config.engine_repository
if not build_config.requires_build
else None
),
huggingface_auth_token=hf_access_token,
)

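The `engine_repository_path` hunks in this file and in the TensorRT-LLM models below all show another facet of the same formatter upgrade: a conditional expression passed as a keyword argument now gets its own parentheses. A self-contained sketch with hypothetical names (not the real Triton helper):

```python
def create_model_repository(engine_repository_path=None):
    """Hypothetical stand-in for the helper touched in these hunks."""
    return engine_repository_path


requires_build = False             # hypothetical flags, for illustration only
engine_repository = "/data/engine"

# Older layout: the bare conditional is split across continuation lines.
path = create_model_repository(
    engine_repository_path=engine_repository
    if not requires_build
    else None,
)

# Newer layout: the conditional is wrapped in parentheses as a single unit.
path = create_model_repository(
    engine_repository_path=(
        engine_repository if not requires_build else None
    ),
)
```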
@@ -50,9 +50,11 @@ def load(self):

self.triton_server.create_model_repository(
truss_data_dir=self._data_dir,
engine_repository_path=build_config.engine_repository
if not build_config.requires_build
else None,
engine_repository_path=(
build_config.engine_repository
if not build_config.requires_build
else None
),
huggingface_auth_token=hf_access_token,
)

@@ -2,12 +2,7 @@
from itertools import count

import build_engine_utils
from constants import (
GRPC_SERVICE_PORT,
HF_AUTH_KEY_CONSTANT,
HTTP_SERVICE_PORT,
TOKENIZER_KEY_CONSTANT,
)
from constants import GRPC_SERVICE_PORT, HF_AUTH_KEY_CONSTANT, HTTP_SERVICE_PORT, TOKENIZER_KEY_CONSTANT
from schema import ModelInput, TrussBuildConfig
from transformers import AutoTokenizer
from triton_client import TritonClient, TritonServer
@@ -50,9 +45,11 @@ def load(self):

self.triton_server.create_model_repository(
truss_data_dir=self._data_dir,
engine_repository_path=build_config.engine_repository
if not build_config.requires_build
else None,
engine_repository_path=(
build_config.engine_repository
if not build_config.requires_build
else None
),
huggingface_auth_token=hf_access_token,
)

8 changes: 5 additions & 3 deletions llama/llama-3-8b-instruct-trt-llm-fp8/model/model.py
@@ -50,9 +50,11 @@ def load(self):

self.triton_server.create_model_repository(
truss_data_dir=self._data_dir,
engine_repository_path=build_config.engine_repository
if not build_config.requires_build
else None,
engine_repository_path=(
build_config.engine_repository
if not build_config.requires_build
else None
),
huggingface_auth_token=hf_access_token,
)

8 changes: 5 additions & 3 deletions llama/llama-3-8b-instruct-trt-llm/model/model.py
@@ -50,9 +50,11 @@ def load(self):

self.triton_server.create_model_repository(
truss_data_dir=self._data_dir,
engine_repository_path=build_config.engine_repository
if not build_config.requires_build
else None,
engine_repository_path=(
build_config.engine_repository
if not build_config.requires_build
else None
),
huggingface_auth_token=hf_access_token,
)

9 changes: 4 additions & 5 deletions llama/llama-3_1-405b-instruct/model/model.py
@@ -8,15 +8,14 @@

patch()

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

from vllm import SamplingParams

os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ[
"VLLM_WORKER_MULTIPROC_METHOD"
] = "spawn" # for multiprocessing to work with CUDA
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = (
"spawn" # for multiprocessing to work with CUDA
)
logger = logging.getLogger(__name__)


9 changes: 4 additions & 5 deletions llama/llama-3_1_70b-instruct/model/model.py
@@ -8,15 +8,14 @@

patch()

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

from vllm import SamplingParams

os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ[
"VLLM_WORKER_MULTIPROC_METHOD"
] = "spawn" # for multiprocessing to work with CUDA
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = (
"spawn" # for multiprocessing to work with CUDA
)
logger = logging.getLogger(__name__)


@@ -1,4 +1,5 @@
"""Generate json file for webpage."""

import json
import os
import re
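This hunk, and several identical ones below, adds nothing but a blank line after the module docstring; this appears to come from Black 24.x's rule that a module docstring be separated from the following code by one empty line. A trivial sketch with a hypothetical module:

```python
"""Hypothetical module docstring, for illustration only."""

# Black 22.x accepted code directly on the next line; the upgraded Black
# inserts exactly one blank line between the module docstring and this import.
import argparse

parser = argparse.ArgumentParser(description=__doc__)
```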
@@ -1,4 +1,5 @@
"""Generate answers with GPT-3.5"""

# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
import argparse
import concurrent.futures
1 change: 1 addition & 0 deletions llava/llava-v1.5-7b/packages/llava/model/apply_delta.py
@@ -2,6 +2,7 @@
Usage:
python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
"""

import argparse

import torch
1 change: 1 addition & 0 deletions llava/llava-v1.5-7b/packages/llava/model/consolidate.py
@@ -2,6 +2,7 @@
Usage:
python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
"""

import argparse

import torch
@@ -1,4 +1,5 @@
"""Attention layers."""

import math
import warnings
from typing import Optional
@@ -1,4 +1,5 @@
"""GPT Blocks used for the GPT Model."""

from typing import Dict, Optional, Tuple

import torch
@@ -1,4 +1,5 @@
"""A HuggingFace-style model configuration."""

from typing import Dict, Optional, Union

from transformers import PretrainedConfig
@@ -132,7 +133,7 @@ def __init__(
self._validate_config()

def _set_config_defaults(self, config, config_defaults):
for (k, v) in config_defaults.items():
for k, v in config_defaults.items():
if k not in config:
config[k] = v
return config
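The `for (k, v) in config_defaults.items():` → `for k, v in ...` change in the hunk above, repeated in several files below, appears to be another normalization picked up in the jump from Black 22.x to 24.x: redundant parentheses around tuple targets in `for` statements are dropped. A small sketch with hypothetical data:

```python
pairs = [("a", 1), ("b", 2)]  # hypothetical data, for illustration only

# Older form, as these files were previously formatted:
for (key, value) in pairs:
    print(key, value)

# Form produced after the upgrade: the parentheses around the tuple
# target are removed; iteration behavior is unchanged.
for key, value in pairs:
    print(key, value)
```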
@@ -41,6 +41,7 @@
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
- Triton version supports attention bias, while CUDA version doesn't.
"""

import math

import torch
@@ -6,6 +6,7 @@
Prefix LMs accepts a `bidirectional_mask` input in `forward`
and treat the input prompt as the prefix in `generate`.
"""

import math
import warnings
from types import MethodType
@@ -20,19 +21,15 @@
CrossEntropyLoss,
)
from transformers.models.bloom.modeling_bloom import _expand_mask as _expand_mask_bloom
from transformers.models.bloom.modeling_bloom import (
_make_causal_mask as _make_causal_mask_bloom,
)
from transformers.models.bloom.modeling_bloom import _make_causal_mask as _make_causal_mask_bloom
from transformers.models.bloom.modeling_bloom import logging
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
from transformers.models.opt.modeling_opt import OPTForCausalLM
from transformers.models.opt.modeling_opt import _expand_mask as _expand_mask_opt
from transformers.models.opt.modeling_opt import (
_make_causal_mask as _make_causal_mask_opt,
)
from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt

logger = logging.get_logger(__name__)
_SUPPORTED_GPT_MODELS = (
@@ -357,7 +354,7 @@ def forward(
input_shape=(batch_size, seq_length),
past_key_values_length=past_key_values_length,
)
for (i, (block, layer_past)) in enumerate(zip(self.h, past_key_values)):
for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
if output_hidden_states:
hst = (hidden_states,)
all_hidden_states = all_hidden_states + hst
@@ -742,7 +739,7 @@ def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
if "bidirectional_mask" not in batch:
if batch.get("mode", None) == "icl_task":
batch["bidirectional_mask"] = batch["attention_mask"].clone()
for (i, continuation_indices) in enumerate(batch["continuation_indices"]):
for i, continuation_indices in enumerate(batch["continuation_indices"]):
batch["bidirectional_mask"][i, continuation_indices] = 0
elif "labels" in batch and "attention_mask" in batch:
batch["bidirectional_mask"] = torch.logical_and(
@@ -2,6 +2,7 @@
Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""

import math
import warnings
from typing import List, Optional, Tuple, Union
@@ -10,20 +11,14 @@
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
)
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast

from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
from .attention import attn_bias_shape, build_attn_bias
from .blocks import MPTBlock
from .configuration_mpt import MPTConfig
from .custom_embedding import SharedEmbedding
from .hf_prefixlm_converter import (
add_bidirectional_mask_if_missing,
convert_hf_causal_lm_to_prefix_lm,
)
from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
from .meta_init_context import init_empty_weights
from .norm import NORM_CLASS_REGISTRY
from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
@@ -326,7 +321,7 @@ def forward(
past_key_values = [() for _ in range(self.config.n_layers)]
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
for (b_idx, block) in enumerate(self.blocks):
for b_idx, block in enumerate(self.blocks):
if output_hidden_states:
assert all_hidden_states is not None
all_hidden_states = all_hidden_states + (x,)
@@ -24,7 +24,7 @@ def fused_init_helper_(module: nn.Module, init_fn_):
raise RuntimeError(f"Internal logic error")
(dim, splits) = _fused
splits = (0, *splits, module.weight.size(dim))
for (s, e) in zip(splits[:-1], splits[1:]):
for s, e in zip(splits[:-1], splits[1:]):
slice_indices = [slice(None)] * module.weight.ndim
slice_indices[dim] = slice(s, e)
init_fn_(module.weight[slice_indices])
@@ -128,7 +128,7 @@ def generic_param_init_fn_(
assert d_model is not None
_d = d_model
splits = (0, _d, 2 * _d, 3 * _d)
for (s, e) in zip(splits[:-1], splits[1:]):
for s, e in zip(splits[:-1], splits[1:]):
init_fn_(module.in_proj_weight[s:e])
else:
assert (
1 change: 1 addition & 0 deletions llava/llava-v1.5-7b/packages/llava/model/make_delta.py
@@ -2,6 +2,7 @@
Usage:
python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
"""

import argparse

import torch