Bench #1

Open · wants to merge 3 commits into base: fiddler
26 changes: 26 additions & 0 deletions benchmark_prompts.txt
@@ -0,0 +1,26 @@
What is the capital of France?
Describe the process of photosynthesis in detail.
Write a short story about a magical talking cat.
How do airplanes fly?
Explain the concept of time dilation in Einstein's theory of relativity.
What are the main differences between living in a city and living in the countryside?
Write a haiku about the moon.
Discuss the impact of social media on modern society.
What is the chemical formula for water?
Describe your perfect day from start to finish.
How do you make a paper airplane?
Write a dialogue between two characters discussing the meaning of life.
What are the three branches of the United States government and their roles?
Describe the life cycle of a butterfly.
Write a persuasive paragraph on why everyone should learn to play a musical instrument.
What is the difference between a virus and a bacterium?
Create a recipe for a healthy breakfast smoothie.
Discuss the themes present in Shakespeare's play "Hamlet".
What are the five largest countries in the world by population?
Write a letter to your future self in 10 years.
How does the Internet work?
Describe the plot of your favorite movie.
What are the main causes of climate change?
Write a poem about the changing seasons.
Discuss the importance of regular exercise for maintaining good health.
What is the Pythagorean theorem, and how is it used in mathematics?
166 changes: 166 additions & 0 deletions run_benchmark.py
@@ -0,0 +1,166 @@
import sys

sys.path.append("./")

import json
import os
import time

import torch
from tqdm import tqdm
from transformers import AutoConfig, AutoTokenizer, TextStreamer

from hqq.core.quantize import BaseQuantizeConfig
from src.build_model import OffloadConfig, QuantConfig, build_model

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
quantized_model_name = "lavawolfiee/Mixtral-8x7B-Instruct-v0.1-offloading-demo"
state_path = "/home/amangupt/random/mixtral-offloading/Mixtral-8x7B-Instruct-v0.1-offloading-demo"
benchmark_prompts = "benchmark_prompts.txt"

def read_prompt(file_path):
    # Read one benchmark prompt per line, dropping trailing newlines and blanks.
    with open(file_path, "r") as f:
        prompts = [line.strip() for line in f if line.strip()]
    return prompts

all_prompts = read_prompt(benchmark_prompts)
# Only run the first two prompts for now; drop this slice to benchmark all 26.
all_prompts = all_prompts[:2]


config = AutoConfig.from_pretrained(quantized_model_name)

device = torch.device("cuda:0")

##### Change this to 5 if you have only 12 GB of GPU VRAM #####
offload_per_layer = 4
# offload_per_layer = 5
###############################################################

num_experts = config.num_local_experts

offload_config = OffloadConfig(
    main_size=config.num_hidden_layers * (num_experts - offload_per_layer),
    offload_size=config.num_hidden_layers * offload_per_layer,
    buffer_size=4,
    offload_per_layer=offload_per_layer,
)
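# For intuition (Mixtral-8x7B has 32 hidden layers with 8 experts each):
# offload_per_layer=4 keeps 32 * (8 - 4) = 128 experts resident on the GPU and
# offloads the other 32 * 4 = 128 to CPU RAM, while offload_per_layer=5 keeps
# only 32 * 3 = 96 on the GPU to fit in ~12 GB of VRAM.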


attn_config = BaseQuantizeConfig(
    nbits=4,
    group_size=64,
    quant_zero=True,
    quant_scale=True,
)
attn_config["scale_quant_params"]["group_size"] = 256


ffn_config = BaseQuantizeConfig(
    nbits=2,
    group_size=16,
    quant_zero=True,
    quant_scale=True,
)
quant_config = QuantConfig(ffn_config=ffn_config, attn_config=attn_config)
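# As configured above: attention projections are quantized to 4 bits (group
# size 64, with the scale groups widened to 256) and expert FFNs to 2 bits
# (group size 16); quant_zero/quant_scale additionally quantize the zero-points
# and scales themselves (our reading of the HQQ options, not verified here).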


model, expert_cache_obj = build_model(
    device=device,
    quant_config=quant_config,
    offload_config=offload_config,
    state_path=state_path,
)

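# TextStreamer prints tokens to stdout as they are generated, so the model's
# reply streams incrementally after the "Mixtral: " prefix below.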
tokenizer = AutoTokenizer.from_pretrained(model_name)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
past_key_values = None
sequence = None
total_time = []
total_num_tokens = []

seq_len = 0
for i in tqdm(range(len(all_prompts))):
    start = time.time()
    print("User: ", end="")
    user_input = all_prompts[i]
    print(user_input)
    print("\n")

    user_entry = dict(role="user", content=user_input)
    input_ids = tokenizer.apply_chat_template([user_entry], return_tensors="pt").to(device)

    # The KV cache is carried across prompts, so each prompt extends one long
    # conversation; the attention mask must also cover the cached tokens.
    if past_key_values is None:
        attention_mask = torch.ones_like(input_ids)
    else:
        seq_len = input_ids.size(1) + past_key_values[0][0][0].size(1)
        attention_mask = torch.ones([1, seq_len - 1], dtype=torch.int, device=device)

    print("Mixtral: ", end="")
    result = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        streamer=streamer,
        do_sample=True,
        temperature=0.9,
        top_p=0.9,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
        output_hidden_states=True,
    )
    print("\n")
    sequence = result["sequences"]
    past_key_values = result["past_key_values"]
    end = time.time()
    total_time.append(end - start)
    seq_len = sum(len(seq) for seq in sequence)
    total_num_tokens.append(seq_len)
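# Note: the per-prompt token counts above include prompt tokens as well as
# generated ones, so the tokens-per-second figure logged below measures total
# sequence throughput rather than pure decode speed.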

# CHANGE FILENAME HERE
filename = "Initial_attempt"

# Write the human-readable log to track_results/logs/{filename}.txt and the raw
# stats to track_results/data/{filename}.json, creating the directories if needed.
os.makedirs("track_results/logs", exist_ok=True)
os.makedirs("track_results/data", exist_ok=True)
log_file = open(f"track_results/logs/{filename}.txt", "w")
dump_data_file = open(f"track_results/data/{filename}.json", "w")


print("TIME BENCHMARKS", file=log_file)
print(f"Total time taken: {sum(total_time)} seconds", file=log_file)
print(f"Total number of tokens generated: {sum(total_num_tokens)}", file=log_file)
print(f"Average token per second: {sum(total_num_tokens)/sum(total_time)}", file=log_file)
print('\n\n\n', file=log_file)

print("HIT RATE BENCHMARKS", file=log_file)
data_hits = {}

for k in expert_cache_obj.group_infos:
data_hits[k] = expert_cache_obj.group_infos[k].expert_counts
# print(data_hits)
# print overall hit rate and hit rate per layer
overall_hits = 0
overall_misses = 0
for layer in data_hits:
tot_calls = 0
tot_hits = 0
# print(data_hits[layer])
for exp in data_hits[layer]:
tot_calls += data_hits[layer][exp][0]
tot_hits += data_hits[layer][exp][1]
# print(tot_hits, tot_calls)
overall_hits += tot_hits
overall_misses += tot_calls - tot_hits
print(f"Layer {layer}: Hit rate = {tot_hits/tot_calls}", file=log_file)

print(f"Overall hit rate = {overall_hits/(overall_hits + overall_misses)}", file=log_file)


# Dump data_hits, total_time, and total_num_tokens to a JSON file
# (json is imported at the top of the script).
all_stats = {"data_hits": data_hits, "total_time": total_time, "total_num_tokens": total_num_tokens}
json.dump(all_stats, dump_data_file)

log_file.close()
dump_data_file.close()
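
For quick inspection later, a minimal sketch of reading the dumped stats back. It assumes only the JSON layout written above; the path shown simply follows this script's filename convention:

import json

with open("track_results/data/Initial_attempt.json") as f:
    stats = json.load(f)

# Aggregate throughput across all benchmarked prompts.
tokens_per_second = sum(stats["total_num_tokens"]) / sum(stats["total_time"])
print(f"{tokens_per_second:.2f} tokens/s over {len(stats['total_time'])} prompts")
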
18 changes: 7 additions & 11 deletions src/build_model.py
@@ -121,7 +121,7 @@ def get_default_ffn_quant_config(ffn_dim: int = 14336, hidden_dim: int = 4096):


 def make_empty_expert(
-    model_config: MixtralConfig, quant_config: QuantConfig, use_gpu: bool = True
+    model_config: MixtralConfig, quant_config: QuantConfig
 ) -> MixtralBLockSparseTop2MLP_HQQ:
     meta1, meta2 = quant_config.get_ffn_metas(
         model_config.hidden_size, model_config.intermediate_size
@@ -131,7 +131,6 @@ def make_empty_expert(
         quant_config.ffn_config,
         meta1,
         meta2,
-        use_gpu
     )


@@ -141,7 +140,6 @@ def make_and_load_expert_wrapper(
     states_dir: str,
     expert_uid: tuple[int, int],
     device: torch.device,
-    use_gpu: bool = True,
 ) -> MixtralExpertWrapper:
     layer_idx, expert_idx = expert_uid

@@ -151,7 +149,7 @@
         state_fpath = json.load(f)["weight_map"][f"{module_idx}.w1.W_q"]

     state_dict = load_file(os.path.join(states_dir, state_fpath), device=str(device))
-    expert = make_empty_expert(config, quant_config, use_gpu)
+    expert = make_empty_expert(config, quant_config)
     expert.load_state_dict(state_dict, strict=True)

     return MixtralExpertWrapper(expert, device)
@@ -175,11 +173,11 @@ def build_model(

     state_dict_00 = load_00_expert_state_dict(state_path, device)

-    def _make_module(use_gpu=True):
+    def _make_module():
         config = AutoConfig.from_pretrained(model_name)
-        expert = make_empty_expert(config, quant_config, use_gpu)
+        expert = make_empty_expert(config, quant_config)
         expert.load_state_dict(state_dict_00)
-        return MixtralExpertWrapper(expert, device=device if use_gpu else 'cpu')
+        return MixtralExpertWrapper(expert, device=device)

     with device, with_default_dtype(torch.float16):
         model = MixtralForCausalLM(
@@ -208,7 +206,6 @@ def _make_module(use_gpu=True):
         main_size=offload_config.main_size,
         offload_size=offload_config.offload_size,
         buffer_size=offload_config.buffer_size,
-        fiddler=True,
     )
     for layer_idx in trange(model_config.num_hidden_layers, desc="Loading experts"):
         curr_layer = model.model.layers[layer_idx]
@@ -227,8 +224,7 @@ def _make_module(use_gpu=True):
                 quant_config=quant_config,
                 states_dir=state_path,
                 expert_uid=(layer_idx, expert_idx),
-                device='cpu' if do_offload else device,
-                use_gpu=not do_offload,
+                device=device,
             )

             expert_cache.add_expert(
@@ -242,4 +238,4 @@ def _make_module(use_gpu=True):
     torch.cuda.synchronize(device)
     torch.cuda.empty_cache()

-    return model
+    return model, expert_cache
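
For reference, a minimal sketch of how a caller consumes the new two-value return, mirroring run_benchmark.py above (device, quant_config, offload_config, and state_path are assumed to be set up as in that script; group_infos and expert_counts are the attributes the benchmark reads):

model, expert_cache = build_model(
    device=device,
    quant_config=quant_config,
    offload_config=offload_config,
    state_path=state_path,
)

# Inspect the per-layer expert request/hit counters gathered by the cache.
for layer, info in expert_cache.group_infos.items():
    print(layer, info.expert_counts)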