
Commit
add decompress tests
horheynm committed Dec 11, 2024
1 parent 7067ad0 commit 126d3d5
Showing 9 changed files with 122 additions and 8 deletions.
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed
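
Each of these YAML files drives one parameterized test case: parse_params reads every config in a directory and parameterized_class stamps the key/value pairs onto the test class as attributes (see test_decompress.py below). A minimal sketch of how such a helper could work, assuming it simply loads each YAML file into a dict — the real parse_params in tests/testing_utils.py may do more (cadence filtering, validation):

import os

import yaml


# hypothetical minimal version of parse_params, for illustration only
def parse_params(config_dir: str) -> list:
    configs = []
    for filename in sorted(os.listdir(config_dir)):
        if filename.endswith((".yml", ".yaml")):
            with open(os.path.join(config_dir, filename)) as f:
                # each dict becomes one set of test-class attributes
                configs.append(yaml.safe_load(f))
    return configs
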
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed
-uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed
+compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed
-uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed
+compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed
-uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed
+compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed
-uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed
+compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
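
Note the shape change in the four configs above: the old compressed/uncompressed stub pair, which required hosting a second pre-decompressed checkpoint, is replaced by a compressed checkpoint plus a dense skeleton_model_stub of the base architecture to decompress into. A condensed sketch of the flow this enables, mirroring the test file below (stub names taken from the first config; a sketch rather than a drop-in utility):

from compressed_tensors.compressors import ModelCompressor
from transformers import AutoConfig, AutoModelForCausalLM

compressed_stub = "nm-testing/tinyllama-fp8-dynamic-compressed"
skeleton_stub = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# rebuild the compressor from the checkpoint's quantization config
config = AutoConfig.from_pretrained(compressed_stub)
compressor = ModelCompressor.from_compression_config(
    getattr(config, "quantization_config", None)
)

# load the dense skeleton, then overwrite its weights in place
skeleton = AutoModelForCausalLM.from_pretrained(skeleton_stub)
compressor.decompress(model_path=compressed_stub, model=skeleton)
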
98 changes: 98 additions & 0 deletions tests/llmcompressor/transformers/compression/test_decompress.py
@@ -0,0 +1,98 @@
import copy
import shutil
import tempfile
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
    compressed_model_stub = None
    skeleton_model_stub = None

    SAMPLE_INPUTS = [
        "I love 4-bit quantization because",
        "What is the capital of France?",
        "def fibonacci(n):",
    ]

    @classmethod
    def setUpClass(cls):
        cls.test_dir = tempfile.mkdtemp()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)

        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
            cls.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
        )

        # load the dense skeleton, matching the compressed model's dtype and device
        cls.dense_model = AutoModelForCausalLM.from_pretrained(
            cls.skeleton_model_stub,
            torch_dtype=cls.compressed_model.dtype,
            device_map=cls.compressed_model.device,
        )

        # the skeleton carries no quantization parameters before decompression
        assert not hasattr(
            cls.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

        cls.decompressed_model = None
        config = AutoConfig.from_pretrained(cls.compressed_model_stub)

        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        cls.compressor = ModelCompressor.from_compression_config(compression_config)
        cls.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )

        # keep a pristine copy of the skeleton for comparison
        dense_model = copy.deepcopy(cls.dense_model)

        # decompress the checkpoint's weights into the dense skeleton, in place
        cls.compressor.decompress(
            model_path=cls.compressed_model_stub,
            model=cls.dense_model,
        )

        # cls.dense_model should now hold the decompressed weights
        assert dense_model is not cls.dense_model

        cls.decompressed_model = cls.dense_model

        # decompression should have populated the quantization parameters
        assert hasattr(
            cls.decompressed_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

    def test_compressed_matches_uncompressed(self):
        for prompt in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True).to(
                self.compressed_model.device
            )
            compressed_output = self.tokenizer.batch_decode(
                self.compressed_model.generate(**inputs, max_length=50)
            )
            uncompressed_output = self.tokenizer.batch_decode(
                self.decompressed_model.generate(**inputs, max_length=50)
            )

            assert compressed_output == uncompressed_output

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.test_dir)
        del cls.compressed_model
        del cls.dense_model
        del cls.decompressed_model
        torch.cuda.empty_cache()
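
As a usage note, these cases can be exercised locally once the configs are in place, e.g. with pytest tests/llmcompressor/transformers/compression/test_decompress.py — assuming the harness selects configs whose cadence field matches the run (for cadence: "commit", typically every commit); the exact selection mechanism lives in parse_params.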
