-
Notifications
You must be signed in to change notification settings - Fork 72
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
122 additions
and
8 deletions.
There are no files selected for viewing
4 changes: 4 additions & 0 deletions
4
tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Decompression regression test config: decompress the compressed FP8-dynamic
# TinyLlama checkpoint and compare it against its uncompressed counterpart.
cadence: "commit"
test_type: "regression"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed
4 changes: 4 additions & 0 deletions
4
tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Decompression regression test config: decompress the compressed W4A16 (group
# size 128) TinyLlama checkpoint and compare against its uncompressed counterpart.
cadence: "commit"
test_type: "regression"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed
4 changes: 4 additions & 0 deletions
4
tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Decompression regression test config: decompress the compressed W8A16 (group
# size 128) TinyLlama checkpoint and compare against its uncompressed counterpart.
cadence: "commit"
test_type: "regression"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed
4 changes: 4 additions & 0 deletions
4
tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Decompression regression test config: decompress the compressed W8A8
# dynamic-per-token TinyLlama checkpoint and compare against its uncompressed
# counterpart.
cadence: "commit"
test_type: "regression"
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed
4 changes: 2 additions & 2 deletions
4
tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
cadence: "commit" | ||
test_type: "regression" | ||
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_compressed | ||
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_FP8_Dynamic_uncompressed | ||
compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" | ||
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" |
4 changes: 2 additions & 2 deletions
4
tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
cadence: "commit" | ||
test_type: "regression" | ||
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_compressed | ||
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W4A16_G128_uncompressed | ||
compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" | ||
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" |
4 changes: 2 additions & 2 deletions
4
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
cadence: "commit" | ||
test_type: "regression" | ||
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_compressed | ||
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A16_G128_uncompressed | ||
compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" | ||
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" |
4 changes: 2 additions & 2 deletions
4
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
cadence: "commit" | ||
test_type: "regression" | ||
compressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_compressed | ||
uncompressed_model_stub: horheynm/TinyLlama_1.1B_Chat_v1.0_W8A8_Dynamic_Per_Token_uncompressed | ||
compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" | ||
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" |
98 changes: 98 additions & 0 deletions
98
tests/llmcompressor/transformers/compression/test_decompress.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
import copy | ||
import shutil | ||
import tempfile | ||
import unittest | ||
|
||
import torch | ||
from compressed_tensors import QUANTIZATION_CONFIG_NAME | ||
from compressed_tensors.compressors import ModelCompressor | ||
from compressed_tensors.quantization import QuantizationStatus | ||
from parameterized import parameterized_class | ||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer | ||
|
||
from tests.testing_utils import parse_params, requires_gpu | ||
|
||
CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs" | ||
|
||
|
||
@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
    """Check that a model loaded in compressed form and the same checkpoint
    decompressed into a dense skeleton produce identical generations.

    Parameterized over the yaml configs in ``CONFIG_DIR``; each config
    supplies ``compressed_model_stub`` (the compressed checkpoint) and
    ``skeleton_model_stub`` (the matching dense architecture without
    quantization parameters).
    """

    # Overwritten per test class by @parameterized_class from the configs.
    compressed_model_stub = None
    skeleton_model_stub = None

    SAMPLE_INPUTS = [
        "I love 4-bit quantization because",
        "What is the capital of France?",
        "def fibonacci(n):",
    ]

    @classmethod
    def setUpClass(cls):
        # Fixed: classmethod's first parameter was named `self`; the
        # conventional name for a classmethod receiver is `cls`.
        cls.test_dir = tempfile.mkdtemp()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.compressed_model_stub)

        # Reference model loaded directly from the compressed checkpoint.
        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
            cls.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
        )

        # Dense skeleton matching the compressed model's dtype and device.
        cls.dense_model = AutoModelForCausalLM.from_pretrained(
            cls.skeleton_model_stub,
            torch_dtype=cls.compressed_model.dtype,
            device_map=cls.compressed_model.device,
        )

        # Sanity check: the skeleton carries no quantization parameters yet.
        assert not hasattr(
            cls.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

        cls.decompressed_model = None
        config = AutoConfig.from_pretrained(cls.compressed_model_stub)

        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        cls.compressor = ModelCompressor.from_compression_config(compression_config)
        cls.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )

        # Snapshot of the skeleton taken before decompression.
        # NOTE(review): the identity assert below is trivially true for a
        # deepcopy; it only guards against decompress() rebinding the model.
        dense_model = copy.deepcopy(cls.dense_model)

        # Overwrite the skeleton's weights with the decompressed weights
        # read from the compressed checkpoint path.
        cls.compressor.decompress(
            model_path=cls.compressed_model_stub,
            model=cls.dense_model,
        )

        # cls.dense_model should have been decompressed in place.
        assert dense_model is not cls.dense_model

        cls.decompressed_model = cls.dense_model

        # Decompression must have materialized the quantization parameters.
        assert hasattr(
            cls.decompressed_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

    def test_compressed_matches_uncompressed(self):
        """Generations from the compressed and decompressed models match."""
        # Fixed: loop variable was named `input`, shadowing the builtin.
        for prompt in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True).to(
                self.compressed_model.device
            )
            compressed_output = self.tokenizer.batch_decode(
                self.compressed_model.generate(**inputs, max_length=50)
            )
            uncompressed_output = self.tokenizer.batch_decode(
                self.decompressed_model.generate(**inputs, max_length=50)
            )

            assert compressed_output == uncompressed_output

    @classmethod
    def tearDownClass(cls):
        # Fixed: classmethod's first parameter was named `self` (see setUpClass).
        shutil.rmtree(cls.test_dir)
        del cls.compressed_model
        del cls.dense_model
        del cls.decompressed_model
        torch.cuda.empty_cache()