Bump minimum TorchAO version to 0.7.0 #10293
Changes from all commits: 0f812cd, bb2fb98, 497c56d, b0e5e5b, 6e89718, 5dc6225
```diff
@@ -36,7 +36,7 @@
     nightly,
     require_torch,
     require_torch_gpu,
-    require_torchao_version_greater,
+    require_torchao_version_greater_or_equal,
     slow,
     torch_device,
 )
```
```diff
@@ -74,13 +74,13 @@ def forward(self, input, *args, **kwargs):

 if is_torchao_available():
     from torchao.dtypes import AffineQuantizedTensor
-    from torchao.dtypes.affine_quantized_tensor import TensorCoreTiledLayoutType
     from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor
+    from torchao.utils import get_model_size_in_bytes


 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoConfigTest(unittest.TestCase):
     def test_to_dict(self):
         """
```
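The renamed decorator lives in `diffusers.utils.testing_utils`, whose change is not part of this excerpt. A minimal sketch of what an inclusive version gate like this can look like (the use of `importlib.metadata` and the skip message are assumptions, not the actual diffusers implementation):

```python
import importlib.metadata
import unittest

from packaging import version


def require_torchao_version_greater_or_equal(torchao_version):
    # Skip the decorated test unless torchao is installed and its version is at
    # least `torchao_version`. The inclusive `>=` comparison is the point of the
    # rename: a strict "greater than" gate would not match the pinned minimum itself.
    def decorator(test_case):
        try:
            installed = importlib.metadata.version("torchao")
        except importlib.metadata.PackageNotFoundError:
            installed = None
        ok = installed is not None and version.parse(installed) >= version.parse(torchao_version)
        return unittest.skipUnless(ok, f"test requires torchao>={torchao_version}")(test_case)

    return decorator
```

With the comparison made inclusive, decorating the test classes with `require_torchao_version_greater_or_equal("0.7.0")` lets the suite run on exactly the new minimum version.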
```diff
@@ -125,7 +125,7 @@ def test_repr(self):
 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoTest(unittest.TestCase):
     def tearDown(self):
         gc.collect()
```
```diff
@@ -139,11 +139,13 @@ def get_dummy_components(self, quantization_config: TorchAoConfig):
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )
-        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
-        text_encoder_2 = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
+        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
+        text_encoder_2 = T5EncoderModel.from_pretrained(
+            model_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
+        )
         tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
         tokenizer_2 = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer_2")
-        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
+        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.bfloat16)
         scheduler = FlowMatchEulerDiscreteScheduler()

         return {
```
```diff
@@ -212,7 +214,7 @@ def get_dummy_tensor_inputs(self, device=None, seed: int = 0):
     def _test_quant_type(self, quantization_config: TorchAoConfig, expected_slice: List[float]):
         components = self.get_dummy_components(quantization_config)
         pipe = FluxPipeline(**components)
-        pipe.to(device=torch_device, dtype=torch.bfloat16)
+        pipe.to(device=torch_device)

         inputs = self.get_dummy_inputs(torch_device)
         output = pipe(**inputs)[0]
```
```diff
@@ -276,7 +278,6 @@ def test_int4wo_quant_bfloat16_conversion(self):
         self.assertTrue(isinstance(weight, AffineQuantizedTensor))
         self.assertEqual(weight.quant_min, 0)
         self.assertEqual(weight.quant_max, 15)
-        self.assertTrue(isinstance(weight.layout_type, TensorCoreTiledLayoutType))

     def test_device_map(self):
         """
```
```diff
@@ -341,21 +342,33 @@ def test_device_map(self):

     def test_modules_to_not_convert(self):
         quantization_config = TorchAoConfig("int8_weight_only", modules_to_not_convert=["transformer_blocks.0"])
-        quantized_model = FluxTransformer2DModel.from_pretrained(
+        quantized_model_with_not_convert = FluxTransformer2DModel.from_pretrained(
             "hf-internal-testing/tiny-flux-pipe",
             subfolder="transformer",
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )

-        unquantized_layer = quantized_model.transformer_blocks[0].ff.net[2]
+        unquantized_layer = quantized_model_with_not_convert.transformer_blocks[0].ff.net[2]
         self.assertTrue(isinstance(unquantized_layer, torch.nn.Linear))
         self.assertFalse(isinstance(unquantized_layer.weight, AffineQuantizedTensor))
         self.assertEqual(unquantized_layer.weight.dtype, torch.bfloat16)

-        quantized_layer = quantized_model.proj_out
+        quantized_layer = quantized_model_with_not_convert.proj_out
         self.assertTrue(isinstance(quantized_layer.weight, AffineQuantizedTensor))
         self.assertEqual(quantized_layer.weight.layout_tensor.data.dtype, torch.int8)

+        quantization_config = TorchAoConfig("int8_weight_only")
+        quantized_model = FluxTransformer2DModel.from_pretrained(
+            "hf-internal-testing/tiny-flux-pipe",
+            subfolder="transformer",
+            quantization_config=quantization_config,
+            torch_dtype=torch.bfloat16,
+        )
+
+        size_quantized_with_not_convert = get_model_size_in_bytes(quantized_model_with_not_convert)
+        size_quantized = get_model_size_in_bytes(quantized_model)
+
+        self.assertTrue(size_quantized < size_quantized_with_not_convert)

     def test_training(self):
         quantization_config = TorchAoConfig("int8_weight_only")
```

> **Review comment** (on the added size comparison): Not related to bumping the version, but it makes for a more meaningful test.
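In that spirit, a standalone sketch of the comparison the strengthened test makes, using the same tiny checkpoint: quantizing every eligible layer should yield a smaller model than quantizing everything except `transformer_blocks.0`, because the skipped block stays in bfloat16.

```python
import torch
from diffusers import FluxTransformer2DModel, TorchAoConfig
from torchao.utils import get_model_size_in_bytes

model_id = "hf-internal-testing/tiny-flux-pipe"

# Quantize every eligible layer with int8 weight-only quantization.
fully_quantized = FluxTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=TorchAoConfig("int8_weight_only"),
    torch_dtype=torch.bfloat16,
)

# Same config, but leave the first transformer block in bfloat16.
partially_quantized = FluxTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=TorchAoConfig("int8_weight_only", modules_to_not_convert=["transformer_blocks.0"]),
    torch_dtype=torch.bfloat16,
)

# The skipped block keeps 16-bit weights, so the partially quantized model is larger.
assert get_model_size_in_bytes(fully_quantized) < get_model_size_in_bytes(partially_quantized)
```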
```diff
@@ -406,23 +419,6 @@ def test_torch_compile(self):
         # Note: Seems to require higher tolerance
         self.assertTrue(np.allclose(normal_output, compile_output, atol=1e-2, rtol=1e-3))

-    @staticmethod
-    def _get_memory_footprint(module):
-        quantized_param_memory = 0.0
-        unquantized_param_memory = 0.0
-
-        for param in module.parameters():
-            if param.__class__.__name__ == "AffineQuantizedTensor":
-                data, scale, zero_point = param.layout_tensor.get_plain()
-                quantized_param_memory += data.numel() + data.element_size()
-                quantized_param_memory += scale.numel() + scale.element_size()
-                quantized_param_memory += zero_point.numel() + zero_point.element_size()
-            else:
-                unquantized_param_memory += param.data.numel() * param.data.element_size()
-
-        total_memory = quantized_param_memory + unquantized_param_memory
-        return total_memory, quantized_param_memory, unquantized_param_memory
-
     def test_memory_footprint(self):
         r"""
         A simple test to check if the model conversion has been done correctly by checking on the
```

> **Review comment** (on the removed `_get_memory_footprint` helper): Same reason as above for removing this.
```diff
@@ -433,20 +429,18 @@ def test_memory_footprint(self):
         transformer_int8wo = self.get_dummy_components(TorchAoConfig("int8wo"))["transformer"]
         transformer_bf16 = self.get_dummy_components(None)["transformer"]

-        total_int4wo, quantized_int4wo, unquantized_int4wo = self._get_memory_footprint(transformer_int4wo)
-        total_int4wo_gs32, quantized_int4wo_gs32, unquantized_int4wo_gs32 = self._get_memory_footprint(
-            transformer_int4wo_gs32
-        )
-        total_int8wo, quantized_int8wo, unquantized_int8wo = self._get_memory_footprint(transformer_int8wo)
-        total_bf16, quantized_bf16, unquantized_bf16 = self._get_memory_footprint(transformer_bf16)
-
-        self.assertTrue(quantized_bf16 == 0 and total_bf16 == unquantized_bf16)
-        # int4wo_gs32 has smaller group size, so more groups -> more scales and zero points
-        self.assertTrue(total_int8wo < total_bf16 < total_int4wo_gs32)
-        # int4 with default group size quantized very few linear layers compared to a smaller group size of 32
-        self.assertTrue(quantized_int4wo < quantized_int4wo_gs32 and unquantized_int4wo > unquantized_int4wo_gs32)
-        self.assertTrue(quantized_int8wo < quantized_int4wo)
+        total_int4wo = get_model_size_in_bytes(transformer_int4wo)
+        total_int4wo_gs32 = get_model_size_in_bytes(transformer_int4wo_gs32)
+        total_int8wo = get_model_size_in_bytes(transformer_int8wo)
+        total_bf16 = get_model_size_in_bytes(transformer_bf16)
+
+        # Latter has smaller group size, so more groups -> more scales and zero points
+        self.assertTrue(total_int4wo < total_int4wo_gs32)
+        # int8 quantizes more layers compare to int4 with default group size
+        self.assertTrue(total_int8wo < total_int4wo)
+        # int4wo does not quantize too many layers because of default group size, but for the layers it does
+        # there is additional overhead of scales and zero points
+        self.assertTrue(total_bf16 < total_int4wo)

     def test_wrong_config(self):
         with self.assertRaises(ValueError):
```

> **Review comment** (on the switch to `get_model_size_in_bytes`): We use the torchao-provided utility instead now.
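The inline comments in the rewritten assertions carry the reasoning: int4 weight-only quantization stores a scale and zero point per group, so a smaller `group_size` means more groups and more quantization metadata packed alongside the weights. A short sketch of that effect outside the test class, again on the tiny checkpoint (the `load` helper exists only for this sketch, and the orderings are what the test asserts for this particular model rather than universal guarantees):

```python
import torch
from diffusers import FluxTransformer2DModel, TorchAoConfig
from torchao.utils import get_model_size_in_bytes

model_id = "hf-internal-testing/tiny-flux-pipe"


def load(config):
    # Sketch-only helper: load the tiny Flux transformer with the given TorchAoConfig.
    return FluxTransformer2DModel.from_pretrained(
        model_id, subfolder="transformer", quantization_config=config, torch_dtype=torch.bfloat16
    )


size_int4wo = get_model_size_in_bytes(load(TorchAoConfig("int4wo")))
size_int4wo_gs32 = get_model_size_in_bytes(load(TorchAoConfig("int4wo", group_size=32)))
size_int8wo = get_model_size_in_bytes(load(TorchAoConfig("int8wo")))

# Smaller group size -> more groups -> more scales and zero points stored with the weights.
assert size_int4wo < size_int4wo_gs32
# On this tiny model, int8 weight-only quantizes more layers than int4 with the default group size.
assert size_int8wo < size_int4wo
```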
```diff
@@ -456,7 +450,7 @@ def test_wrong_config(self):
 # This class is not to be run as a test by itself. See the tests that follow this class
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 class TorchAoSerializationTest(unittest.TestCase):
     model_name = "hf-internal-testing/tiny-flux-pipe"
     quant_method, quant_method_kwargs = None, None
```
```diff
@@ -565,7 +559,7 @@ class TorchAoSerializationINTA16W8CPUTest(TorchAoSerializationTest):
 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
 @require_torch
 @require_torch_gpu
-@require_torchao_version_greater("0.6.0")
+@require_torchao_version_greater_or_equal("0.7.0")
 @slow
 @nightly
 class SlowTorchAoTests(unittest.TestCase):
```
```diff
@@ -581,11 +575,13 @@ def get_dummy_components(self, quantization_config: TorchAoConfig):
             quantization_config=quantization_config,
             torch_dtype=torch.bfloat16,
         )
-        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder")
-        text_encoder_2 = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
+        text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16)
+        text_encoder_2 = T5EncoderModel.from_pretrained(
+            model_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
+        )
         tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
         tokenizer_2 = AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer_2")
-        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae")
+        vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.bfloat16)
         scheduler = FlowMatchEulerDiscreteScheduler()

         return {
```
```diff
@@ -617,7 +613,7 @@ def get_dummy_inputs(self, device: torch.device, seed: int = 0):

     def _test_quant_type(self, quantization_config, expected_slice):
         components = self.get_dummy_components(quantization_config)
-        pipe = FluxPipeline(**components).to(dtype=torch.bfloat16)
+        pipe = FluxPipeline(**components)
         pipe.enable_model_cpu_offload()

         inputs = self.get_dummy_inputs(torch_device)
```

> **Review comment** (on the removed `.to(dtype=torch.bfloat16)` call): I think this was an incorrect thing to do here and it slipped past us in the previous PR. We should not be calling `.to(dtype=...)` on a pipeline that contains a quantized model. The GGUF PR introduced a check in `to()` that guards against casting the dtype of a quantized model.
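A sketch of the pattern the fixed test follows for a quantized pipeline: load each component in the target dtype up front, assemble the pipeline, and use CPU offload for device placement rather than casting the assembled pipeline. The behaviour of `pipe.to(dtype=...)` is taken from the review comment above, not demonstrated in this diff, so the final note is an expectation rather than a verified error message.

```python
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig

model_id = "hf-internal-testing/tiny-flux-pipe"  # tiny checkpoint used throughout these tests

# Load the transformer quantized, and let from_pretrained put the remaining components in bfloat16.
transformer = FluxTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=TorchAoConfig("int8_weight_only"),
    torch_dtype=torch.bfloat16,
)
pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)

# Device placement without touching the dtype of the quantized weights.
pipe.enable_model_cpu_offload()

# By contrast, `pipe.to(dtype=torch.bfloat16)` would try to cast the quantized transformer,
# which the dtype check mentioned in the review comment is meant to reject.
```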
> **Review comment** (on the removed `TensorCoreTiledLayoutType` import and assertion): `layout_type` has become an internal private attribute called `_layout` now. It does not have to be tested as such, so it can be removed. The layout is also now called `TensorCoreTiledLayout` instead.
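For reference, a hedged sketch of what that rename means when inspecting an int4 weight-only quantized weight under torchao >= 0.7.0. The `torchao.dtypes` import location for `TensorCoreTiledLayout` and the private `_layout` attribute are taken from the comment above and from torchao at the time of writing, so treat the snippet as illustrative rather than an API the test suite relies on:

```python
# torchao < 0.7.0 (what the removed assertion relied on):
#     from torchao.dtypes.affine_quantized_tensor import TensorCoreTiledLayoutType
#     assert isinstance(weight.layout_type, TensorCoreTiledLayoutType)
from torchao.dtypes import AffineQuantizedTensor, TensorCoreTiledLayout


def uses_tensor_core_tiled_layout(weight) -> bool:
    # Illustrative helper: report whether a parameter is an int4 tensor-core tiled
    # AffineQuantizedTensor. `_layout` is private in torchao >= 0.7.0, which is why
    # the diffusers test no longer asserts on it.
    return isinstance(weight, AffineQuantizedTensor) and isinstance(weight._layout, TensorCoreTiledLayout)
```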