⚡️ Speed up method CompressedTensorsConfig._quantization_scheme_map_from_config by 24%
#338
📄 24% (0.24x) speedup for `CompressedTensorsConfig._quantization_scheme_map_from_config` in `python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py`

⏱️ Runtime: 587 microseconds → 475 microseconds (best of 37 runs)

📝 Explanation and details
The optimization achieves a 23% speedup by eliminating redundant computations and improving data structure efficiency.

**Key optimizations:**

**Module-level constant creation:** Moved `_ACTIVATION_QUANTIZATION_FORMATS` from inside the function to module level as a set, instead of recreating a list on every call. The line profiler shows this eliminated 400+ microseconds spent repeatedly constructing the list and accessing `CompressionFormat` attributes (33.4% + 28.2% + 27% of original function time).

**Set vs. list membership testing:** Changed from a list to a set for O(1) rather than O(n) membership checks in `is_activation_quantization_format`.
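A minimal sketch of this before/after pattern. The `CompressionFormat` members here are assumptions for illustration; the real enum lives in the compressed-tensors package and the exact member set used by sglang may differ:

```python
from enum import Enum


# Stand-in for the CompressionFormat enum used by the real code; the member
# set here is an assumption for illustration.
class CompressionFormat(Enum):
    naive_quantized = "naive-quantized"
    int_quantized = "int-quantized"
    float_quantized = "float-quantized"
    pack_quantized = "pack-quantized"


# Before: a list is rebuilt on every call, so each check pays for list
# construction and enum attribute lookups, plus an O(n) scan.
def is_activation_quantization_format_slow(fmt: str) -> bool:
    formats = [
        CompressionFormat.naive_quantized.value,
        CompressionFormat.int_quantized.value,
        CompressionFormat.float_quantized.value,
    ]
    return fmt in formats


# After: the constant is built once at import time as a set, so each call
# reduces to a single O(1) hash lookup.
_ACTIVATION_QUANTIZATION_FORMATS = {
    CompressionFormat.naive_quantized.value,
    CompressionFormat.int_quantized.value,
    CompressionFormat.float_quantized.value,
}


def is_activation_quantization_format(fmt: str) -> bool:
    return fmt in _ACTIVATION_QUANTIZATION_FORMATS
```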
**Loop-invariant hoisting:** Cached `is_activation_quantization_format(quant_format)` and `QuantizationType.FLOAT` outside the nested loops, since `quant_format` doesn't change during iteration. This eliminates 105 redundant function calls (22.5% of original total time).
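A hedged skeleton of the hoisting, reusing the `is_activation_quantization_format` sketch above. The loop structure follows the report's description; the scheme-map details are simplified placeholders, not sglang's actual logic:

```python
from typing import Any, Dict


def quantization_scheme_map_from_config(config: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    quant_format = config.get("format", "")

    # Hoisted loop invariant: quant_format is fixed for the whole config, so
    # the predicate is evaluated once instead of once per (group, target).
    is_activation_format = is_activation_quantization_format(quant_format)

    scheme_map: Dict[str, Dict[str, Any]] = {}
    for quant_config in config.get("config_groups", {}).values():
        for target in quant_config.get("targets", []):
            entry: Dict[str, Any] = {"weights": quant_config.get("weights")}
            if is_activation_format:
                # Without hoisting, this branch would call
                # is_activation_quantization_format again for every target.
                entry["input_activations"] = quant_config.get("input_activations")
            scheme_map[target] = entry
    return scheme_map
```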
**Performance impact:** `is_activation_quantization_format` function time dropped from 425μs to 19μs (95% reduction).

**Why this matters:** This function processes quantization configurations during model initialization. The nested loops over config groups and targets mean `is_activation_quantization_format` gets called repeatedly with the same `quant_format` value, making the caching optimization particularly effective for configurations with multiple target layers.

✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
```python
import pytest

from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
    CompressedTensorsConfig,
)


# QuantizationType stub
class QuantizationType:
    FLOAT = "float"
    INT8 = "int8"
    FP8 = "fp8"


# QuantizationArgs stub with validation
class QuantizationArgs:
    def __init__(self, type, bits=None, group_size=None):
        self.type = type
        self.bits = bits
        self.group_size = group_size


# QuantizationConfig base class stub
class QuantizationConfig:
    def __init__(self):
        self.packed_modules_mapping = {}


# --- Unit tests ---

# 1. Basic Test Cases

def test_edge_missing_input_activations_with_activation_quantization():
    # Test config where activation quantization is enabled but input_activations is missing
    config = {
        "format": "float_quantized",
        "config_groups": {
            "group1": {
                "targets": ["layer1"],
                "weights": {"type": "float"},
                # No input_activations
            }
        },
    }
    codeflash_output = CompressedTensorsConfig._quantization_scheme_map_from_config(config); result = codeflash_output  # 23.8μs -> 22.8μs (4.78% faster)


def test_edge_missing_weights_raises():
    # Test config missing weights (should raise ValueError from QuantizationArgs.model_validate)
    config = {
        "format": "int_quantized",
        "config_groups": {
            "group1": {
                "targets": ["layer1"],
                # "weights" missing
                "input_activations": {"type": "fp8"},
            }
        },
    }
    with pytest.raises(ValueError):
        CompressedTensorsConfig._quantization_scheme_map_from_config(config)  # 7.82μs -> 8.33μs (6.13% slower)


def test_edge_empty_config_groups():
    # Test config with empty config_groups
    config = {
        "format": "float_quantized",
        "config_groups": {},
    }
    codeflash_output = CompressedTensorsConfig._quantization_scheme_map_from_config(config); result = codeflash_output  # 1.69μs -> 2.64μs (36.0% slower)


def test_edge_targets_empty_list():
    # Test config group with empty targets list
    config = {
        "format": "float_quantized",
        "config_groups": {
            "group1": {
                "targets": [],
                "weights": {"type": "float"},
                "input_activations": {"type": "fp8"},
            }
        },
    }
    codeflash_output = CompressedTensorsConfig._quantization_scheme_map_from_config(config); result = codeflash_output  # 1.63μs -> 2.24μs (27.4% slower)


def test_edge_no_format_key():
    # Test config missing 'format' key
    config = {
        "config_groups": {
            "group1": {
                "targets": ["layer1"],
                "weights": {"type": "float"},
                "input_activations": {"type": "fp8"},
            }
        }
    }
    codeflash_output = CompressedTensorsConfig._quantization_scheme_map_from_config(config); result = codeflash_output  # 24.2μs -> 22.9μs (5.97% faster)


# 3. Large Scale Test Cases

def test_large_scale_many_config_groups():
    # Test config with many config groups (up to 100)
    num_groups = 100
    config_groups = {}
    for i in range(num_groups):
        config_groups[f"group{i}"] = {
            "targets": [f"layer{i}"],
            "weights": {"type": "float"},
            "input_activations": {"type": "fp8"},
        }
    config = {
        "format": "float_quantized",
        "config_groups": config_groups,
    }
    codeflash_output = CompressedTensorsConfig._quantization_scheme_map_from_config(config); result = codeflash_output  # 434μs -> 321μs (35.1% faster)
    for i in range(num_groups):
        layer = f"layer{i}"
# ------------------------------------------------
```
```python
from typing import Any, Dict, Optional

# imports
import pytest

from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
    CompressedTensorsConfig,
)


# QuantizationType stub
class QuantizationType:
    FLOAT = "float"
    INT = "int"
    NAIVE = "naive"


# QuantizationArgs stub
class QuantizationArgs:
    def __init__(self, type_: str, bits: int):
        self.type = type_
        self.bits = bits


# --- Unit tests ---

# 1. Basic Test Cases

def test_edge_invalid_weights_type():
    # Invalid weights type should raise ValueError
    config = {
        "format": "int_quantized",
        "config_groups": {
            "group1": {
                "targets": ["layer1"],
                "weights": {"type": "invalid_type", "bits": 8},
                "input_activations": {"type": "int", "bits": 8},
            }
        },
    }
    with pytest.raises(ValueError):
        CompressedTensorsConfig._quantization_scheme_map_from_config(config)  # 20.5μs -> 21.6μs (5.41% slower)


def test_edge_invalid_bits_value():
    # Invalid bits value for weights
    config = {
        "format": "int_quantized",
        "config_groups": {
            "group1": {
                "targets": ["layer1"],
                "weights": {"type": "int", "bits": -1},
                "input_activations": {"type": "int", "bits": 8},
            }
        },
    }
    with pytest.raises(ValueError):
        CompressedTensorsConfig._quantization_scheme_map_from_config(config)  # 13.0μs -> 13.8μs (6.17% slower)


def test_edge_empty_config_groups():
    # No config_groups: should return empty dict
    config = {
        "format": "int_quantized",
        "config_groups": {},
    }
    codeflash_output = CompressedTensorsConfig._quantization_scheme_map_from_config(config); result = codeflash_output  # 1.45μs -> 1.98μs (26.9% slower)


def test_edge_empty_targets():
    # No targets: should not add anything to the result
    config = {
        "format": "int_quantized",
        "config_groups": {
            "group1": {
                "targets": [],
                "weights": {"type": "int", "bits": 8},
                "input_activations": {"type": "int", "bits": 8},
            }
        },
    }
    codeflash_output = CompressedTensorsConfig._quantization_scheme_map_from_config(config); result = codeflash_output  # 1.54μs -> 2.21μs (30.3% slower)


def test_edge_non_dict_weights():
    # weights is not a dict
    config = {
        "format": "int_quantized",
        "config_groups": {
            "group1": {
                "targets": ["layer1"],
                "weights": "not_a_dict",
                "input_activations": {"type": "int", "bits": 8},
            }
        },
    }
    with pytest.raises(ValueError):
        CompressedTensorsConfig._quantization_scheme_map_from_config(config)  # 10.0μs -> 10.9μs (7.76% slower)


def test_edge_non_dict_input_activations():
    # input_activations is not a dict
    config = {
        "format": "int_quantized",
        "config_groups": {
            "group1": {
                "targets": ["layer1"],
                "weights": {"type": "int", "bits": 8},
                "input_activations": "not_a_dict",
            }
        },
    }
    with pytest.raises(ValueError):
        CompressedTensorsConfig._quantization_scheme_map_from_config(config)  # 15.2μs -> 15.4μs (1.68% slower)


# 3. Large Scale Test Cases
```
To edit these changes, run `git checkout codeflash/optimize-CompressedTensorsConfig._quantization_scheme_map_from_config-mhtxo751` and push.