diff --git a/examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml
similarity index 100%
rename from examples/quantization_24_sparse_w4a16/2:4_w4a16_group-128_recipe.yaml
rename to examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml
diff --git a/examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml
similarity index 100%
rename from examples/quantization_24_sparse_w4a16/2:4_w4a16_recipe.yaml
rename to examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml
diff --git a/examples/quantization_24_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md
similarity index 85%
rename from examples/quantization_24_sparse_w4a16/README.md
rename to examples/quantization_2of4_sparse_w4a16/README.md
index 6e006d9db..b0f90778d 100644
--- a/examples/quantization_24_sparse_w4a16/README.md
+++ b/examples/quantization_2of4_sparse_w4a16/README.md
@@ -29,7 +29,7 @@ This example uses LLMCompressor and Compressed-Tensors to create a 2:4 sparse an
 The model is calibrated and trained with the ultrachat200k dataset.
 At least 75GB of GPU memory is required to run this example.

-Follow the steps below, or to run the example as `python examples/quantization_24_sparse_w4a16/llama7b_sparse_w4a16.py`
+Follow the steps below, or run the example with `python examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py`

 ## Step 1: Select a model, dataset, and recipe
 In this step, we select which model to use as a baseline for sparsification, a dataset to
@@ -40,7 +40,7 @@ Models can reference a local directory, or a model in the huggingface hub.
 Datasets can be from a local compatible directory or the huggingface hub.

 Recipes are YAML files that describe how a model should be optimized during or after training.
-The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml).
+The recipe used for this flow is located in [2of4_w4a16_recipe.yaml](./2of4_w4a16_recipe.yaml).
 It contains instructions to prune the model to 2:4 sparsity, run one epoch of recovery finetuning,
 and quantize to 4 bits in one shot using GPTQ.

@@ -56,18 +56,18 @@ model = SparseAutoModelForCausalLM.from_pretrained(
 dataset = "ultrachat-200k"
 splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}

-recipe = "2:4_w4a16_recipe.yaml"
+recipe = "2of4_w4a16_recipe.yaml"
 ```

 ## Step 2: Run sparsification using `apply`
 The `apply` function applies the given recipe to our model and dataset.
 The hardcoded kwargs may be altered based on each model's needs.
-After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`.
+After running, the sparsified model will be saved to `output_llama7b_2of4_w4a16_channel`.
 ```python
 from llmcompressor.transformers import apply

-output_dir = "output_llama7b_2:4_w4a16_channel"
+output_dir = "output_llama7b_2of4_w4a16_channel"

 apply(
     model=model,
@@ -98,12 +98,12 @@ run the following:
 import torch
 from llmcompressor.transformers import SparseAutoModelForCausalLM

-compressed_output_dir = "output_llama7b_2:4_w4a16_channel_compressed"
+compressed_output_dir = "output_llama7b_2of4_w4a16_channel_compressed"
 model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
 model.save_pretrained(compressed_output_dir, save_compressed=True)
 ```

 ### Custom Quantization
 The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are `tensor`, `group` and `channel`.
-The above recipe (`2:4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group.
-To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. A group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml`.
+The above recipe (`2of4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group.
+To quantize per tensor, change the strategy from `channel` to `tensor`. To use group-size quantization, change `channel` to `group` and specify the group size, say 128, by including `group_size: 128`. A group-size quantization example is shown in `2of4_w4a16_group-128_recipe.yaml`.
diff --git a/examples/quantization_24_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
similarity index 94%
rename from examples/quantization_24_sparse_w4a16/llama7b_sparse_w4a16.py
rename to examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
index fba6db74f..6e83e4e87 100644
--- a/examples/quantization_24_sparse_w4a16/llama7b_sparse_w4a16.py
+++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
@@ -3,7 +3,7 @@
 from llmcompressor.transformers import SparseAutoModelForCausalLM, apply

 # define a recipe to handle sparsity, finetuning and quantization
-recipe = "2:4_w4a16_recipe.yaml"
+recipe = "2of4_w4a16_recipe.yaml"

 # load the model in as bfloat16 to save on memory and compute
 model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
@@ -15,7 +15,7 @@
 dataset = "ultrachat-200k"

 # save location of quantized model
-output_dir = "output_llama7b_2:4_w4a16_channel"
+output_dir = "output_llama7b_2of4_w4a16_channel"

 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
diff --git a/tests/examples/test_quantization_24_sparse_w4a16.py b/tests/examples/test_quantization_2of4_sparse_w4a16.py
similarity index 89%
rename from tests/examples/test_quantization_24_sparse_w4a16.py
rename to tests/examples/test_quantization_2of4_sparse_w4a16.py
index ffb5931fd..b85e6098a 100644
--- a/tests/examples/test_quantization_24_sparse_w4a16.py
+++ b/tests/examples/test_quantization_2of4_sparse_w4a16.py
@@ -16,14 +16,14 @@
 @pytest.fixture
 def example_dir() -> str:
-    return "examples/quantization_24_sparse_w4a16"
+    return "examples/quantization_2of4_sparse_w4a16"


 @pytest.mark.example
 @requires_gpu_count(1)
 class TestQuantization24SparseW4A16:
     """
-    Tests for examples in the "quantization_24_sparse_w4a16" example folder.
+    Tests for examples in the "quantization_2of4_sparse_w4a16" example folder.
""" def test_doc_example_command(self, example_dir: str, tmp_path: Path): @@ -52,7 +52,7 @@ def test_alternative_recipe(self, example_dir: str, tmp_path: Path): script_path = tmp_path / example_dir / script_filename content = script_path.read_text(encoding="utf-8") content = content.replace( - "2:4_w4a16_recipe.yaml", "2:4_w4a16_group-128_recipe.yaml" + "2of4_w4a16_recipe.yaml", "2of4_w4a16_group-128_recipe.yaml" ) script_path.write_text(content, encoding="utf-8")