From 50e47077430050056dfc14e7406172edbc35f745 Mon Sep 17 00:00:00 2001
From: George
Date: Tue, 19 Nov 2024 16:50:45 +0000
Subject: [PATCH 1/6] fix recipe serialization when the session has no compiled recipe

---
 src/llmcompressor/core/session.py              | 10 +++++--
 .../compressed_tensors_utils.py                |  7 +++--
 tests/e2e/vLLM/test_vllm.py                    | 30 ++++++++++++-------
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/src/llmcompressor/core/session.py b/src/llmcompressor/core/session.py
index 41072feb9..9b846fc23 100644
--- a/src/llmcompressor/core/session.py
+++ b/src/llmcompressor/core/session.py
@@ -1,6 +1,8 @@
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Union
 
+from loguru import logger
+
 from llmcompressor.core.events import EventType
 from llmcompressor.core.helpers import log_model_info, should_log_model_info
 from llmcompressor.core.lifecycle import CompressionLifecycle
@@ -260,12 +262,16 @@ def reset_stage(self):
         self.lifecycle.initialized_ = False
         self.lifecycle.finalized = False
 
-    def get_serialized_recipe(self) -> str:
+    def get_serialized_recipe(self) -> Optional[str]:
         """
         :return: serialized string of the current compiled recipe
         """
         recipe = self.lifecycle.recipe_container.compiled_recipe
-        return recipe.yaml()
+
+        if recipe is not None and hasattr(recipe, "yaml"):
+            return recipe.yaml()
+
+        logger.warning("Recipe not found in session - may been reset")
 
     def _log_model_info(self):
         # Log model level logs if cadence reached

diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
index 001b43b0b..6de89dd8b 100644
--- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
+++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
@@ -191,9 +191,10 @@ def skip(*args, **kwargs):
 
     recipe_path = os.path.join(save_directory, "recipe.yaml")
     session = active_session()
-    recipe_yaml_str = session.get_serialized_recipe()
-    with open(recipe_path, "w") as fp:
-        fp.write(recipe_yaml_str)
+
+    if (recipe_yaml_str := session.get_serialized_recipe()) is not None:
+        with open(recipe_path, "w") as fp:
+            fp.write(recipe_yaml_str)
 
     # copy python files from cache dir to save_path if any
     copy_python_files_from_model_cache(model, save_directory)

diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 988793849..c141c34ef 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -1,9 +1,11 @@
+import os
 import shutil
 import unittest
 from typing import Callable
 
 import pytest
 from datasets import load_dataset
+from loguru import logger
 from parameterized import parameterized, parameterized_class
 from transformers import AutoTokenizer
 
@@ -22,6 +24,7 @@
     vllm_installed = True
 except ImportError:
     vllm_installed = False
+    logger.warning("vllm is not installed. This test will be skipped")
 
 # Defines the file paths to the directories containing the test configs
 # for each of the quantization schemes
@@ -32,6 +35,8 @@
 WNA16 = "tests/e2e/vLLM/configs/WNA16"
 FP8 = "tests/e2e/vLLM/configs/FP8"
 INT8 = "tests/e2e/vLLM/configs/INT8"
 ACTORDER = "tests/e2e/vLLM/configs/actorder"
 WNA16_2of4 = "tests/e2e/vLLM/configs/WNA16_2of4"
 CONFIGS = [WNA16, FP8, INT8, ACTORDER, WNA16_2of4]
 
+HF_MODEL_HUB_NAME = "nm-testing"
+
 
 def gen_test_name(testcase_func: Callable, param_num: int, param: dict) -> str:
     return "_".join(
@@ -76,8 +81,8 @@ class TestvLLM(unittest.TestCase):
     save_dir = None
 
     def setUp(self):
-        print("========== RUNNING ==============")
-        print(self.scheme)
+        logger.info("========== RUNNING ==============")
+        logger.debug(self.scheme)
         self.device = "cuda:0"
         self.oneshot_kwargs = {}
@@ -124,16 +129,18 @@ def test_vllm(self):
         )
 
         # Apply quantization.
-        print("ONESHOT KWARGS", self.oneshot_kwargs)
+        logger.debug("ONESHOT KWARGS: {}", self.oneshot_kwargs)
         oneshot(
             **self.oneshot_kwargs,
             clear_sparse_session=True,
             oneshot_device=self.device,
         )
+
         self.oneshot_kwargs["model"].save_pretrained(self.save_dir)
         tokenizer.save_pretrained(self.save_dir)
+
         # Run vLLM with saved model
-        print("================= RUNNING vLLM =========================")
+        logger.info("================= RUNNING vLLM =========================")
         sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
         if "W4A16_2of4" in self.scheme:
             # required by the kernel
         else:
             llm = LLM(model=self.save_dir)
         outputs = llm.generate(self.prompts, sampling_params)
-        print("================= vLLM GENERATION ======================")
+
+        logger.info("================= vLLM GENERATION ======================")
         for output in outputs:
             assert output
             prompt = output.prompt
             generated_text = output.outputs[0].text
-            print("PROMPT", prompt)
-            print("GENERATED TEXT", generated_text)
-        print("================= UPLOADING TO HUB ======================")
-        self.oneshot_kwargs["model"].push_to_hub(f"nm-testing/{self.save_dir}-e2e")
-        tokenizer.push_to_hub(f"nm-testing/{self.save_dir}-e2e")
+            logger.debug("PROMPT: {}", prompt)
+            logger.debug("GENERATED TEXT: {}", generated_text)
+
+        logger.info("================= UPLOADING TO HUB ======================")
+        hf_upload_path = os.path.join(HF_MODEL_HUB_NAME, f"{self.save_dir}-e2e")
+        self.oneshot_kwargs["model"].push_to_hub(hf_upload_path)
+        tokenizer.push_to_hub(hf_upload_path)
 
     def tearDown(self):
         if self.save_dir is not None:
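Note on this patch: `get_serialized_recipe` now returns `None` (after logging a warning) instead of raising `AttributeError` when no recipe has been compiled, and the save path skips writing `recipe.yaml` in that case. A minimal, self-contained sketch of the same guard pattern; `FakeRecipe` and `serialize_recipe` here are illustrative stand-ins, not the llmcompressor API:

```python
from typing import Optional


class FakeRecipe:
    """Stand-in for a compiled recipe object exposing .yaml()."""

    def yaml(self) -> str:
        return "stage_0:\n  modifiers: {}\n"


def serialize_recipe(compiled_recipe) -> Optional[str]:
    # Mirror of the guarded accessor: only serialize when a recipe
    # exists and actually exposes a yaml() method.
    if compiled_recipe is not None and hasattr(compiled_recipe, "yaml"):
        return compiled_recipe.yaml()
    return None


# The caller mirrors the walrus-operator guard added to the save path:
for recipe in (FakeRecipe(), None):
    if (recipe_yaml_str := serialize_recipe(recipe)) is not None:
        print("would write recipe.yaml:", recipe_yaml_str.strip())
    else:
        print("no compiled recipe - skipping recipe.yaml")
```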
From 5c9559bcc4bc6c8c5f18231c69a46795143aded9 Mon Sep 17 00:00:00 2001
From: George
Date: Wed, 20 Nov 2024 16:36:32 +0000
Subject: [PATCH 2/6] fix grammar

---
 src/llmcompressor/core/session.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmcompressor/core/session.py b/src/llmcompressor/core/session.py
index 9b846fc23..7c489f36f 100644
--- a/src/llmcompressor/core/session.py
+++ b/src/llmcompressor/core/session.py
@@ -271,7 +271,7 @@ def get_serialized_recipe(self) -> Optional[str]:
         if recipe is not None and hasattr(recipe, "yaml"):
             return recipe.yaml()
 
-        logger.warning("Recipe not found in session - may been reset")
+        logger.warning("Recipe not found in session - it may have been reset")
 
     def _log_model_info(self):
         # Log model level logs if cadence reached

From 84ca3c8bc1d3ac80a3fefbad726007dc0602c9ee Mon Sep 17 00:00:00 2001
From: George
Date: Wed, 20 Nov 2024 22:27:46 -0500
Subject: [PATCH 3/6] address review comments

---
 tests/e2e/vLLM/test_vllm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index c141c34ef..eba2a2c36 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -132,7 +132,7 @@ def test_vllm(self):
         logger.debug("ONESHOT KWARGS: {}", self.oneshot_kwargs)
         oneshot(
             **self.oneshot_kwargs,
-            clear_sparse_session=True,
+            clear_sparse_session=False,
             oneshot_device=self.device,
         )
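Note on the logging calls in these hunks: loguru formats its message lazily with `str.format`-style braces, so a positional argument passed without a `{}` placeholder is silently dropped from the logged message. A quick illustration of the difference, assuming loguru is installed:

```python
from loguru import logger

oneshot_kwargs = {"num_calibration_samples": 256}

# No placeholder: "ONESHOT KWARGS".format(oneshot_kwargs) is just
# "ONESHOT KWARGS", so the dict never appears in the log output.
logger.debug("ONESHOT KWARGS", oneshot_kwargs)

# Brace placeholder: the dict is interpolated into the message.
logger.debug("ONESHOT KWARGS: {}", oneshot_kwargs)
```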
From c719c5dab5151ec76287ba7d6981b8dfc25cedc0 Mon Sep 17 00:00:00 2001
From: George
Date: Thu, 21 Nov 2024 03:36:17 +0000
Subject: [PATCH 4/6] remove args, use default

---
 tests/e2e/vLLM/test_vllm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index eba2a2c36..7803486b2 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -132,7 +132,6 @@ def test_vllm(self):
         logger.debug("ONESHOT KWARGS: {}", self.oneshot_kwargs)
         oneshot(
             **self.oneshot_kwargs,
-            clear_sparse_session=False,
             oneshot_device=self.device,
         )

From 5cd4c115a2dfa4e2ef2989c1907583d632bda6f7 Mon Sep 17 00:00:00 2001
From: George
Date: Thu, 21 Nov 2024 17:01:59 +0000
Subject: [PATCH 5/6] reset session - avoid running finalize more than once on one session

---
 tests/e2e/vLLM/test_vllm.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 7803486b2..35d9e8513 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -9,6 +9,7 @@
 from parameterized import parameterized, parameterized_class
 from transformers import AutoTokenizer
 
+from llmcompressor.core import active_session
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
 from tests.testing_utils import (
@@ -93,6 +94,7 @@ def setUp(self):
             "The president of the US is",
             "My name is",
         ]
+        self.session = active_session()
 
     def test_vllm(self):
         import torch
@@ -138,6 +140,9 @@ def test_vllm(self):
         self.oneshot_kwargs["model"].save_pretrained(self.save_dir)
         tokenizer.save_pretrained(self.save_dir)
 
+        # Whole flow is complete reset the session
+        self.session.reset()
+
         # Run vLLM with saved model
         logger.info("================= RUNNING vLLM =========================")
         sampling_params = SamplingParams(temperature=0.80, top_p=0.95)

From 3763c74196b02c427b728f0f0809e99796a7c3d3 Mon Sep 17 00:00:00 2001
From: George
Date: Thu, 21 Nov 2024 17:03:19 +0000
Subject: [PATCH 6/6] better comment

---
 tests/e2e/vLLM/test_vllm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 35d9e8513..316f90ff1 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -140,7 +140,7 @@ def test_vllm(self):
         self.oneshot_kwargs["model"].save_pretrained(self.save_dir)
         tokenizer.save_pretrained(self.save_dir)
 
-        # Whole flow is complete reset the session
+        # Reset the session after the recipe has been extracted on save
         self.session.reset()
 
         # Run vLLM with saved model
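Note on patches 5 and 6: the test now depends on a specific ordering. The compiled recipe must still be in the active session when `save_pretrained` writes `recipe.yaml`, and the session is reset only afterwards so that `finalize` is never run twice on one session across parameterized test cases. A condensed sketch of that ordering, assuming the llmcompressor imports already used in the test (model and tokenizer setup elided):

```python
from llmcompressor.core import active_session

session = active_session()  # captured once, in setUp

# 1. oneshot(...) runs with the default clear_sparse_session, so the
#    compiled recipe stays in the session after quantization.
# 2. save_pretrained(...) serializes that recipe from the session
#    into recipe.yaml alongside the model weights.
# 3. Only then is the session reset, clearing the recipe and lifecycle
#    state before the next parameterized test case starts.
session.reset()
```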