diff --git a/src/llmcompressor/core/session.py b/src/llmcompressor/core/session.py
index 41072feb9..7c489f36f 100644
--- a/src/llmcompressor/core/session.py
+++ b/src/llmcompressor/core/session.py
@@ -1,6 +1,8 @@
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Union
 
+from loguru import logger
+
 from llmcompressor.core.events import EventType
 from llmcompressor.core.helpers import log_model_info, should_log_model_info
 from llmcompressor.core.lifecycle import CompressionLifecycle
@@ -260,12 +262,16 @@ def reset_stage(self):
         self.lifecycle.initialized_ = False
         self.lifecycle.finalized = False
 
-    def get_serialized_recipe(self) -> str:
+    def get_serialized_recipe(self) -> Optional[str]:
         """
         :return: serialized string of the current compiled recipe
         """
         recipe = self.lifecycle.recipe_container.compiled_recipe
-        return recipe.yaml()
+
+        if recipe is not None and hasattr(recipe, "yaml"):
+            return recipe.yaml()
+
+        logger.warning("Recipe not found in session - it may have been reset")
 
     def _log_model_info(self):
         # Log model level logs if cadence reached
diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
index 001b43b0b..6de89dd8b 100644
--- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
+++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
@@ -191,9 +191,10 @@ def skip(*args, **kwargs):
 
         recipe_path = os.path.join(save_directory, "recipe.yaml")
         session = active_session()
-        recipe_yaml_str = session.get_serialized_recipe()
-        with open(recipe_path, "w") as fp:
-            fp.write(recipe_yaml_str)
+
+        if (recipe_yaml_str := session.get_serialized_recipe()) is not None:
+            with open(recipe_path, "w") as fp:
+                fp.write(recipe_yaml_str)
 
         # copy python files from cache dir to save_path if any
         copy_python_files_from_model_cache(model, save_directory)
diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
index 988793849..316f90ff1 100644
--- a/tests/e2e/vLLM/test_vllm.py
+++ b/tests/e2e/vLLM/test_vllm.py
@@ -1,12 +1,15 @@
+import os
 import shutil
 import unittest
 from typing import Callable
 
 import pytest
 from datasets import load_dataset
+from loguru import logger
 from parameterized import parameterized, parameterized_class
 from transformers import AutoTokenizer
 
+from llmcompressor.core import active_session
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
 from tests.testing_utils import (
@@ -22,6 +25,7 @@
     vllm_installed = True
 except ImportError:
     vllm_installed = False
+    logger.warning("vllm is not installed. This test will be skipped")
 
 # Defines the file paths to the directories containing the test configs
 # for each of the quantization schemes
@@ -32,6 +36,8 @@
 WNA16_2of4 = "tests/e2e/vLLM/configs/WNA16_2of4"
 CONFIGS = [WNA16, FP8, INT8, ACTORDER, WNA16_2of4]
 
+HF_MODEL_HUB_NAME = "nm-testing"
+
 
 def gen_test_name(testcase_func: Callable, param_num: int, param: dict) -> str:
     return "_".join(
@@ -76,8 +82,8 @@ class TestvLLM(unittest.TestCase):
     save_dir = None
 
     def setUp(self):
-        print("========== RUNNING ==============")
-        print(self.scheme)
+        logger.info("========== RUNNING ==============")
+        logger.debug(self.scheme)
 
         self.device = "cuda:0"
         self.oneshot_kwargs = {}
@@ -88,6 +94,7 @@ def setUp(self):
             "The president of the US is",
             "My name is",
         ]
+        self.session = active_session()
 
     def test_vllm(self):
         import torch
@@ -124,16 +131,20 @@ def test_vllm(self):
         )
 
         # Apply quantization.
-        print("ONESHOT KWARGS", self.oneshot_kwargs)
+        logger.debug("ONESHOT KWARGS", self.oneshot_kwargs)
         oneshot(
             **self.oneshot_kwargs,
-            clear_sparse_session=True,
             oneshot_device=self.device,
         )
+
         self.oneshot_kwargs["model"].save_pretrained(self.save_dir)
         tokenizer.save_pretrained(self.save_dir)
+
+        # Reset after session info is extracted on save -- recipe
+        self.session.reset()
+
         # Run vLLM with saved model
-        print("================= RUNNING vLLM =========================")
+        logger.info("================= RUNNING vLLM =========================")
         sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
         if "W4A16_2of4" in self.scheme:
             # required by the kernel
@@ -141,16 +152,19 @@
         else:
             llm = LLM(model=self.save_dir)
         outputs = llm.generate(self.prompts, sampling_params)
-        print("================= vLLM GENERATION ======================")
+
+        logger.info("================= vLLM GENERATION ======================")
         for output in outputs:
             assert output
             prompt = output.prompt
             generated_text = output.outputs[0].text
-            print("PROMPT", prompt)
-            print("GENERATED TEXT", generated_text)
-            print("================= UPLOADING TO HUB ======================")
-        self.oneshot_kwargs["model"].push_to_hub(f"nm-testing/{self.save_dir}-e2e")
-        tokenizer.push_to_hub(f"nm-testing/{self.save_dir}-e2e")
+            logger.debug("PROMPT", prompt)
+            logger.debug("GENERATED TEXT", generated_text)
+
+        logger.info("================= UPLOADING TO HUB ======================")
+        hf_upload_path = os.path.join(HF_MODEL_HUB_NAME, f"{self.save_dir}-e2e")
+        self.oneshot_kwargs["model"].push_to_hub(hf_upload_path)
+        tokenizer.push_to_hub(hf_upload_path)
 
     def tearDown(self):
         if self.save_dir is not None: