diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 15c51c1..5d783d1 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -1228,10 +1228,9 @@ async def generate_gen(
         # The first index will always be the positive prompt
         context_len = input_ids[0].size(dim=-1)
         if context_len > self.config.max_seq_len:
-            logger.warning(
+            raise ValueError(
                 f"Context length {context_len} is greater than max_seq_len "
-                f"{self.config.max_seq_len}. Generation is truncated and "
-                "metrics may not be accurate."
+                f"{self.config.max_seq_len}"
             )
 
         # Automatically set max_tokens to fill up the context