Skip to content

Commit

Permalink
v1.3.4
Browse files: browse the repository at this point in the history
  • Loading branch information
OlivierDehaene committed Dec 22, 2023
1 parent 529d7c2 commit 630800e
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 98 deletions.
161 changes: 75 additions & 86 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ members = [
]

[workspace.package]
version = "1.3.3"
version = "1.3.4"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
Expand Down
2 changes: 1 addition & 1 deletion docs/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "1.3.3"
"version": "1.3.4"
},
"paths": {
"/": {
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "text-generation-integration-tests"
version = "1.3.3"
version = "1.3.4"
description = "Text Generation Inference integration tests"
authors = ["Nicolas Patry <[email protected]>"]

Expand Down
2 changes: 1 addition & 1 deletion server/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "text-generation-server"
version = "1.3.3"
version = "1.3.4"
description = "Text Generation Inference Python gRPC Server"
authors = ["Olivier Dehaene <[email protected]>"]

Expand Down
2 changes: 1 addition & 1 deletion server/text_generation_server/utils/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
V2 = False
log_once(
logger.warning,
"Disabling exllama v2 and using v1 instead because there are issues when sharding"
"Disabling exllama v2 and using v1 instead because there are issues when sharding",
)

if os.getenv("DISABLE_EXLLAMA") == "True":
Expand Down
2 changes: 1 addition & 1 deletion server/text_generation_server/utils/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@


@lru_cache(10)
def log_once(log, msg:str):
def log_once(log, msg: str):
log(msg)
11 changes: 5 additions & 6 deletions server/text_generation_server/utils/weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,9 @@ def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
bits, groupsize, desc_act = self._get_gptq_params()
from text_generation_server.utils.layers import HAS_EXLLAMA

use_exllama = bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act
use_exllama = (
bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act
)
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
else:
w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
Expand Down
Expand Up @@ -281,14 +283,11 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
if CAN_EXLLAMA:
log_once(
logger.warning,
"Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
"Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True",
)
use_exllama = False
else:
log_once(
logger.info,
f"Using exllama kernels v{HAS_EXLLAMA}"
)
log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}")

g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)

Expand Down

0 comments on commit 630800e

Please sign in to comment.