v1.3.4

huggingface · Dec 22, 2023 · 630800e
1 parent 529d7c2
commit 630800e
Show file tree

Hide file tree

Showing 8 changed files with 86 additions and 98 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -8,7 +8,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.3.3"
+version = "1.3.4"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"

diff --git a/docs/openapi.json b/docs/openapi.json
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.3.3"
+    "version": "1.3.4"
   },
   "paths": {
     "/": {

diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.3.3"
+version = "1.3.4"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry <[email protected]>"]
 

diff --git a/server/pyproject.toml b/server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.3.3"
+version = "1.3.4"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <[email protected]>"]
 

diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
@@ -39,7 +39,7 @@
     V2 = False
     log_once(
         logger.warning,
-        "Disabling exllama v2 and using v1 instead because there are issues when sharding"
+        "Disabling exllama v2 and using v1 instead because there are issues when sharding",
     )
 
 if os.getenv("DISABLE_EXLLAMA") == "True":

diff --git a/server/text_generation_server/utils/log.py b/server/text_generation_server/utils/log.py
@@ -2,5 +2,5 @@
 
 
 @lru_cache(10)
-def log_once(log, msg:str):
+def log_once(log, msg: str):
     log(msg)
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
@@ -215,7 +215,9 @@ def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
             bits, groupsize, desc_act = self._get_gptq_params()
             from text_generation_server.utils.layers import HAS_EXLLAMA
 
-            use_exllama = bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act
+            use_exllama = (
+                bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act
+            )
             weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
         else:
             w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
@@ -281,14 +283,11 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
                     if CAN_EXLLAMA:
                         log_once(
                             logger.warning,
-                            "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
+                            "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True",
                         )
                     use_exllama = False
                 else:
-                    log_once(
-                        logger.info,
-                        f"Using exllama kernels v{HAS_EXLLAMA}"
-                    )
+                    log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}")
 
             g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)