
Commit c7e2046

Merge branch 'master' into dependabot/pip/backend/python/transformers/sentence-transformers-5.1.1
2 parents 56ed3f6 + 33c1419

26 files changed: +216 −59 lines

.github/workflows/backend.yml

Lines changed: 13 additions & 1 deletion

@@ -882,7 +882,7 @@ jobs:
           backend: "rfdetr"
           dockerfile: "./backend/Dockerfile.python"
           context: "./backend"
-        - build-type: 'cublas'
+        - build-type: 'l4t'
           cuda-major-version: "12"
           cuda-minor-version: "0"
           platforms: 'linux/arm64'
@@ -955,6 +955,18 @@ jobs:
           backend: "exllama2"
           dockerfile: "./backend/Dockerfile.python"
           context: "./backend"
+        - build-type: 'l4t'
+          cuda-major-version: "12"
+          cuda-minor-version: "0"
+          platforms: 'linux/arm64'
+          skip-drivers: 'true'
+          tag-latest: 'auto'
+          tag-suffix: '-gpu-nvidia-l4t-arm64-chatterbox'
+          base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+          runs-on: 'ubuntu-24.04-arm'
+          backend: "chatterbox"
+          dockerfile: "./backend/Dockerfile.python"
+          context: "./backend"
         # runs out of space on the runner
         # - build-type: 'hipblas'
         #   cuda-major-version: ""

.github/workflows/secscan.yaml

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ jobs:
         if: ${{ github.actor != 'dependabot[bot]' }}
       - name: Run Gosec Security Scanner
         if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.8
+        uses: securego/gosec@v2.22.9
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'

Dockerfile

Lines changed: 10 additions & 0 deletions

@@ -78,6 +78,16 @@ RUN <<EOT bash
     fi
 EOT
 
+# https://github.com/NVIDIA/Isaac-GR00T/issues/343
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
+        wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
+        dpkg -i cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
+        cp /var/cudss-local-tegra-repo-ubuntu2204-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
+        apt-get update && apt-get -y install cudss
+    fi
+EOT
+
 # If we are building with clblas support, we need the libraries for the builds
 RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
     apt-get update && \

Makefile

Lines changed: 3 additions & 0 deletions

@@ -429,6 +429,9 @@ docker-build-kitten-tts:
 docker-save-kitten-tts: backend-images
 	docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar
 
+docker-save-chatterbox: backend-images
+	docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar
+
 docker-build-kokoro:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend
 

backend/cpp/llama-cpp/Makefile

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=da30ab5f8696cabb2d4620cdc0aa41a298c54fd6
+LLAMA_VERSION?=5f7e166cbf7b9ca928c7fad990098ef32358ac75
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
@@ -14,7 +14,7 @@ CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
 
 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF -DLLAMA_OPENSSL=OFF
 endif
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 4 additions & 9 deletions

@@ -231,6 +231,7 @@ static void params_parse(const backend::ModelOptions* request,
     params.cpuparams.n_threads = request->threads();
     params.n_gpu_layers = request->ngpulayers();
     params.n_batch = request->nbatch();
+    params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. increase the physical batch size"
     // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
     //params.n_parallel = 1;
     const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
@@ -801,11 +802,6 @@ class BackendServiceImpl final : public backend::Backend::Service {
            return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"documents\" must be a non-empty string array");
        }
 
-       // Tokenize the query
-       auto tokenized_query = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, request->query(), /* add_special */ false, true);
-       if (tokenized_query.size() != 1) {
-           return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"query\" must contain only a single prompt");
-       }
        // Create and queue the task
        json responses = json::array();
        bool error = false;
@@ -817,10 +813,9 @@ class BackendServiceImpl final : public backend::Backend::Service {
            documents.push_back(request->documents(i));
        }
 
-       auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
-       tasks.reserve(tokenized_docs.size());
-       for (size_t i = 0; i < tokenized_docs.size(); i++) {
-           auto tmp = format_rerank(ctx_server.vocab, tokenized_query[0], tokenized_docs[i]);
+       tasks.reserve(documents.size());
+       for (size_t i = 0; i < documents.size(); i++) {
+           auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, request->query(), documents[i]);
            server_task task = server_task(SERVER_TASK_TYPE_RERANK);
            task.id = ctx_server.queue_tasks.get_new_id();
            task.index = i;

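Taken together, the grpc-server.cpp hunks do two things: they mirror the configured n_batch into n_ubatch, so the physical micro-batch is no longer capped at the 512-token default that rejected longer reranking inputs, and they drop the server-side pre-tokenization of the query and documents, passing the raw strings (plus model, vocab and multimodal context) to format_rerank to track the llama.cpp bump above. A minimal client-side sketch of what the n_ubatch change enables, assuming LocalAI's Jina-style /v1/rerank endpoint on the default address and a hypothetical reranker model name:

import json
import urllib.request

# Hypothetical model name; any reranker configured in LocalAI would do.
payload = {
    "model": "jina-reranker-v1-base-en",
    "query": "organic skincare for sensitive skin",
    # A document well past 512 tokens: before the n_ubatch fix this could fail with
    # "input is too large to process. increase the physical batch size".
    "documents": ["Gentle, fragrance-free formulations for reactive skin. " * 200],
    "top_n": 1,
}

req = urllib.request.Request(
    "http://localhost:8080/v1/rerank",  # assumed default LocalAI address
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))

With the model's batch size raised, the backend can now size the physical batch from the configured n_batch instead of rejecting the input.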
backend/go/whisper/Makefile

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=44fa2f647cf2a6953493b21ab83b50d5f5dbc483
+WHISPER_CPP_VERSION?=32be14f8ebfc0498c2c619182f0d7f4c822d52c4
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 

backend/index.yaml

Lines changed: 12 additions & 0 deletions

@@ -353,6 +353,7 @@
       nvidia: "cuda12-chatterbox"
       metal: "metal-chatterbox"
       default: "cpu-chatterbox"
+      nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - &piper
   name: "piper"
   uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1239,6 +1240,7 @@
       nvidia: "cuda12-chatterbox-development"
       metal: "metal-chatterbox-development"
       default: "cpu-chatterbox-development"
+      nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - !!merge <<: *chatterbox
   name: "cpu-chatterbox"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
@@ -1249,6 +1251,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
   mirrors:
   - localai/localai-backends:master-cpu-chatterbox
+- !!merge <<: *chatterbox
+  name: "nvidia-l4t-arm64-chatterbox"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox"
+  mirrors:
+  - localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox
+- !!merge <<: *chatterbox
+  name: "nvidia-l4t-arm64-chatterbox-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox"
+  mirrors:
+  - localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox
 - !!merge <<: *chatterbox
   name: "metal-chatterbox"
   uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.74.0
+grpcio==1.75.1
 protobuf
 certifi

backend/python/chatterbox/backend.py

Lines changed: 54 additions & 8 deletions

@@ -14,9 +14,23 @@
 import torch
 import torchaudio as ta
 from chatterbox.tts import ChatterboxTTS
-
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
 import grpc
 
+def is_float(s):
+    """Check if a string can be converted to float."""
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+def is_int(s):
+    """Check if a string can be converted to int."""
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -47,6 +61,28 @@ def LoadModel(self, request, context):
         if not torch.cuda.is_available() and request.CUDA:
             return backend_pb2.Result(success=False, message="CUDA is not available")
 
+
+        options = request.Options
+
+        # empty dict
+        self.options = {}
+
+        # The options are a list of strings in this form optname:optvalue
+        # We are storing all the options in a dict so we can use it later when
+        # generating the images
+        for opt in options:
+            if ":" not in opt:
+                continue
+            key, value = opt.split(":")
+            # if value is a number, convert it to the appropriate type
+            if is_float(value):
+                value = float(value)
+            elif is_int(value):
+                value = int(value)
+            elif value.lower() in ["true", "false"]:
+                value = value.lower() == "true"
+            self.options[key] = value
+
         self.AudioPath = None
 
         if os.path.isabs(request.AudioPath):
@@ -56,10 +92,14 @@ def LoadModel(self, request, context):
             modelFileBase = os.path.dirname(request.ModelFile)
             # modify LoraAdapter to be relative to modelFileBase
             self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
-
         try:
             print("Preparing models, please wait", file=sys.stderr)
-            self.model = ChatterboxTTS.from_pretrained(device=device)
+            if "multilingual" in self.options:
+                # remove key from options
+                del self.options["multilingual"]
+                self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+            else:
+                self.model = ChatterboxTTS.from_pretrained(device=device)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
@@ -68,12 +108,18 @@ def LoadModel(self, request, context):
 
     def TTS(self, request, context):
         try:
-            # Generate audio using ChatterboxTTS
+            kwargs = {}
+
+            if "language" in self.options:
+                kwargs["language_id"] = self.options["language"]
             if self.AudioPath is not None:
-                wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
-            else:
-                wav = self.model.generate(request.text)
-
+                kwargs["audio_prompt_path"] = self.AudioPath
+
+            # add options to kwargs
+            kwargs.update(self.options)
+
+            # Generate audio using ChatterboxTTS
+            wav = self.model.generate(request.text, **kwargs)
 
             # Save the generated audio
             ta.save(request.dst, wav, self.model.sr)

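The backend.py changes add generic option handling to the chatterbox backend: model options of the form optname:optvalue are parsed into a typed dict, a multilingual option switches loading to ChatterboxMultilingualTTS, and at generation time a language option is mapped to the language_id keyword while the remaining options are forwarded to model.generate(). A standalone sketch of that flow, using a hypothetical options list (the exaggeration entry is just an example of a numeric pass-through value):

def is_float(s):
    """Check if a string can be converted to float."""
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_int(s):
    """Check if a string can be converted to int."""
    try:
        int(s)
        return True
    except ValueError:
        return False

def parse_options(options):
    """Mirror LoadModel above: split "optname:optvalue" pairs and coerce the value type."""
    parsed = {}
    for opt in options:
        if ":" not in opt:
            continue
        key, value = opt.split(":")
        # Note: is_float() also accepts plain integers, so "5" is stored as 5.0,
        # exactly as in the backend code.
        if is_float(value):
            value = float(value)
        elif is_int(value):
            value = int(value)
        elif value.lower() in ["true", "false"]:
            value = value.lower() == "true"
        parsed[key] = value
    return parsed

# Hypothetical options as they could appear in a model configuration.
opts = parse_options(["multilingual:true", "language:fr", "exaggeration:0.7"])

use_multilingual = "multilingual" in opts  # LoadModel checks key presence, then drops it
opts.pop("multilingual", None)

kwargs = {}
if "language" in opts:
    kwargs["language_id"] = opts["language"]
kwargs.update(opts)  # everything left over is passed straight to model.generate()

print(use_multilingual)  # True
print(kwargs)            # {'language_id': 'fr', 'language': 'fr', 'exaggeration': 0.7}

Note that the language entry itself also survives kwargs.update() here, mirroring the TTS hunk above.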