
Commit c7e2046

Merge branch 'master' into dependabot/pip/backend/python/transformers/sentence-transformers-5.1.1
2 parents 56ed3f6 + 33c1419

26 files changed: +216 −59 lines

.github/workflows/backend.yml

Lines changed: 13 additions & 1 deletion

@@ -882,7 +882,7 @@ jobs:
           backend: "rfdetr"
           dockerfile: "./backend/Dockerfile.python"
           context: "./backend"
-        - build-type: 'cublas'
+        - build-type: 'l4t'
           cuda-major-version: "12"
           cuda-minor-version: "0"
           platforms: 'linux/arm64'
@@ -955,6 +955,18 @@ jobs:
           backend: "exllama2"
           dockerfile: "./backend/Dockerfile.python"
           context: "./backend"
+        - build-type: 'l4t'
+          cuda-major-version: "12"
+          cuda-minor-version: "0"
+          platforms: 'linux/arm64'
+          skip-drivers: 'true'
+          tag-latest: 'auto'
+          tag-suffix: '-gpu-nvidia-l4t-arm64-chatterbox'
+          base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+          runs-on: 'ubuntu-24.04-arm'
+          backend: "chatterbox"
+          dockerfile: "./backend/Dockerfile.python"
+          context: "./backend"
         # runs out of space on the runner
         # - build-type: 'hipblas'
         #   cuda-major-version: ""

.github/workflows/secscan.yaml

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ jobs:
         if: ${{ github.actor != 'dependabot[bot]' }}
       - name: Run Gosec Security Scanner
         if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.8
+        uses: securego/gosec@v2.22.9
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'

Dockerfile

Lines changed: 10 additions & 0 deletions

@@ -78,6 +78,16 @@ RUN <<EOT bash
     fi
 EOT
 
+# https://github.com/NVIDIA/Isaac-GR00T/issues/343
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
+        wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
+        dpkg -i cudss-local-tegra-repo-ubuntu2204-0.6.0_0.6.0-1_arm64.deb && \
+        cp /var/cudss-local-tegra-repo-ubuntu2204-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
+        apt-get update && apt-get -y install cudss
+    fi
+EOT
+
 # If we are building with clblas support, we need the libraries for the builds
 RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
     apt-get update && \

Makefile

Lines changed: 3 additions & 0 deletions

@@ -429,6 +429,9 @@ docker-build-kitten-tts:
 docker-save-kitten-tts: backend-images
 	docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar
 
+docker-save-chatterbox: backend-images
+	docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar
+
 docker-build-kokoro:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend
 

backend/cpp/llama-cpp/Makefile

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=da30ab5f8696cabb2d4620cdc0aa41a298c54fd6
+LLAMA_VERSION?=5f7e166cbf7b9ca928c7fad990098ef32358ac75
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
@@ -14,7 +14,7 @@ CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
 
 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF -DLLAMA_OPENSSL=OFF
 endif
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 4 additions & 9 deletions

@@ -231,6 +231,7 @@ static void params_parse(const backend::ModelOptions* request,
     params.cpuparams.n_threads = request->threads();
     params.n_gpu_layers = request->ngpulayers();
     params.n_batch = request->nbatch();
+    params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. increase the physical batch size"
     // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
     //params.n_parallel = 1;
     const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
@@ -801,11 +802,6 @@ class BackendServiceImpl final : public backend::Backend::Service {
            return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"documents\" must be a non-empty string array");
        }
 
-       // Tokenize the query
-       auto tokenized_query = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, request->query(), /* add_special */ false, true);
-       if (tokenized_query.size() != 1) {
-           return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "\"query\" must contain only a single prompt");
-       }
        // Create and queue the task
        json responses = json::array();
        bool error = false;
@@ -817,10 +813,9 @@ class BackendServiceImpl final : public backend::Backend::Service {
            documents.push_back(request->documents(i));
        }
 
-       auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
-       tasks.reserve(tokenized_docs.size());
-       for (size_t i = 0; i < tokenized_docs.size(); i++) {
-           auto tmp = format_rerank(ctx_server.vocab, tokenized_query[0], tokenized_docs[i]);
+       tasks.reserve(documents.size());
+       for (size_t i = 0; i < documents.size(); i++) {
+           auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, request->query(), documents[i]);
            server_task task = server_task(SERVER_TASK_TYPE_RERANK);
            task.id = ctx_server.queue_tasks.get_new_id();
            task.index = i;

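Taken together, the grpc-server.cpp hunks do two things: they mirror the configured n_batch into n_ubatch, so the physical micro-batch is no longer capped at the 512-token default that rejected longer reranking inputs, and they drop the server-side pre-tokenization of the query and documents, passing the raw strings (plus model, vocab and multimodal context) to format_rerank to track the llama.cpp bump above. A minimal client-side sketch of what the n_ubatch change enables, assuming LocalAI's Jina-style /v1/rerank endpoint on the default address and a hypothetical reranker model name:

import json
import urllib.request

# Hypothetical model name; any reranker configured in LocalAI would do.
payload = {
    "model": "jina-reranker-v1-base-en",
    "query": "organic skincare for sensitive skin",
    # A document well past 512 tokens: before the n_ubatch fix this could fail with
    # "input is too large to process. increase the physical batch size".
    "documents": ["Gentle, fragrance-free formulations for reactive skin. " * 200],
    "top_n": 1,
}

req = urllib.request.Request(
    "http://localhost:8080/v1/rerank",  # assumed default LocalAI address
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))

With the model's batch size raised, the backend can now size the physical batch from the configured n_batch instead of rejecting the input.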
backend/go/whisper/Makefile

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=44fa2f647cf2a6953493b21ab83b50d5f5dbc483
+WHISPER_CPP_VERSION?=32be14f8ebfc0498c2c619182f0d7f4c822d52c4
 
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 

backend/index.yaml

Lines changed: 12 additions & 0 deletions

@@ -353,6 +353,7 @@
       nvidia: "cuda12-chatterbox"
       metal: "metal-chatterbox"
       default: "cpu-chatterbox"
+      nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - &piper
   name: "piper"
   uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1239,6 +1240,7 @@
       nvidia: "cuda12-chatterbox-development"
       metal: "metal-chatterbox-development"
       default: "cpu-chatterbox-development"
+      nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - !!merge <<: *chatterbox
   name: "cpu-chatterbox"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
@@ -1249,6 +1251,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
   mirrors:
   - localai/localai-backends:master-cpu-chatterbox
+- !!merge <<: *chatterbox
+  name: "nvidia-l4t-arm64-chatterbox"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox"
+  mirrors:
+  - localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox
+- !!merge <<: *chatterbox
+  name: "nvidia-l4t-arm64-chatterbox-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox"
+  mirrors:
+  - localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox
 - !!merge <<: *chatterbox
   name: "metal-chatterbox"
   uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.74.0
+grpcio==1.75.1
 protobuf
 certifi

backend/python/chatterbox/backend.py

Lines changed: 54 additions & 8 deletions

@@ -14,9 +14,23 @@
 import torch
 import torchaudio as ta
 from chatterbox.tts import ChatterboxTTS
-
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
 import grpc
 
+def is_float(s):
+    """Check if a string can be converted to float."""
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+def is_int(s):
+    """Check if a string can be converted to int."""
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -47,6 +61,28 @@ def LoadModel(self, request, context):
         if not torch.cuda.is_available() and request.CUDA:
             return backend_pb2.Result(success=False, message="CUDA is not available")
 
+
+        options = request.Options
+
+        # empty dict
+        self.options = {}
+
+        # The options are a list of strings in this form optname:optvalue
+        # We are storing all the options in a dict so we can use it later when
+        # generating the images
+        for opt in options:
+            if ":" not in opt:
+                continue
+            key, value = opt.split(":")
+            # if value is a number, convert it to the appropriate type
+            if is_float(value):
+                value = float(value)
+            elif is_int(value):
+                value = int(value)
+            elif value.lower() in ["true", "false"]:
+                value = value.lower() == "true"
+            self.options[key] = value
+
         self.AudioPath = None
 
         if os.path.isabs(request.AudioPath):
@@ -56,10 +92,14 @@ def LoadModel(self, request, context):
             modelFileBase = os.path.dirname(request.ModelFile)
             # modify LoraAdapter to be relative to modelFileBase
             self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
-
         try:
             print("Preparing models, please wait", file=sys.stderr)
-            self.model = ChatterboxTTS.from_pretrained(device=device)
+            if "multilingual" in self.options:
+                # remove key from options
+                del self.options["multilingual"]
+                self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+            else:
+                self.model = ChatterboxTTS.from_pretrained(device=device)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
@@ -68,12 +108,18 @@ def LoadModel(self, request, context):
 
     def TTS(self, request, context):
         try:
-            # Generate audio using ChatterboxTTS
+            kwargs = {}
+
+            if "language" in self.options:
+                kwargs["language_id"] = self.options["language"]
             if self.AudioPath is not None:
-                wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
-            else:
-                wav = self.model.generate(request.text)
-
+                kwargs["audio_prompt_path"] = self.AudioPath
+
+            # add options to kwargs
+            kwargs.update(self.options)
+
+            # Generate audio using ChatterboxTTS
+            wav = self.model.generate(request.text, **kwargs)
 
             # Save the generated audio
             ta.save(request.dst, wav, self.model.sr)

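The backend.py changes add generic option handling to the chatterbox backend: model options of the form optname:optvalue are parsed into a typed dict, a multilingual option switches loading to ChatterboxMultilingualTTS, and at generation time a language option is mapped to the language_id keyword while the remaining options are forwarded to model.generate(). A standalone sketch of that flow, using a hypothetical options list (the exaggeration entry is just an example of a numeric pass-through value):

def is_float(s):
    """Check if a string can be converted to float."""
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_int(s):
    """Check if a string can be converted to int."""
    try:
        int(s)
        return True
    except ValueError:
        return False

def parse_options(options):
    """Mirror LoadModel above: split "optname:optvalue" pairs and coerce the value type."""
    parsed = {}
    for opt in options:
        if ":" not in opt:
            continue
        key, value = opt.split(":")
        # Note: is_float() also accepts plain integers, so "5" is stored as 5.0,
        # exactly as in the backend code.
        if is_float(value):
            value = float(value)
        elif is_int(value):
            value = int(value)
        elif value.lower() in ["true", "false"]:
            value = value.lower() == "true"
        parsed[key] = value
    return parsed

# Hypothetical options as they could appear in a model configuration.
opts = parse_options(["multilingual:true", "language:fr", "exaggeration:0.7"])

use_multilingual = "multilingual" in opts  # LoadModel checks key presence, then drops it
opts.pop("multilingual", None)

kwargs = {}
if "language" in opts:
    kwargs["language_id"] = opts["language"]
kwargs.update(opts)  # everything left over is passed straight to model.generate()

print(use_multilingual)  # True
print(kwargs)            # {'language_id': 'fr', 'language': 'fr', 'exaggeration': 0.7}

Note that the language entry itself also survives kwargs.update() here, mirroring the TTS hunk above.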