10 changes: 6 additions & 4 deletions tests/test-thread-safety.cpp
@@ -3,6 +3,7 @@
 // - Creates n_parallel (--parallel) contexts per model
 // - Runs inference in parallel on each context
 
+#include <array>
 #include <thread>
 #include <vector>
 #include <atomic>
@@ -38,13 +39,14 @@ int main(int argc, char ** argv) {
     cparams.n_seq_max = 1;
 
     int dev_count = ggml_backend_dev_count();
-    int gpu_dev_count = 0;
+    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
     for (int i = 0; i < dev_count; ++i) {
         auto * dev = ggml_backend_dev_get(i);
         if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            gpu_dev_count++;
+            gpus.push_back({dev, nullptr});
         }
     }
+    const int gpu_dev_count = (int)gpus.size();
     const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
     //const int num_models = std::max(1, gpu_dev_count);
     const int num_contexts = std::max(1, params.n_parallel);
@@ -58,12 +60,12 @@ int main(int argc, char ** argv) {
 
         if (m < gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
-            mparams.main_gpu = m;
+            mparams.devices = gpus[m].data();
         } else if (m == gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
             mparams.main_gpu = -1; // CPU model
         } else {
-            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
+            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
         }
 
         llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
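Note on the pattern used here: each GPU is stored as a two-element `std::array` because `llama_model_params.devices` takes a NULL-terminated list of `ggml_backend_dev_t`, so `{dev, nullptr}` is already a complete single-device list whose `.data()` pointer can be handed straight to a model. The sketch below shows that pattern in isolation, assuming the llama.cpp API this test already uses (`llama_model_default_params`, `llama_model_load_from_file`, `llama_model_free`); the `"model.gguf"` path is a placeholder for illustration, not something from the PR.

```cpp
#include <array>
#include <vector>

#include "llama.h"
#include "ggml-backend.h"

int main() {
    llama_backend_init();

    // Collect every GPU backend device; each entry is a single-device,
    // NULL-terminated list as expected by llama_model_params.devices.
    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            gpus.push_back({dev, nullptr}); // trailing nullptr terminates the list
        }
    }

    // Pin one model to each GPU explicitly via the devices list,
    // instead of selecting the device by index through main_gpu.
    for (size_t m = 0; m < gpus.size(); ++m) {
        llama_model_params mparams = llama_model_default_params();
        mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
        mparams.devices    = gpus[m].data(); // NULL-terminated device list

        llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
        if (model != nullptr) {
            llama_model_free(model);
        }
    }

    llama_backend_free();
    return 0;
}
```

The test's remaining two models (the CPU-only one and the layer-split one) leave `devices` at its default and keep relying on `main_gpu`/`split_mode`, as shown in the second hunk above.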