Convert OpenNMT-py on the fly and run inference with the model in memory #1578

Draft: wants to merge 10 commits into base: master
1 change: 1 addition & 0 deletions include/ctranslate2/models/language_model.h
@@ -22,6 +22,7 @@ namespace ctranslate2 {

protected:
void initialize(ModelReader& model_reader) override;
void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;

private:
std::shared_ptr<const Vocabulary> _vocabulary;
16 changes: 16 additions & 0 deletions include/ctranslate2/models/model.h
@@ -31,6 +31,17 @@ namespace ctranslate2 {
Device device = Device::CPU,
int device_index = 0,
ComputeType compute_type = ComputeType::DEFAULT);
static std::shared_ptr<const Model> load(const std::string& spec,
const size_t& spec_version,
const size_t& binary_version,
std::unordered_map<std::string, std::string>& alias,
std::unordered_map<std::string, std::vector<std::string>>& vocabularies,
std::unordered_map<std::string, StorageView>& variables,
const std::string& config,
Device device = Device::CPU,
int device_index = 0,
ComputeType compute_type = ComputeType::DEFAULT);


virtual std::unique_ptr<SequenceToSequenceReplica> as_sequence_to_sequence() const;
virtual std::unique_ptr<SequenceGeneratorReplica> as_sequence_generator() const;
@@ -86,6 +97,10 @@ namespace ctranslate2 {
return ScopedDeviceSetter(_device, _device_index);
}

void set_config(const std::string& config_str);
void set_revision(const size_t revision);
void set_binary_version(const size_t binary_version);

// If the model contains variables, they will be moved to the new device.
void set_device(const Device device, const int index = 0);

@@ -143,6 +158,7 @@ namespace ctranslate2 {

// Runs some initialization after the model is loaded.
virtual void initialize(ModelReader&) {}
virtual void initialize(std::unordered_map<std::string, std::vector<std::string>>&) {}

virtual std::unique_ptr<Model> clone() const = 0;

1 change: 1 addition & 0 deletions include/ctranslate2/models/sequence_to_sequence.h
@@ -32,6 +32,7 @@ namespace ctranslate2 {

protected:
virtual void initialize(ModelReader& model_reader) override;
virtual void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;

private:
std::vector<std::shared_ptr<const Vocabulary>> _source_vocabularies;
1 change: 1 addition & 0 deletions include/ctranslate2/models/transformer.h
@@ -34,6 +34,7 @@ namespace ctranslate2 {
protected:
bool is_linear_weight(const std::string& variable_name) const override;
void initialize(ModelReader& model_reader) override;
void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;
std::unique_ptr<Model> clone() const override;
};

1 change: 1 addition & 0 deletions include/ctranslate2/models/wav2vec2.h
@@ -41,6 +41,7 @@ namespace ctranslate2 {

protected:
void initialize(ModelReader& model_reader) override;
void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;
private:
std::shared_ptr<const Vocabulary> _vocabulary;
};
1 change: 1 addition & 0 deletions include/ctranslate2/models/whisper.h
@@ -90,6 +90,7 @@ namespace ctranslate2 {

protected:
void initialize(ModelReader& model_reader) override;
void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;

private:
std::shared_ptr<const Vocabulary> _vocabulary;
39 changes: 39 additions & 0 deletions python/cpp/generator.cc
@@ -1,6 +1,7 @@
#include "module.h"

#include <ctranslate2/generator.h>
#include <ctranslate2/storage_view.h>

#include "replica_pool.h"

@@ -158,6 +159,44 @@ namespace ctranslate2 {
:obj:`model_path` acts as an identifier for this model.
)pbdoc")

.def(py::init<const std::string&, const size_t&, const size_t&, std::unordered_map<std::string, std::string>&,
std::unordered_map<std::string, std::vector<std::string>>&, std::unordered_map<std::string, StorageView>&, const std::string&, const std::string&, const std::variant<int, std::vector<int>>, const StringOrMap&, size_t, size_t, long>(),
py::arg("spec"),
py::arg("spec_revision"),
py::arg("binary_version"),
py::arg("aliases"),
py::arg("vocabularies"),
py::arg("variables"),
py::arg("config"),
py::arg("device")="cpu",
py::arg("device_index")=0,
py::arg("compute_type")="default",
py::arg("inter_threads")=1,
py::arg("intra_threads")=0,
py::arg("max_queued_batches")=0,
R"pbdoc(
Initializes the generator.

Arguments:
spec: The name of the model specification.
spec_revision: The model specification revision.
binary_version: The binary version of the converted model.
aliases: Dictionary of variable name aliases defined in the model.
vocabularies: Dictionary mapping a vocabulary name to its list of tokens.
variables: Dictionary mapping a variable name to its weights as a StorageView.
config: Model configuration as a JSON string (the content normally saved in config.json).
device: Device to use (possible values are: cpu, cuda, auto).
device_index: Device IDs on which to place this generator.
compute_type: Model computation type or a dictionary mapping a device name
to the computation type (possible values are: default, auto, int8, int8_float32,
int8_float16, int8_bfloat16, int16, float16, bfloat16, float32).
inter_threads: Maximum number of parallel generations.
intra_threads: Number of OpenMP threads per generator (0 to use a default value).
max_queued_batches: Maximum numbers of batches in the queue (-1 for unlimited,
0 for an automatic value). When the queue is full, future requests will block
until a free slot is available.
)pbdoc")

.def_property_readonly("device", &GeneratorWrapper::device,
"Device this generator is running on.")
.def_property_readonly("device_index", &GeneratorWrapper::device_index,
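For reference, a minimal sketch of how this new in-memory constructor could be called from Python. Every value below is a placeholder (a real call would pass the complete spec metadata, vocabularies, and weight tensors produced by a converter), so the snippet only illustrates the argument shapes and will not build a usable model.

```python
import ctranslate2

# Illustrative only: argument shapes of the in-memory constructor added by this PR.
# All values are placeholders; a real call would pass the spec name, revisions,
# aliases, vocabularies, variables (as ctranslate2.StorageView) and config
# produced by a converter.
generator = ctranslate2.Generator(
    spec="TransformerDecoderSpec",                          # assumed spec name
    spec_revision=1,                                        # placeholder
    binary_version=6,                                       # placeholder
    aliases={},                                             # variable name aliases
    vocabularies={"vocabulary": ["<unk>", "<s>", "</s>"]},  # name -> token list
    variables={},                                           # name -> StorageView of weights
    config="{}",                                            # JSON normally stored in config.json
    device="cpu",
    compute_type="default",
)
```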
68 changes: 60 additions & 8 deletions python/cpp/replica_pool.h
@@ -1,7 +1,11 @@
#pragma once

#include <ctranslate2/replica_pool.h>
#include <ctranslate2/models/model_factory.h>
#include <ctranslate2/models/model.h>

#include <unordered_map>
#include <optional>
#include "utils.h"

namespace ctranslate2 {
@@ -49,15 +53,53 @@
{
pybind11::gil_scoped_release nogil;

_model_loader.device = str_to_device(device);
_model_loader.device_indices = std::visit(DeviceIndexResolver(), device_index);
_model_loader.compute_type = std::visit(ComputeTypeResolver(device), compute_type);
_model_loader.num_replicas_per_device = inter_threads;
_model_loader->device = str_to_device(device);
_model_loader->device_indices = std::visit(DeviceIndexResolver(), device_index);
_model_loader->compute_type = std::visit(ComputeTypeResolver(device), compute_type);
_model_loader->num_replicas_per_device = inter_threads;

_pool_config.num_threads_per_replica = intra_threads;
_pool_config.max_queued_batches = max_queued_batches;

_pool = std::make_unique<T>(_model_loader, _pool_config);
_pool = std::make_unique<T>(_model_loader.value(), _pool_config);
}

ReplicaPoolHelper(const std::string& spec,
const size_t& spec_version,
const size_t& binary_version,
std::unordered_map<std::string, std::string>& aliases,
std::unordered_map<std::string, std::vector<std::string>>& vocabularies,
std::unordered_map<std::string, StorageView>& variables,
const std::string& config,
const std::string& device,
const std::variant<int, std::vector<int>>& device_index,
const StringOrMap& compute_type,
size_t, // inter_threads
size_t intra_threads,
long max_queued_batches)
{
pybind11::gil_scoped_release nogil;

// Load the variables.
auto model_device = str_to_device(device);
auto model_device_indices = std::visit(DeviceIndexResolver(), device_index)[0];
auto model_compute_type = std::visit(ComputeTypeResolver(device), compute_type);

auto model = models::Model::load(spec,
spec_version,
binary_version,
aliases,
vocabularies,
variables,
config,
model_device,
model_device_indices,
model_compute_type);

_pool_config.num_threads_per_replica = intra_threads;
_pool_config.max_queued_batches = max_queued_batches;

_pool = std::make_unique<T>(model, _pool_config);
}

~ReplicaPoolHelper() {
@@ -66,11 +108,19 @@
}

std::string device() const {
return device_to_str(_model_loader.device);
if (_model_loader.has_value())
return device_to_str(_model_loader->device);
if (_device)
return _device.value();
return "";
}

const std::vector<int>& device_index() const {
return _model_loader.device_indices;
if (_model_loader.has_value())
return _model_loader->device_indices;
if (!_device_index.has_value() || _device_index->empty())
throw pybind11::type_error("No device index found");
return _device_index.value();
}

std::string compute_type() const {
@@ -91,7 +141,9 @@

protected:
std::unique_ptr<T> _pool;
models::ModelLoader _model_loader;
std::optional<models::ModelLoader> _model_loader;
std::optional<std::string> _device;
std::optional<std::vector<int>> _device_index;
ReplicaPoolConfig _pool_config;

const std::shared_ptr<const models::Model>& model() const {
8 changes: 4 additions & 4 deletions python/cpp/translator.cc
@@ -42,9 +42,9 @@ namespace ctranslate2 {
intra_threads,
max_queued_batches,
files)
, _device(_model_loader.device)
, _device_index(_model_loader.device_indices)
, _num_replicas_per_device(_model_loader.num_replicas_per_device)
, _device(_model_loader->device)
, _device_index(_model_loader->device_indices)
, _num_replicas_per_device(_model_loader->num_replicas_per_device)
, _model_is_loaded(true) {
}

@@ -324,7 +324,7 @@ namespace ctranslate2 {
return;

if (_cached_models.empty()) {
_cached_models = _model_loader.load();
_cached_models = _model_loader->load();
} else {
move_cached_models(_device, _device_index, _num_replicas_per_device);
}
1 change: 1 addition & 0 deletions python/ctranslate2/__init__.py
@@ -39,6 +39,7 @@
set_random_seed,
)
from ctranslate2.extensions import register_extensions
from ctranslate2.generator_on_the_fly import GeneratorOnTheFly
from ctranslate2.logging import get_log_level, set_log_level

register_extensions()
36 changes: 36 additions & 0 deletions python/ctranslate2/converters/converter.py
@@ -104,6 +104,42 @@ def convert(
model_spec.save(output_dir)
return output_dir

def convert_on_the_fly(
self,
vmap: Optional[str] = None,
quantization: Optional[str] = None,
) -> ModelSpec:
"""Converts the model to the CTranslate2 format.

Arguments:
vmap: Optional path to a vocabulary mapping file that will be included
in the converted model directory.
quantization: Weight quantization scheme (possible values are: int8, int8_float32,
int8_float16, int8_bfloat16, int16, float16, bfloat16, float32).

Returns:
Path to the output directory.

Raises:
RuntimeError: If the output directory already exists and :obj:`force`
is not set.
NotImplementedError: If the converter cannot convert this model to the
CTranslate2 format.
"""
model_spec = self._load()
if model_spec is None:
raise NotImplementedError(
"This model is not supported by CTranslate2 or this converter"
)
if vmap is not None:
model_spec.register_vocabulary_mapping(vmap)

model_spec.validate()
model_spec.optimize(quantization=quantization)
# model_spec.save(output_dir, False)

return model_spec

@abc.abstractmethod
def _load(self):
raise NotImplementedError()
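A hedged usage sketch of the full on-the-fly path: the converter returns a ModelSpec that stays in memory and is handed straight to an inference pool instead of being saved to a directory. The checkpoint path is hypothetical and the GeneratorOnTheFly call is illustrative only, since its signature is not part of this diff.

```python
import ctranslate2
from ctranslate2.converters import OpenNMTPyConverter

# Convert an OpenNMT-py checkpoint without writing anything to disk.
converter = OpenNMTPyConverter("model_step_1000.pt")  # hypothetical checkpoint path
model_spec = converter.convert_on_the_fly(quantization="int8")  # ModelSpec kept in memory

# Hand the in-memory spec to the wrapper this PR imports in ctranslate2/__init__.py.
# Its exact signature is not shown in this diff; this call is illustrative.
generator = ctranslate2.GeneratorOnTheFly(model_spec, device="cpu")
```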