Convert OpenNMT-py on the fly and run inference with the model in memory #1578

Draft: wants to merge 10 commits into base: master
1 change: 1 addition & 0 deletions include/ctranslate2/models/language_model.h
@@ -22,6 +22,7 @@ namespace ctranslate2 {

protected:
void initialize(ModelReader& model_reader) override;
void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;

private:
std::shared_ptr<const Vocabulary> _vocabulary;
16 changes: 16 additions & 0 deletions include/ctranslate2/models/model.h
@@ -31,6 +31,17 @@ namespace ctranslate2 {
Device device = Device::CPU,
int device_index = 0,
ComputeType compute_type = ComputeType::DEFAULT);
static std::shared_ptr<const Model> load(const std::string& spec,
const size_t& spec_version,
const size_t& binary_version,
std::unordered_map<std::string, std::string>& alias,
std::unordered_map<std::string, std::vector<std::string>>& vocabularies,
std::unordered_map<std::string, StorageView>& variables,
const std::string& config,
Device device = Device::CPU,
int device_index = 0,
ComputeType compute_type = ComputeType::DEFAULT);


virtual std::unique_ptr<SequenceToSequenceReplica> as_sequence_to_sequence() const;
virtual std::unique_ptr<SequenceGeneratorReplica> as_sequence_generator() const;
@@ -86,6 +97,10 @@ namespace ctranslate2 {
return ScopedDeviceSetter(_device, _device_index);
}

void set_config(const std::string& config_str);
void set_revision(const size_t revision);
void set_binary_version(const size_t binary_version);

// If the model contains variables, they will be moved to the new device.
void set_device(const Device device, const int index = 0);

@@ -143,6 +158,7 @@ namespace ctranslate2 {

// Runs some initialization after the model is loaded.
virtual void initialize(ModelReader&) {}
virtual void initialize(std::unordered_map<std::string, std::vector<std::string>>&) {}

virtual std::unique_ptr<Model> clone() const = 0;

1 change: 1 addition & 0 deletions include/ctranslate2/models/sequence_to_sequence.h
@@ -32,6 +32,7 @@ namespace ctranslate2 {

protected:
virtual void initialize(ModelReader& model_reader) override;
virtual void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;

private:
std::vector<std::shared_ptr<const Vocabulary>> _source_vocabularies;
1 change: 1 addition & 0 deletions include/ctranslate2/models/transformer.h
@@ -34,6 +34,7 @@ namespace ctranslate2 {
protected:
bool is_linear_weight(const std::string& variable_name) const override;
void initialize(ModelReader& model_reader) override;
void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;
std::unique_ptr<Model> clone() const override;
};

1 change: 1 addition & 0 deletions include/ctranslate2/models/wav2vec2.h
@@ -41,6 +41,7 @@ namespace ctranslate2 {

protected:
void initialize(ModelReader& model_reader) override;
void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;
private:
std::shared_ptr<const Vocabulary> _vocabulary;
};
1 change: 1 addition & 0 deletions include/ctranslate2/models/whisper.h
@@ -90,6 +90,7 @@ namespace ctranslate2 {

protected:
void initialize(ModelReader& model_reader) override;
void initialize(std::unordered_map<std::string, std::vector<std::string>>& vocabularies) override;

private:
std::shared_ptr<const Vocabulary> _vocabulary;
39 changes: 39 additions & 0 deletions python/cpp/generator.cc
@@ -1,6 +1,7 @@
#include "module.h"

#include <ctranslate2/generator.h>
#include <ctranslate2/storage_view.h>

#include "replica_pool.h"

@@ -158,6 +159,44 @@ namespace ctranslate2 {
:obj:`model_path` acts as an identifier for this model.
)pbdoc")

.def(py::init<const std::string&, const size_t&, const size_t&, std::unordered_map<std::string, std::string>&,
std::unordered_map<std::string, std::vector<std::string>>&, std::unordered_map<std::string, StorageView>&, const std::string&, const std::string&, const std::variant<int, std::vector<int>>, const StringOrMap&, size_t, size_t, long>(),
py::arg("spec"),
py::arg("spec_revision"),
py::arg("binary_version"),
py::arg("aliases"),
py::arg("vocabularies"),
py::arg("variables"),
py::arg("config"),
py::arg("device")="cpu",
py::arg("device_index")=0,
py::arg("compute_type")="default",
py::arg("inter_threads")=1,
py::arg("intra_threads")=0,
py::arg("max_queued_batches")=0,
R"pbdoc(
Initializes the generator.

Arguments:
spec: The name of the model specification.
spec_revision: The model specification revision.
binary_version: The binary version of the converted model.
aliases: Dictionary of variable name aliases defined in the model.
vocabularies: Dictionary mapping a vocabulary name to its list of tokens.
variables: Dictionary mapping a variable name to its weights as a StorageView.
config: Model configuration as a JSON string (the content normally saved in config.json).
device: Device to use (possible values are: cpu, cuda, auto).
device_index: Device IDs on which to place this generator.
compute_type: Model computation type or a dictionary mapping a device name
to the computation type (possible values are: default, auto, int8, int8_float32,
int8_float16, int8_bfloat16, int16, float16, bfloat16, float32).
inter_threads: Maximum number of parallel generations.
intra_threads: Number of OpenMP threads per generator (0 to use a default value).
max_queued_batches: Maximum numbers of batches in the queue (-1 for unlimited,
0 for an automatic value). When the queue is full, future requests will block
until a free slot is available.
)pbdoc")

.def_property_readonly("device", &GeneratorWrapper::device,
"Device this generator is running on.")
.def_property_readonly("device_index", &GeneratorWrapper::device_index,
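For reference, a minimal sketch of how this new in-memory constructor could be called from Python. Every value below is a placeholder (a real call would pass the complete spec metadata, vocabularies, and weight tensors produced by a converter), so the snippet only illustrates the argument shapes and will not build a usable model.

```python
import ctranslate2

# Illustrative only: argument shapes of the in-memory constructor added by this PR.
# All values are placeholders; a real call would pass the spec name, revisions,
# aliases, vocabularies, variables (as ctranslate2.StorageView) and config
# produced by a converter.
generator = ctranslate2.Generator(
    spec="TransformerDecoderSpec",                          # assumed spec name
    spec_revision=1,                                        # placeholder
    binary_version=6,                                       # placeholder
    aliases={},                                             # variable name aliases
    vocabularies={"vocabulary": ["<unk>", "<s>", "</s>"]},  # name -> token list
    variables={},                                           # name -> StorageView of weights
    config="{}",                                            # JSON normally stored in config.json
    device="cpu",
    compute_type="default",
)
```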
68 changes: 60 additions & 8 deletions python/cpp/replica_pool.h
@@ -1,7 +1,11 @@
#pragma once

#include <ctranslate2/replica_pool.h>
#include <ctranslate2/models/model_factory.h>
#include <ctranslate2/models/model.h>

#include <unordered_map>
#include <optional>
#include "utils.h"

namespace ctranslate2 {
@@ -49,15 +53,53 @@
{
pybind11::gil_scoped_release nogil;

_model_loader.device = str_to_device(device);
_model_loader.device_indices = std::visit(DeviceIndexResolver(), device_index);
_model_loader.compute_type = std::visit(ComputeTypeResolver(device), compute_type);
_model_loader.num_replicas_per_device = inter_threads;
_model_loader->device = str_to_device(device);
_model_loader->device_indices = std::visit(DeviceIndexResolver(), device_index);
_model_loader->compute_type = std::visit(ComputeTypeResolver(device), compute_type);
_model_loader->num_replicas_per_device = inter_threads;

_pool_config.num_threads_per_replica = intra_threads;
_pool_config.max_queued_batches = max_queued_batches;

_pool = std::make_unique<T>(_model_loader, _pool_config);
_pool = std::make_unique<T>(_model_loader.value(), _pool_config);
}

ReplicaPoolHelper(const std::string& spec,
const size_t& spec_version,
const size_t& binary_version,
std::unordered_map<std::string, std::string>& aliases,
std::unordered_map<std::string, std::vector<std::string>>& vocabularies,
std::unordered_map<std::string, StorageView>& variables,
const std::string& config,
const std::string& device,
const std::variant<int, std::vector<int>>& device_index,
const StringOrMap& compute_type,
size_t, // inter_threads
size_t intra_threads,
long max_queued_batches)
{
pybind11::gil_scoped_release nogil;

// Load the variables.
auto model_device = str_to_device(device);
auto model_device_indices = std::visit(DeviceIndexResolver(), device_index)[0];
auto model_compute_type = std::visit(ComputeTypeResolver(device), compute_type);

auto model = models::Model::load(spec,
spec_version,
binary_version,
aliases,
vocabularies,
variables,
config,
model_device,
model_device_indices,
model_compute_type);

_pool_config.num_threads_per_replica = intra_threads;
_pool_config.max_queued_batches = max_queued_batches;

_pool = std::make_unique<T>(model, _pool_config);
}

~ReplicaPoolHelper() {
@@ -66,11 +108,19 @@
}

std::string device() const {
return device_to_str(_model_loader.device);
if (_model_loader.has_value())
return device_to_str(_model_loader->device);
if (_device)
return _device.value();
return "";
}

const std::vector<int>& device_index() const {
return _model_loader.device_indices;
if (_model_loader.has_value())
return _model_loader->device_indices;
if (!_device_index.has_value() || _device_index->empty())
throw pybind11::type_error("No device index found");
return _device_index.value();
}

std::string compute_type() const {
@@ -91,7 +141,9 @@

protected:
std::unique_ptr<T> _pool;
models::ModelLoader _model_loader;
std::optional<models::ModelLoader> _model_loader;
std::optional<std::string> _device;
std::optional<std::vector<int>> _device_index;
ReplicaPoolConfig _pool_config;

const std::shared_ptr<const models::Model>& model() const {
8 changes: 4 additions & 4 deletions python/cpp/translator.cc
@@ -42,9 +42,9 @@ namespace ctranslate2 {
intra_threads,
max_queued_batches,
files)
, _device(_model_loader.device)
, _device_index(_model_loader.device_indices)
, _num_replicas_per_device(_model_loader.num_replicas_per_device)
, _device(_model_loader->device)
, _device_index(_model_loader->device_indices)
, _num_replicas_per_device(_model_loader->num_replicas_per_device)
, _model_is_loaded(true) {
}

@@ -324,7 +324,7 @@ namespace ctranslate2 {
return;

if (_cached_models.empty()) {
_cached_models = _model_loader.load();
_cached_models = _model_loader->load();
} else {
move_cached_models(_device, _device_index, _num_replicas_per_device);
}
1 change: 1 addition & 0 deletions python/ctranslate2/__init__.py
@@ -39,6 +39,7 @@
set_random_seed,
)
from ctranslate2.extensions import register_extensions
from ctranslate2.generator_on_the_fly import GeneratorOnTheFly
from ctranslate2.logging import get_log_level, set_log_level

register_extensions()
36 changes: 36 additions & 0 deletions python/ctranslate2/converters/converter.py
@@ -104,6 +104,42 @@ def convert(
model_spec.save(output_dir)
return output_dir

def convert_on_the_fly(
self,
vmap: Optional[str] = None,
quantization: Optional[str] = None,
) -> ModelSpec:
"""Converts the model to the CTranslate2 format.

Arguments:
vmap: Optional path to a vocabulary mapping file that will be included
in the converted model directory.
quantization: Weight quantization scheme (possible values are: int8, int8_float32,
int8_float16, int8_bfloat16, int16, float16, bfloat16, float32).

Returns:
Path to the output directory.

Raises:
RuntimeError: If the output directory already exists and :obj:`force`
is not set.
NotImplementedError: If the converter cannot convert this model to the
CTranslate2 format.
"""
model_spec = self._load()
if model_spec is None:
raise NotImplementedError(
"This model is not supported by CTranslate2 or this converter"
)
if vmap is not None:
model_spec.register_vocabulary_mapping(vmap)

model_spec.validate()
model_spec.optimize(quantization=quantization)
# model_spec.save(output_dir, False)

return model_spec

@abc.abstractmethod
def _load(self):
raise NotImplementedError()
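A hedged usage sketch of the full on-the-fly path: the converter returns a ModelSpec that stays in memory and is handed straight to an inference pool instead of being saved to a directory. The checkpoint path is hypothetical and the GeneratorOnTheFly call is illustrative only, since its signature is not part of this diff.

```python
import ctranslate2
from ctranslate2.converters import OpenNMTPyConverter

# Convert an OpenNMT-py checkpoint without writing anything to disk.
converter = OpenNMTPyConverter("model_step_1000.pt")  # hypothetical checkpoint path
model_spec = converter.convert_on_the_fly(quantization="int8")  # ModelSpec kept in memory

# Hand the in-memory spec to the wrapper this PR imports in ctranslate2/__init__.py.
# Its exact signature is not shown in this diff; this call is illustrative.
generator = ctranslate2.GeneratorOnTheFly(model_spec, device="cpu")
```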