browsermt · XapaJIaMnu · May 9, 2023 · May 9, 2023 · May 9, 2023 · May 25, 2023
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -105,6 +105,9 @@ jobs:
               ccache -s # Print current cache stats
               ccache -z # Zero cache entry
 
+              python -m pip install --upgrade pip
+              pip install pybind11 pybind11-global
+
             CIBW_BEFORE_BUILD_MACOS: |
               brew install openblas protobuf ccache boost pybind11
               chmod -R a+rwx ${{ env.ccache_dir }}
@@ -375,10 +378,8 @@ jobs:
             python3 -m pip install black isort pytype
       - name: "Formatting checks: black, isort"
         run: |
-            python3 -m black --diff --check bindings/python/ setup.py doc/conf.py
+            python3 -m black --diff --check bindings/python/ setup.py doc/conf.py --exclude bindings/python/translator.py
             python3 -m isort --profile black --diff --check bindings/python setup.py doc/conf.py
-      - name: "Static typing checks: pytype"
-        run: |-
             python3 -m pytype bindings/python
 
     docs:

diff --git a/.gitmodules b/.gitmodules
@@ -7,6 +7,3 @@
 [submodule "bergamot-translator-tests"]
 	path = bergamot-translator-tests
 	url = https://github.com/browsermt/bergamot-translator-tests
-[submodule "3rd_party/pybind11"]
-	path = 3rd_party/pybind11
-	url = https://github.com/pybind/pybind11.git
diff --git a/3rd_party/CMakeLists.txt b/3rd_party/CMakeLists.txt
@@ -30,7 +30,3 @@ get_directory_property(CMAKE_C_FLAGS DIRECTORY marian-dev DEFINITION CMAKE_C_FLA
 get_directory_property(CMAKE_CXX_FLAGS DIRECTORY marian-dev DEFINITION CMAKE_CXX_FLAGS) 
 set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} PARENT_SCOPE)    
 set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} PARENT_SCOPE)    
-
-if(COMPILE_PYTHON)
-  add_subdirectory(pybind11)
-endif(COMPILE_PYTHON)
diff --git a/3rd_party/pybind11 b/3rd_party/pybind11
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -84,7 +84,6 @@ cmake_dependent_option(ENABLE_CACHE_STATS "Enable stats on cache" ON "COMPILE_TE
 # Set 3rd party submodule specific cmake options for this project
 SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
 SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
-SET(USE_STATIC_LIBS ON CACHE BOOL "Link statically against non-system libs")
 SET(SSPLIT_COMPILE_LIBRARY_ONLY ON CACHE BOOL "Do not compile ssplit tests")
 if (USE_WASM_COMPATIBLE_SOURCE)
   SET(COMPILE_LIBRARY_ONLY ON CACHE BOOL "Build only the Marian library and exclude all executables.")

diff --git a/README.md b/README.md
@@ -80,3 +80,46 @@ A short example of how to use the APIs is provided in `app/bergamot.cpp` file.
 ### Using WASM version
 
 Please follow the `README` inside the `wasm` folder of this repository that demonstrates how to use the translator in JavaScript.
+
+### Using python API
+
+Compile and install:
+```
+export CMAKE_BUILD_PARALLEL_LEVEL=8 # Use 8 cores to compile
+pip install wheel
+pip install .
+
+# Desktop app
+% bergamot-translator --help
+bergamot-translator interface
+
+options:
+  -h, --help            show this help message and exit
+  --config CONFIG, -c CONFIG
+                        Model YML configuration input.
+  --num-workers NUM_WORKERS, -n NUM_WORKERS
+                        Number of CPU workers.
+  --logging LOGGING, -l LOGGING
+                        Set verbosity level of logging: trace, debug, info, warn, err(or), critical, off. Default is off
+  --cache-size CACHE_SIZE
+                        Cache size. 0 for caching is disabled
+  --terminology-tsv TERMINOLOGY_TSV, -t TERMINOLOGY_TSV
+                        Path to a terminology file TSV
+  --force-terminology, -f
+                        Force terminology to appear on the target side.
+  --path-to-input PATH_TO_INPUT, -i PATH_TO_INPUT
+                        Path to input file. Uses stdin if empty
+```
+Using the python interface
+```python
+from bergamot.translator import Translator
+print(Translator.__doc__)
+translator = Translator("/path/to/model.npz.best-bleu.npz.decoder.brg.yml", terminology="/path/to/terminology.tsv")
+translator.translate(["text"])
+[output]
+new_terminology = {}
+new_terminology['srcwrd'] = "trgwrd"
+translator.reset_terminology(new_terminology)
+translator.translate(["text"])
+[output_with_terminology]
+```
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_executable(bergamot bergamot.cpp)
-target_link_libraries(bergamot PRIVATE bergamot-translator)
+target_link_libraries(bergamot bergamot-translator)
diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt
@@ -1,3 +1,4 @@
+find_package(pybind11 REQUIRED)
 find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
 
 message("Using Python: " ${Python_EXECUTABLE})

diff --git a/bindings/python/bergamot.cpp b/bindings/python/bergamot.cpp
@@ -1,3 +1,4 @@
+// #define PYBIND11_DETAILED_ERROR_MESSAGES // Enables debugging
 #include <pybind11/iostream.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
@@ -12,6 +13,7 @@
 
 #include <iostream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace py = pybind11;
@@ -28,7 +30,9 @@ using Alignment = std::vector<std::vector<float>>;
 using Alignments = std::vector<Alignment>;
 
 PYBIND11_MAKE_OPAQUE(std::vector<Response>);
+PYBIND11_MAKE_OPAQUE(std::vector<size_t>);
 PYBIND11_MAKE_OPAQUE(std::vector<std::string>);
+PYBIND11_MAKE_OPAQUE(std::unordered_map<std::string, std::string>);
 PYBIND11_MAKE_OPAQUE(Alignments);
 
 class ServicePyAdapter {
@@ -116,6 +120,18 @@ class ServicePyAdapter {
     return responses;
   }
 
+  // Converts a Python dict of terminology entries (source word -> target side terminology) into a C++ map
+  // and passes it, together with forceTerminology, to the wrapped service.
+  // Keys and values are both cast to std::string.
+  void setTerminology(py::dict terminology, bool forceTerminology = false) {
+    // It seems copying is not too bad for performance. Also this should happen rarely and with small objects
+    // https://github.com/pybind/pybind11/issues/3033
+    std::unordered_map<std::string, std::string> converted;
+    for (auto entry : terminology) {
+      // Cast the key first, then the value, one entry at a time.
+      std::string source = entry.first.cast<std::string>();
+      std::string target = entry.second.cast<std::string>();
+      converted[source] = target;
+    }
+    service_.setTerminology(converted, forceTerminology);
+  }
+
   private /*functions*/:
   static Service make_service(const Service::Config &config) {
     py::scoped_ostream_redirect outstream(std::cout,                                 // std::ostream&
@@ -195,19 +211,32 @@ PYBIND11_MODULE(_bergamot, m) {
       .def("modelFromConfig", &ServicePyAdapter::modelFromConfig)
       .def("modelFromConfigPath", &ServicePyAdapter::modelFromConfigPath)
       .def("translate", &ServicePyAdapter::translate)
-      .def("pivot", &ServicePyAdapter::pivot);
+      .def("pivot", &ServicePyAdapter::pivot)
+      .def("setTerminology", &ServicePyAdapter::setTerminology);
 
+  py::bind_vector<std::vector<size_t>>(m, "VectorSizeT");
   py::class_<Service::Config>(m, "ServiceConfig")
-      .def(py::init<>([](size_t numWorkers, size_t cacheSize, std::string logging) {
+      .def(py::init<>([](size_t numWorkers, std::vector<size_t> gpuWorkers, size_t cacheSize, std::string logging,
+                         std::string pathToTerminologyFile, bool terminologyForce, std::string terminologyForm) {
              Service::Config config;
              config.numWorkers = numWorkers;
+             config.gpuWorkers = gpuWorkers;
              config.cacheSize = cacheSize;
              config.logger.level = logging;
+             config.terminologyFile = pathToTerminologyFile;
+             config.terminologyForce = terminologyForce;
+             config.format = terminologyForm;
              return config;
            }),
-           py::arg("numWorkers") = 1, py::arg("cacheSize") = 0, py::arg("logLevel") = "off")
+           py::arg("numWorkers") = 1, py::arg("gpuWorkers") = std::vector<size_t>{}, py::arg("cacheSize") = 0,
+           py::arg("logLevel") = "off", py::arg("pathToTerminologyFile") = "", py::arg("terminologyForce") = false,
+           py::arg("terminologyForm") = "%s <tag0> %s </tag0> ")
       .def_readwrite("numWorkers", &Service::Config::numWorkers)
-      .def_readwrite("cacheSize", &Service::Config::cacheSize);
+      .def_readwrite("gpuWorkers", &Service::Config::gpuWorkers)
+      .def_readwrite("cacheSize", &Service::Config::cacheSize)
+      .def_readwrite("pathToTerminologyFile", &Service::Config::terminologyFile)
+      .def_readwrite("terminologyForce", &Service::Config::terminologyForce)
+      .def_readwrite("terminologyForm", &Service::Config::format);
 
   py::class_<_Model, std::shared_ptr<_Model>>(m, "TranslationModel");
 }
diff --git a/bindings/python/translator.py b/bindings/python/translator.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+import argparse
+from sys import stdin
+from typing import Dict, List, Optional, Union
+
+import bergamot  # type: ignore
+
+
+class Translator:
+    """Bergamot translator interfacing with the C++ code.
+
+    Attributes:
+        _num_workers        Number of parallel CPU workers.
+        _gpu_workers        Indices of the GPU devices used. _num_workers must be set to zero!
+        _cache:             Cache size. 0 to disable cache.
+        _logging:           Log level: trace, debug, info, warn, err(or), critical, off. Default is off
+        _terminology:       Path to a TSV terminology file
+        _force_terminology  Force the terminology to appear on the target side. May affect translation quality negatively.
+        _format             Format of the terminology string
+
+        _config            Translation model config
+        _model:            Translation model
+        _responseOpts      What to include in the response (alignment, html restoration, etc..)
+        _service           The translation service
+    """
+
+    _num_workers: int
+    _gpu_workers: List[int]
+    _cache: int
+    _logging: str
+    _terminology: str
+    _force_terminology: bool
+    _terminology_form: str
+
+    _config: bergamot.ServiceConfig
+    _model: bergamot.TranslationModel
+    _responseOpts: bergamot.ResponseOptions
+    _service: bergamot.Service
+
+    def __init__(
+        self,
+        model_config_path: str,
+        num_workers: int = 1,
+        gpu_workers: List[int] = [],
+        cache: int = 0,
+        logging="off",
+        terminology: str = "",
+        force_terminology: bool = False,
+        terminology_form: str = "%s __target__ %s __done__ ",
+    ):
+        """Initialises the translator class
+
+        :param model_config_path: Path to the configuration file for the translation model.
+        :param num_workers: Number of CPU workers.
+        :param gpu_workers: Indices of the GPU devices. num_workers must be zero if this is non-empty
+        :param cache: cache size. 0 means no cache.
+        :param logging: Log level: trace, debug, info, warn, err(or), critical, off.
+        :param terminology: Path to terminology file, TSV format
+        :param force_terminology: Force terminology to appear on the target side. May impact translation quality.
+        """
+        self._num_workers = num_workers
+        self._gpu_workers = gpu_workers
+        self._cache = cache
+        self._logging = logging
+        self._terminology = terminology
+        self._force_terminology = force_terminology
+        self._terminology_form = terminology_form
+
+        self._config = bergamot.ServiceConfig(
+            self._num_workers,
+            bergamot.VectorSizeT(self._gpu_workers),
+            self._cache,
+            self._logging,
+            self._terminology,
+            self._force_terminology,
+            self._terminology_form,
+        )
+        self._service = bergamot.Service(self._config)
+        self._responseOpts = (
+            bergamot.ResponseOptions()
+        )  # Default false for all, if we want to enable HTML later, from here
+        self._model = self._service.modelFromConfigPath(model_config_path)
+
+    def reset_terminology(
+        self, terminology: str = "", force_terminology: bool = False
+    ) -> None:
+        """Resets the terminology of the model
+        :param terminology: path to the terminology file.
+        :param force_terminology: force terminology
+        :return: None
+        """
+        self._terminology = terminology
+        self._force_terminology = force_terminology
+        self._config = bergamot.ServiceConfig(
+            self._num_workers,
+            bergamot.VectorSizeT(self._gpu_workers),
+            self._cache,
+            self._logging,
+            self._terminology,
+            self._force_terminology,
+            self._terminology_form,
+        )
+        self._service = bergamot.Service(self._config)
+
+    def reset_terminology(
+        self, terminology: Dict[str, str], force_terminology: bool = False
+    ) -> None:
+        """Resets the terminology of the model
+        :param terminology: Dictionary that maps source words to their target side terminology
+        :param force_terminology: force terminology
+        :return: None
+        """
+        self._service.setTerminology(terminology, force_terminology)
+
+    def reset_num_workers(self, num_workers) -> None:
+        """Resets the number of workers
+        :param num_workers: number of parallel CPU threads.
+        :return: None
+        """
+        self._num_workers = num_workers
+        self._config = bergamot.ServiceConfig(
+            self._num_workers,
+            bergamot.VectorSizeT(self._gpu_workers),
+            self._cache,
+            self._logging,
+            self._terminology,
+            self._force_terminology,
+            self._terminology_form,
+        )
+        self._service = bergamot.Service(self._config)
+
+    def reset_gpu_workers(self, gpu_workers: List[int]) -> None:
+        """Resets the number of GPU workers
+        :param gpu_workers: Indices of the GPU devices to be used.
+        :return: None
+        """
+        self._gpu_workers = gpu_workers
+        self._config = bergamot.ServiceConfig(
+            self._num_workers,
+            bergamot.VectorSizeT(self._gpu_workers),
+            self._cache,
+            self._logging,
+            self._terminology,
+            self._force_terminology,
+            self._terminology_form,
+        )
+        self._service = bergamot.Service(self._config)
+
+    def translate(self, sentences: List[str]) -> List[str]:
+        """Translates a list of strings
+        :param sentences: A List of strings to be translated.
+        :return: A list of translation outputs.
+        """
+        responses = self._service.translate(
+            self._model, bergamot.VectorString(sentences), self._responseOpts
+        )
+        return [response.target.text for response in responses]
+
+    # @TODO add async translate with futures
+
+
+def main():
+    parser = argparse.ArgumentParser(description="bergamot-translator interface")
+    parser.add_argument("--config", '-c', required=True, type=str, help='Model YML configuration input.')
+    parser.add_argument("--num-workers", '-n', type=int, default=1, help='Number of CPU workers.')
+    parser.add_argument("--num-gpus", "-g", type=int, action='append', nargs='+', default=None, help='List of GPUs to use.')
+    parser.add_argument("--logging", '-l', type=str, default="off", help='Set verbosity level of logging: trace, debug, info, warn, err(or), critical, off. Default is off')
+    parser.add_argument("--cache-size", type=int, default=0, help='Cache size. 0 for caching is disabled')
+    parser.add_argument("--terminology-tsv", '-t', default="", type=str, help='Path to a terminology file TSV')
+    parser.add_argument("--force-terminology", '-f', action="store_true", help='Force terminology to appear on the target side.')
+    parser.add_argument("--terminology-form", type=str, default="%s __target__ %s __done__ ", help='"Form for terminology. Default is "%%s __target__ %%s __done__ "')
+    parser.add_argument("--path-to-input", '-i', default=None, type=str, help="Path to input file. Uses stdin if empty")
+    parser.add_argument("--batch", '-b', default=32, type=int, help="Number of lines to process in a batch")
+    args = parser.parse_args()
+
+    if args.num_gpus is None:
+        num_gpus = []
+    else:
+        num_gpus = args.num_gpus[0]
+    translator = Translator(args.config, args.num_workers, num_gpus, args.cache_size, args.logging, args.terminology_tsv, args.force_terminology, args.terminology_form)
+
+
+    if args.path_to_input is None:
+        infile = stdin
+    else:
+        infile = open(args.path_to_input, "r", encoding="utf-8")
+
+    # In this example, each block of input text (i.e. a document) is a line.
+    # If you're using the API directly, feel free to include newlines in the
+    # block of text.  We aim to preserve whitespace at sentence boundaries.
+
+    # Buffer input text to allow the backend to parallelize.  We recommend
+    # there be about 16 sentences per worker (thread).  Note that blocks of
+    # text are internally split into sentences, so the number of sentences is
+    # typically larger than the length of the list of blocks provided.
+    buffer = []
+    for line in infile:
+        buffer.append(line.strip())
+        if len(buffer) >= args.batch:
+            print("\n".join(translator.translate(buffer)))
+            buffer = []
+
+    # Flush buffer
+    if len(buffer) > 0:
+        print("\n".join(translator.translate(buffer)))
+
+    if args.path_to_input is not None:
+        infile.close()
+
+
+if __name__ == "__main__":
+    main()