kermitt2 · kermitt2 · Jul 1, 2024 · Jan 8, 2025 · Jan 8, 2025 · Jan 8, 2025
diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml
@@ -4,21 +4,24 @@ on: [ push ]
 
 concurrency:
-  group: build
-#  cancel-in-progress: true
+  # one concurrency group per ref: with cancel-in-progress enabled, a single
+  # shared group would let a push on one branch cancel another branch's build
+  group: build-${{ github.ref }}
+  cancel-in-progress: true
 
 
 jobs:
   build:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.8, 3.9 ]
+        # supported interpreters only (pyproject.toml declares requires-python >=3.10)
+        python-version: [ '3.10', '3.11' ]
     steps:
       - name: Cleanup more disk space
         run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
       - uses: actions/checkout@v2
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'

diff --git a/LICENSE.txt b/LICENSE.txt
@@ -176,7 +176,7 @@
 
    END OF TERMS AND CONDITIONS
 
-   Copyright [2018-2023] the DeLFT contributors
+   Copyright [2018-2025] the DeLFT contributors
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

diff --git a/Readme.md b/Readme.md
@@ -64,7 +64,7 @@ cd delft
 It is advised to setup first a virtual environment to avoid falling into one of these gloomy python dependency marshlands:
 
 ```sh
-virtualenv --system-site-packages -p python3.8 env
+virtualenv --system-site-packages -p python3.10 env
 source env/bin/activate
 ```
 
@@ -99,7 +99,7 @@ If you want to this work, please refer to the present GitHub project, together w
     title = {DeLFT},
     howpublished = {\url{https://github.com/kermitt2/delft}},
     publisher = {GitHub},
-    year = {2018--2024},
+    year = {2018--2025},
     archivePrefix = {swh},
     eprint = {1:dir:54eb292e1c0af764e27dd179596f64679e44d06e}
 }

diff --git a/classifiers.txt b/classifiers.txt
@@ -1,3 +1,3 @@
-Programming Language :: Python :: 3.8
+Programming Language :: Python :: 3.10
 License :: OSI Approved :: Apache Software License
 Operating System :: OS Independent
diff --git a/delft/sequenceLabelling/preprocess.py b/delft/sequenceLabelling/preprocess.py
@@ -321,8 +321,14 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_
                 chars_tokens.append(self.empty_char_vector)
 
         # sub-tokenization
-        encoded_result = self.tokenizer(text_tokens, add_special_tokens=True, is_split_into_words=True,
-            max_length=max_seq_length, truncation=True, return_offsets_mapping=True)
+        encoded_result = self.tokenizer(
+            text_tokens,
+            add_special_tokens=True,
+            is_split_into_words=True,
+            max_length=max_seq_length,
+            truncation=True,
+            return_offsets_mapping=True
+        )
 
         input_ids = encoded_result.input_ids
         offsets = encoded_result.offset_mapping

diff --git a/delft/sequenceLabelling/trainer.py b/delft/sequenceLabelling/trainer.py
@@ -168,8 +168,8 @@ def train_model(self, local_model, x_train, y_train, f_train=None,
 
         # multiple workers should work with transformer layers, but not with ELMo due to GPU memory limit (with GTX 1080Ti 11GB)
         if self.model_config.transformer_name is not None or (self.embeddings and self.embeddings.use_ELMo):
-            # worker at 0 means the training will be executed in the main thread
-            nb_workers = 0 
+            # worker at 1 means the training will be executed in the main thread
+            nb_workers = 1
             multiprocessing = False
 
         local_model.fit(training_generator,

diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py
@@ -2,6 +2,11 @@
 
 from packaging import version
 
+# use legacy Keras 2 (tf_keras) rather than Keras 3, the default since TensorFlow 2.16; must be set before TensorFlow is imported
+os.environ["TF_USE_LEGACY_KERAS"] = "1"
+os.environ["KERAS_BACKEND"] = "tensorflow"
+import tf_keras as keras
+
 # ask tensorflow to be quiet and not print hundred lines of logs
 from delft.utilities.Transformer import TRANSFORMER_CONFIG_FILE_NAME, DEFAULT_TRANSFORMER_TOKENIZER_DIR
 from delft.utilities.misc import print_parameters

diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py
@@ -1,5 +1,12 @@
 import os
 
+from packaging import version
+
+# use legacy Keras 2 (tf_keras) rather than Keras 3, the default since TensorFlow 2.16; must be set before TensorFlow is imported
+os.environ["TF_USE_LEGACY_KERAS"] = "1"
+os.environ["KERAS_BACKEND"] = "tensorflow"
+import tf_keras as keras
+
 from delft.sequenceLabelling.trainer import LogLearningRateCallback
 # ask tensorflow to be quiet and not print hundred lines of logs
 from delft.utilities.misc import print_parameters

diff --git a/delft/utilities/Transformer.py b/delft/utilities/Transformer.py
@@ -1,6 +1,8 @@
 import os
 from typing import Union, Iterable
 
+os.environ["KERAS_BACKEND"] = "tensorflow"
+
 from transformers import AutoTokenizer, TFAutoModel, AutoConfig, BertTokenizer, TFBertModel
 
 TRANSFORMER_CONFIG_FILE_NAME = 'transformer-config.json'
@@ -126,35 +128,31 @@ def init_preprocessor(self, max_sequence_length: int,
                 do_lower_case = False
 
+            # add_special_tokens is given when the tokenizer is called to encode text, not to from_pretrained()
             if do_lower_case is not None:
-                if self.auth_token != None:
+                if self.auth_token is not None:
                     self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                                add_special_tokens=add_special_tokens,
                                                                 max_length=max_sequence_length,
                                                                 add_prefix_space=add_prefix_space, 
                                                                 do_lower_case=do_lower_case, 
                                                                 use_auth_token=self.auth_token)
                 else:
                     self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                                add_special_tokens=add_special_tokens,
                                                                 max_length=max_sequence_length,
                                                                 add_prefix_space=add_prefix_space, 
                                                                 do_lower_case=do_lower_case)
             else:
-                if self.auth_token != None:
+                if self.auth_token is not None:
                     self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                                add_special_tokens=add_special_tokens,
                                                                 max_length=max_sequence_length,
                                                                 add_prefix_space=add_prefix_space, 
                                                                 use_auth_token=self.auth_token)
                 else:
                     self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                                add_special_tokens=add_special_tokens,
                                                                 max_length=max_sequence_length,
                                                                 add_prefix_space=add_prefix_space)
 
         elif self.loading_method == LOADING_METHOD_LOCAL_MODEL_DIR:
             self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir_path,
-                                                           add_special_tokens=add_special_tokens,
                                                            max_length=max_sequence_length,
                                                            add_prefix_space=add_prefix_space)
         elif self.loading_method == LOADING_METHOD_PLAIN_MODEL:

diff --git a/pyproject.toml b/pyproject.toml
@@ -19,7 +19,7 @@ maintainers = [
     { name = "Patrice Lopez", email = "[email protected]" },
     { name = "Luca Foppiano", email = "[email protected]" }
 ]
-requires-python = ">=3.7"
+requires-python = ">=3.10"
 
 dynamic = ['version', "dependencies"]
 

diff --git a/requirements.macos.txt b/requirements.macos.txt
@@ -0,0 +1,18 @@
+numpy==1.23.5
+regex==2021.11.10 
+scikit-learn==1.1
+tqdm==4.62.3
+tensorflow==2.17.1
+tf_keras==2.17.0
+h5py==3.10.0
+unidecode==1.3.2
+pydot==1.4.0
+lmdb==1.2.1
+truecase
+requests>=2.20
+pandas==1.3.5
+transformers==4.46.1
+pytest
+#tensorflow-addons==0.19.0
+tfa-nightly
+accelerate>=0.20.3
diff --git a/requirements.txt b/requirements.txt
@@ -1,18 +1,23 @@
-numpy==1.22.3
-regex==2021.11.10 
-scikit-learn==1.0.1
+numpy==1.26.4
+regex
+scikit-learn==1.6.1
 tqdm==4.62.3
-tensorflow==2.9.3
-h5py==3.6.0
+#tensorflow==2.16.1
+tensorflow[and-cuda]==2.17.1
+tf_keras==2.17.0
+h5py==3.11.0
 unidecode==1.3.2
 pydot==1.4.0
-lmdb==1.2.1
+lmdb
 truecase
 requests>=2.20
 pandas==1.3.5
 transformers==4.33.2 
-torch==1.10.1
+#transformers==4.40.0
+torch
 pytest
-tensorflow-addons==0.19.0
+#tensorflow-addons==0.19.0
+tfa-nightly==0.23.0.dev20240415222534
 blingfire==0.1.8
 accelerate>=0.20.3
+Pillow
diff --git a/setup.py b/setup.py
@@ -0,0 +1,28 @@
+"""Packaging script for DeLFT (setuptools)."""
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+# The Readme is the long description. It is resolved next to this script so the
+# build does not depend on the current directory; read_text() closes the file.
+long_description = (Path(__file__).resolve().parent / "Readme.md").read_text(encoding="utf-8")
+
+setup(
+    name="delft",
+    version="0.3.4",
+    author="Patrice Lopez",
+    author_email="[email protected]",
+    description="a Deep Learning Framework for Text",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/kermitt2/delft",
+    # test packages are left out of the distribution
+    packages=find_packages(exclude=['test', '*.test', '*.test.*']),
+    include_package_data=True,
+    python_requires='>=3.10',
+    classifiers=[
+        "Programming Language :: Python :: 3.10",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+    ],
+)