diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index f9ea863..6fe8bf2 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -4,7 +4,7 @@ on: [ push ] concurrency: group: build -# cancel-in-progress: true + cancel-in-progress: true jobs: @@ -12,13 +12,13 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ 3.8, 3.9 ] + python-version: [ '3.10', '3.11' ] steps: - name: Cleanup more disk space run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: 'pip' diff --git a/LICENSE.txt b/LICENSE.txt index 17ad6fb..1955449 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -176,7 +176,7 @@ END OF TERMS AND CONDITIONS - Copyright [2018-2023] the DeLFT contributors + Copyright [2018-2025] the DeLFT contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/Readme.md b/Readme.md index b852958..a0d266b 100644 --- a/Readme.md +++ b/Readme.md @@ -64,7 +64,7 @@ cd delft It is advised to setup first a virtual environment to avoid falling into one of these gloomy python dependency marshlands: ```sh -virtualenv --system-site-packages -p python3.8 env +virtualenv --system-site-packages -p python3.10 env source env/bin/activate ``` @@ -99,7 +99,7 @@ If you want to this work, please refer to the present GitHub project, together w title = {DeLFT}, howpublished = {\url{https://github.com/kermitt2/delft}}, publisher = {GitHub}, - year = {2018--2024}, + year = {2018--2025}, archivePrefix = {swh}, eprint = {1:dir:54eb292e1c0af764e27dd179596f64679e44d06e} } diff --git a/classifiers.txt b/classifiers.txt index c92b88c..8a49ca2 100644 --- a/classifiers.txt +++ b/classifiers.txt @@ -1,3 +1,3 @@ -Programming Language :: Python :: 3.8 +Programming Language :: Python :: 3.10 License :: OSI Approved :: Apache Software License Operating System :: OS Independent \ No newline at end of file diff --git a/delft/sequenceLabelling/preprocess.py b/delft/sequenceLabelling/preprocess.py index 2f33e52..cd60dd6 100644 --- a/delft/sequenceLabelling/preprocess.py +++ b/delft/sequenceLabelling/preprocess.py @@ -321,8 +321,14 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_ chars_tokens.append(self.empty_char_vector) # sub-tokenization - encoded_result = self.tokenizer(text_tokens, add_special_tokens=True, is_split_into_words=True, - max_length=max_seq_length, truncation=True, return_offsets_mapping=True) + encoded_result = self.tokenizer( + text_tokens, + add_special_tokens=True, + is_split_into_words=True, + max_length=max_seq_length, + truncation=True, + return_offsets_mapping=True + ) input_ids = encoded_result.input_ids offsets = encoded_result.offset_mapping diff --git a/delft/sequenceLabelling/trainer.py b/delft/sequenceLabelling/trainer.py index 6c44455..08417f8 100644 --- a/delft/sequenceLabelling/trainer.py +++ b/delft/sequenceLabelling/trainer.py @@ -168,8 +168,8 @@ def train_model(self, local_model, x_train, y_train, f_train=None, # multiple workers should work with transformer layers, but not with ELMo due to GPU memory limit (with GTX 1080Ti 11GB) if self.model_config.transformer_name is not None or (self.embeddings and self.embeddings.use_ELMo): - # worker at 0 means the training will be executed in the main thread - nb_workers = 0 + # worker at 1 means the training will be executed in the main thread + nb_workers = 1 multiprocessing = False local_model.fit(training_generator, diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index f2b2d80..d1e652a 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -2,6 +2,11 @@ from packaging import version +# for using legacy Keras 2, and not Keras 3 installed by default from TensorFlow 2.16 +os.environ["TF_USE_LEGACY_KERAS"] = "1" +os.environ["KERAS_BACKEND"] = "tensorflow" +import tf_keras as keras + # ask tensorflow to be quiet and not print hundred lines of logs from delft.utilities.Transformer import TRANSFORMER_CONFIG_FILE_NAME, DEFAULT_TRANSFORMER_TOKENIZER_DIR from delft.utilities.misc import print_parameters diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index cc8127e..0c25a69 100644 --- a/delft/textClassification/wrapper.py +++ b/delft/textClassification/wrapper.py @@ -1,5 +1,12 @@ import os +from packaging import version + +# for using legacy Keras 2, and not Keras 3 installed by default from TensorFlow 2.16 +os.environ["TF_USE_LEGACY_KERAS"] = "1" +os.environ["KERAS_BACKEND"] = "tensorflow" +import tf_keras as keras + from delft.sequenceLabelling.trainer import LogLearningRateCallback # ask tensorflow to be quiet and not print hundred lines of logs from delft.utilities.misc import print_parameters diff --git a/delft/utilities/Transformer.py b/delft/utilities/Transformer.py index 24e3a74..4b58f05 100644 --- a/delft/utilities/Transformer.py +++ b/delft/utilities/Transformer.py @@ -1,6 +1,8 @@ import os from typing import Union, Iterable +os.environ["KERAS_BACKEND"] = "tensorflow" + from transformers import AutoTokenizer, TFAutoModel, AutoConfig, BertTokenizer, TFBertModel TRANSFORMER_CONFIG_FILE_NAME = 'transformer-config.json' @@ -126,35 +128,30 @@ def init_preprocessor(self, max_sequence_length: int, do_lower_case = False if do_lower_case is not None: - if self.auth_token != None: + if self.auth_token is not None: self.tokenizer = AutoTokenizer.from_pretrained(self.name, - add_special_tokens=add_special_tokens, max_length=max_sequence_length, add_prefix_space=add_prefix_space, do_lower_case=do_lower_case, use_auth_token=self.auth_token) else: self.tokenizer = AutoTokenizer.from_pretrained(self.name, - add_special_tokens=add_special_tokens, max_length=max_sequence_length, add_prefix_space=add_prefix_space, do_lower_case=do_lower_case) else: if self.auth_token != None: self.tokenizer = AutoTokenizer.from_pretrained(self.name, - add_special_tokens=add_special_tokens, max_length=max_sequence_length, add_prefix_space=add_prefix_space, use_auth_token=self.auth_token) else: self.tokenizer = AutoTokenizer.from_pretrained(self.name, - add_special_tokens=add_special_tokens, max_length=max_sequence_length, add_prefix_space=add_prefix_space) elif self.loading_method == LOADING_METHOD_LOCAL_MODEL_DIR: self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir_path, - add_special_tokens=add_special_tokens, max_length=max_sequence_length, add_prefix_space=add_prefix_space) elif self.loading_method == LOADING_METHOD_PLAIN_MODEL: diff --git a/pyproject.toml b/pyproject.toml index ddaf17f..da1289d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ maintainers = [ { name = "Patrice Lopez", email = "patrice.lopez@science-miner.com" }, { name = "Luca Foppiano", email = "lucanoro@duck.com" } ] -requires-python = ">=3.7" +requires-python = ">=3.10" dynamic = ['version', "dependencies"] diff --git a/requirements.macos.txt b/requirements.macos.txt new file mode 100644 index 0000000..3b5568f --- /dev/null +++ b/requirements.macos.txt @@ -0,0 +1,18 @@ +numpy==1.23.5 +regex==2021.11.10 +scikit-learn==1.1 +tqdm==4.62.3 +tensorflow==2.17.1 +tf_keras==2.17.0 +h5py==3.10.0 +unidecode==1.3.2 +pydot==1.4.0 +lmdb==1.2.1 +truecase +requests>=2.20 +pandas==1.3.5 +transformers==4.46.1 +pytest +#tensorflow-addons==0.19.0 +tfa-nightly +accelerate>=0.20.3 diff --git a/requirements.txt b/requirements.txt index c45c129..30ad744 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,23 @@ -numpy==1.22.3 -regex==2021.11.10 -scikit-learn==1.0.1 +numpy==1.26.4 +regex +scikit-learn==1.6.1 tqdm==4.62.3 -tensorflow==2.9.3 -h5py==3.6.0 +#tensorflow==2.16.1 +tensorflow[and-cuda]==2.17.1 +tf_keras==2.17.0 +h5py==3.11.0 unidecode==1.3.2 pydot==1.4.0 -lmdb==1.2.1 +lmdb truecase requests>=2.20 pandas==1.3.5 transformers==4.33.2 -torch==1.10.1 +#transformers==4.40.0 +torch pytest -tensorflow-addons==0.19.0 +#tensorflow-addons==0.19.0 +tfa-nightly==0.23.0.dev20240415222534 blingfire==0.1.8 accelerate>=0.20.3 +Pillow diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6c6c33c --- /dev/null +++ b/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup, find_packages + +setup( + name="delft", + version="0.3.4", + author="Patrice Lopez", + author_email="patrice.lopez@science-miner.com", + description="a Deep Learning Framework for Text", + long_description=open("Readme.md", encoding='utf-8').read(), + long_description_content_type="text/markdown", + url="https://github.com/kermitt2/delft", + packages=find_packages(exclude=['test', '*.test', '*.test.*']), + include_package_data=True, + python_requires='>=3.10', + classifiers=[ + "Programming Language :: Python :: 3.10", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ], +)