fix conflicts, add some setup consistency

kermitt2 · Feb 28, 2025 · 01c9a4f · 01c9a4f
2 parents dc6391f + 6d12f76
commit 01c9a4f
Show file tree

Hide file tree

Showing 13 changed files with 105 additions and 193,074 deletions.
diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml
@@ -1,33 +1,38 @@
 name: Build
 
-on: [push]
+on: [ push ]
 
-concurrency: 
+concurrency:
   group: build
-#  cancel-in-progress: true
+  cancel-in-progress: true
 
 
 jobs:
   build:
     runs-on: ubuntu-latest
-
+    strategy:
+      matrix:
+        python-version: [ '3.9', '3.10', '3.11' ]
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v2
-      with:
-        python-version: "3.8"
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
-      run: |
-        pytest
+      - name: Cleanup more disk space
+        run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 pytest
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      - name: Test with pytest
+        run: |
+          pytest
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -176,7 +176,7 @@
 
    END OF TERMS AND CONDITIONS
 
-   Copyright [2018-2023] the DeLFT contributors
+   Copyright [2018-2025] the DeLFT contributors
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

diff --git a/Readme.md b/Readme.md
@@ -1,10 +1,11 @@
 <img align="right" width="150" height="150" src="doc/cat-delft-small.jpg">
 
 [![Documentation Status](https://readthedocs.org/projects/delft/badge/?version=latest)](https://readthedocs.org/projects/delft/?badge=latest)
-[![Build Status](https://travis-ci.org/kermitt2/delft.svg?branch=master)](https://travis-ci.org/kermitt2/delft)
+[![Build](https://github.com/kermitt2/delft/actions/workflows/ci-build-unstable.yml/badge.svg)](https://github.com/kermitt2/delft/actions/workflows/ci-build-unstable.yml)
 [![PyPI version](https://badge.fury.io/py/delft.svg)](https://badge.fury.io/py/delft)
 [![SWH](https://archive.softwareheritage.org/badge/origin/https://github.com/kermitt2/delft/)](https://archive.softwareheritage.org/browse/origin/https://github.com/kermitt2/delft/)
 [![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html)
+[![Downloads](https://static.pepy.tech/badge/delft)](https://pepy.tech/project/delft)
 
 
 # DeLFT
@@ -63,7 +64,7 @@ cd delft
 It is advised to first set up a virtual environment to avoid falling into one of these gloomy Python dependency marshlands:
 
 ```sh
-virtualenv --system-site-packages -p python3.8 env
+virtualenv --system-site-packages -p python3.10 env
 source env/bin/activate
 ```
 
@@ -98,7 +99,7 @@ If you want to cite this work, please refer to the present GitHub project, together w
     title = {DeLFT},
     howpublished = {\url{https://github.com/kermitt2/delft}},
     publisher = {GitHub},
-    year = {2018--2024},
+    year = {2018--2025},
     archivePrefix = {swh},
     eprint = {1:dir:54eb292e1c0af764e27dd179596f64679e44d06e}
 }

diff --git a/classifiers.txt b/classifiers.txt
@@ -0,0 +1,3 @@
+Programming Language :: Python :: 3.10
+License :: OSI Approved :: Apache Software License
+Operating System :: OS Independent
diff --git a/data/sequenceLabelling/grobid/quantities/Readme.md b/data/sequenceLabelling/grobid/quantities/Readme.md
@@ -0,0 +1 @@
+The training data for the Deep Learning models used in Grobid-quantities can be found on the Grobid-quantities repository: https://github.com/lfoppiano/grobid-quantities.
diff --git a/data/sequenceLabelling/grobid/quantities/quantities.all.train b/data/sequenceLabelling/grobid/quantities/quantities.all.train
diff --git a/delft/sequenceLabelling/preprocess.py b/delft/sequenceLabelling/preprocess.py
@@ -321,8 +321,14 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_
                 chars_tokens.append(self.empty_char_vector)
 
         # sub-tokenization
-        encoded_result = self.tokenizer(text_tokens, add_special_tokens=True, is_split_into_words=True,
-            max_length=max_seq_length, truncation=True, return_offsets_mapping=True)
+        encoded_result = self.tokenizer(
+            text_tokens,
+            add_special_tokens=True,
+            is_split_into_words=True,
+            max_length=max_seq_length,
+            truncation=True,
+            return_offsets_mapping=True
+        )
 
         input_ids = encoded_result.input_ids
         offsets = encoded_result.offset_mapping

diff --git a/delft/sequenceLabelling/trainer.py b/delft/sequenceLabelling/trainer.py
@@ -168,8 +168,8 @@ def train_model(self, local_model, x_train, y_train, f_train=None,
 
         # multiple workers should work with transformer layers, but not with ELMo due to GPU memory limit (with GTX 1080Ti 11GB)
         if self.model_config.transformer_name is not None or (self.embeddings and self.embeddings.use_ELMo):
-            # worker at 0 means the training will be executed in the main thread
-            nb_workers = 0 
+            # use a single worker with multiprocessing disabled (NOTE: in Keras only workers=0 runs the generator in the main thread)
+            nb_workers = 1
             multiprocessing = False
 
         local_model.fit(training_generator,

diff --git a/delft/utilities/Transformer.py b/delft/utilities/Transformer.py
@@ -128,35 +128,30 @@ def init_preprocessor(self, max_sequence_length: int,
                 do_lower_case = False
 
             if do_lower_case is not None:
-                if self.auth_token != None:
+                if self.auth_token is not None:
                     self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                                add_special_tokens=add_special_tokens,
                                                                 max_length=max_sequence_length,
                                                                 add_prefix_space=add_prefix_space, 
                                                                 do_lower_case=do_lower_case, 
                                                                 use_auth_token=self.auth_token)
                 else:
                     self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                                add_special_tokens=add_special_tokens,
                                                                 max_length=max_sequence_length,
                                                                 add_prefix_space=add_prefix_space, 
                                                                 do_lower_case=do_lower_case)
             else:
                 if self.auth_token != None:
                     self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                                add_special_tokens=add_special_tokens,
                                                                 max_length=max_sequence_length,
                                                                 add_prefix_space=add_prefix_space, 
                                                                 use_auth_token=self.auth_token)
                 else:
                     self.tokenizer = AutoTokenizer.from_pretrained(self.name,
-                                                                add_special_tokens=add_special_tokens,
                                                                 max_length=max_sequence_length,
                                                                 add_prefix_space=add_prefix_space)
 
         elif self.loading_method == LOADING_METHOD_LOCAL_MODEL_DIR:
             self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir_path,
-                                                           add_special_tokens=add_special_tokens,
                                                            max_length=max_sequence_length,
                                                            add_prefix_space=add_prefix_space)
         elif self.loading_method == LOADING_METHOD_PLAIN_MODEL:

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,36 @@
+[build-system]
+requires = ["setuptools>=42", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.bumpversion]
+current_version = "0.3.4"
+commit = "true"
+tag = "true"
+tag_name = "v{new_version}"
+
+[project]
+name = "delft"
+description = "a Deep Learning Framework for Text"
+readme = "Readme.md"
+authors = [
+    { name = "Patrice Lopez", email = "[email protected]" }
+]
+maintainers = [
+    { name = "Patrice Lopez", email = "[email protected]" },
+    { name = "Luca Foppiano", email = "[email protected]" }
+]
+requires-python = ">=3.10"
+
+dynamic = ['version', "dependencies"]
+
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+classifiers = {file = ["classifiers.txt"]}
+
+[project.urls]
+Homepage = "https://github.com/kermitt2/delft"
+Repository = "https://github.com/kermitt2/delft"
+Changelog = "https://github.com/kermitt2/delft"
+
+[tool.setuptools.packages.find]
+exclude = ["test", "*.test", "*.test.*"]
diff --git a/requirements.macos.txt b/requirements.macos.txt
@@ -0,0 +1,18 @@
+numpy==1.23.5
+regex==2021.11.10 
+scikit-learn==1.1
+tqdm==4.62.3
+tensorflow==2.17.1
+tf_keras==2.17.0
+h5py==3.10.0
+unidecode==1.3.2
+pydot==1.4.0
+lmdb==1.2.1
+truecase
+requests>=2.20
+pandas==1.3.5
+transformers==4.46.1
+pytest
+#tensorflow-addons==0.19.0
+tfa-nightly
+accelerate>=0.20.3
diff --git a/requirements.txt.dev b/requirements.txt.dev
@@ -0,0 +1 @@
+bump-my-version
diff --git a/setup.py b/setup.py
@@ -11,9 +11,9 @@
     url="https://github.com/kermitt2/delft",
     packages=find_packages(exclude=['test', '*.test', '*.test.*']),  
     include_package_data=True,
-    python_requires='>=3.7',
+    python_requires='>=3.10',
     classifiers=[
-        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.10",
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: OS Independent",
     ],
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		The training data for the Deep Learning models used in Grobid-quantities can be found on the Grobid-quantities repository: https://github.com/lfoppiano/grobid-quantities.