release 0.2.0, now depending on text-utils>=0.2.1

ad-freiburg · Nov 2, 2023 · 287f34e · 287f34e
1 parent cf62a05
commit 287f34e
Show file tree

Hide file tree

Showing 16 changed files with 144 additions and 148 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "third_party/text-correction-utils"]
-	path = third_party/text-correction-utils
-	url = https://github.com/bastiscode/text-correction-utils
+[submodule "third_party/text-utils"]
+	path = third_party/text-utils
+	url = https://github.com/ad-freiburg/text-utils
diff --git a/README.md b/README.md
@@ -80,7 +80,7 @@ echo "splitthissentenceforme" | wsc
 cat "path/to/input/file.txt" | wsc > output.txt
 
 # correct a string passed directly on the command line
-wsc -c "splitthissentenceforme"
+wsc -p "splitthissentenceforme"
 
 # correct a text file line by line and print the corrected lines
 wsc -f path/to/input/file.txt

diff --git a/configs/eo_lstm_char.yaml b/configs/eo_lstm_char.yaml
@@ -26,8 +26,7 @@ model:
     num_classes: 3
 
 train:
-  mixed_precision: true
-  mixed_precision_dtype: env(MIXED_PRECISION_DTYPE:fp16)
+  precision: fp16
   clip_grad_norm: env(CLIP_GRAD_NORM:1.0)
   num_epochs: env(NUM_EPOCHS:3)
   eval_interval: eval(1 / env(EVAL_PER_EPOCH:10))
@@ -47,6 +46,7 @@ train:
     strategy: weighted
     shuffle: true
     sort: true
+    max_length: env(MAX_LENGTH:512)
     buffer_size: env(BATCH_LIMIT:32)
     prefetch_factor: env(PREFETCH_FACTOR:2048)
     num_threads: eval(env(THREADS:None) or len(os.sched_getaffinity(0)) // 2)

diff --git a/configs/eo_transformer_byt5.yaml b/configs/eo_transformer_byt5.yaml
@@ -24,8 +24,7 @@ model:
     num_classes: 3
 
 train:
-  mixed_precision: env(MIXED_PRECISION:true)
-  mixed_precision_dtype: env(MIXED_PRECISION_DTYPE:bfp16)
+  precision: env(PRECISION:fp16)
   clip_grad_norm: env(CLIP_GRAD_NORM:1.0)
   num_epochs: env(NUM_EPOCHS:1)
   eval_interval: eval(1 / env(EVAL_PER_EPOCH:10))
@@ -50,6 +49,7 @@ train:
     strategy: weighted
     shuffle: true
     sort: true
+    max_length: env(MAX_LENGTH:512)
     buffer_size: env(BATCH_LIMIT:32)
     prefetch_factor: env(PREFETCH_FACTOR:2048)
     num_threads: eval(env(THREADS:None) or len(os.sched_getaffinity(0)) // 2)

diff --git a/configs/eo_transformer_byt5_scratch.yaml b/configs/eo_transformer_byt5_scratch.yaml
@@ -25,8 +25,7 @@ model:
     num_classes: 3
 
 train:
-  mixed_precision: env(MIXED_PRECISION:true)
-  mixed_precision_dtype: env(MIXED_PRECISION_DTYPE:bfp16)
+  precision: env(PRECISION:fp16)
   clip_grad_norm: env(CLIP_GRAD_NORM:1.0)
   num_epochs: env(NUM_EPOCHS:1)
   eval_interval: eval(1 / env(EVAL_PER_EPOCH:10))
@@ -47,6 +46,7 @@ train:
     strategy: weighted
     shuffle: true
     sort: true
+    max_length: env(MAX_LENGTH:512)
     buffer_size: env(BATCH_LIMIT:32)
     prefetch_factor: env(PREFETCH_FACTOR:2048)
     num_threads: eval(env(THREADS:None) or len(os.sched_getaffinity(0)) // 2)

diff --git a/configs/eo_transformer_byte_v1_like.yaml b/configs/eo_transformer_byte_v1_like.yaml
@@ -35,8 +35,7 @@ model:
     num_classes: 3
 
 train:
-  mixed_precision: env(MIXED_PRECISION:true)
-  mixed_precision_dtype: env(MIXED_PRECISION_DTYPE:fp16)
+  precision: env(PRECISION:fp16)
   clip_grad_norm: env(CLIP_GRAD_NORM:1.0)
   num_epochs: env(NUM_EPOCHS:3)
   eval_interval: eval(1 / env(EVAL_PER_EPOCH:10))
@@ -56,6 +55,7 @@ train:
     strategy: weighted
     shuffle: true
     sort: true
+    max_length: env(MAX_LENGTH:512)
     buffer_size: env(BATCH_LIMIT:32)
     prefetch_factor: env(PREFETCH_FACTOR:2048)
     num_threads: eval(env(THREADS:None) or len(os.sched_getaffinity(0)) // 2)

diff --git a/configs/eo_transformer_char_v1_like.yaml b/configs/eo_transformer_char_v1_like.yaml
@@ -29,8 +29,7 @@ model:
     num_classes: 3
 
 train:
-  mixed_precision: true
-  mixed_precision_dtype: env(MIXED_PRECISION_DTYPE:fp16)
+  precision: env(PRECISION:fp16)
   clip_grad_norm: env(CLIP_GRAD_NORM:1.0)
   num_epochs: env(NUM_EPOCHS:3)
   eval_interval: eval(1 / env(EVAL_PER_EPOCH:10))
@@ -50,6 +49,7 @@ train:
     strategy: weighted
     shuffle: true
     sort: true
+    max_length: env(MAX_LENGTH:512)
     buffer_size: env(BATCH_LIMIT:32)
     prefetch_factor: env(PREFETCH_FACTOR:2048)
     num_threads: eval(env(THREADS:None) or len(os.sched_getaffinity(0)) // 2)

diff --git a/configs/server.yaml b/configs/server.yaml
@@ -1,14 +1,13 @@
 port: 40000
 timeout: 10
-# precision: fp16
 # allow_origin: test.mydomain.com 
 base_url: env(BASE_URL:/api)
 models:
   # load a pretrained model by specifying the name
-  # - eo_large_arxiv
+  # - name: eo_large_arxiv
   # load a model from a local experiment by specifying the
   # directory path (you can use special configuration operators,
   # e.g. env(ENV_VAR) to load env variables)
-  - env(EXPERIMENT)
+  - path: env(EXPERIMENT)
 batch_size: env(BATCH_SIZE:16)
 # batch_max_tokens: env(BATCH_MAX_TOKENS:8192)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "whitespace_correction"
-version = "0.1.5"
+version = "0.2.0"
 description = "Correct missing or spurious whitespaces in text."
 authors = [
     { name = "Sebastian Walter", email = "[email protected]" }
@@ -20,7 +20,7 @@ classifiers = [
 ]
 
 dependencies = [
-    "text-correction-utils==0.1.4",
+    "dtpu>=0.2.1",
     "transformers>=4.26.0"
 ]
 

diff --git a/scripts/generate_visualizations.py b/scripts/generate_visualizations.py
@@ -6,7 +6,7 @@
 
 from whitespace_correction.api import WhitespaceCorrector
 
-from text_correction_utils import hook, logging
+from text_utils import hook, logging
 
 from torch import nn
 import numpy as np

diff --git a/src/whitespace_correction/api/cli.py b/src/whitespace_correction/api/cli.py
@@ -1,27 +1,27 @@
 from io import TextIOWrapper
 from typing import Iterator, Optional, Union
 
-from text_correction_utils.api.cli import TextCorrectionCli
-from text_correction_utils import data
+from text_utils.api.cli import TextProcessingCli
+from text_utils import data
 
 from whitespace_correction import version
 from whitespace_correction.api.corrector import WhitespaceCorrector
 from whitespace_correction.api.server import WhitespaceCorrectionServer
 
 
-class WhitespaceCorrectionCli(TextCorrectionCli):
-    text_corrector_cls = WhitespaceCorrector
-    text_correction_server_cls = WhitespaceCorrectionServer
+class WhitespaceCorrectionCli(TextProcessingCli):
+    text_processor_cls = WhitespaceCorrector
+    text_processing_server_cls = WhitespaceCorrectionServer
 
     def version(self) -> str:
         return version.__version__
 
-    def correct_iter(
+    def process_iter(
         self,
-        corrector: WhitespaceCorrector,
+        processor: WhitespaceCorrector,
         iter: Iterator[data.InferenceData]
     ) -> Iterator[data.InferenceData]:
-        yield from corrector.correct_iter(
+        yield from processor.correct_iter(
             ((data.text, data.language) for data in iter),
             self.args.batch_size,
             self.args.batch_max_tokens,
@@ -31,14 +31,14 @@ def correct_iter(
             show_progress=self.args.progress
         )
 
-    def correct_file(
+    def process_file(
         self,
-        corrector: WhitespaceCorrector,
+        processor: WhitespaceCorrector,
         path: str,
         lang: Optional[str],
         out_file: Union[str, TextIOWrapper]
     ):
-        corrector.correct_file(
+        processor.correct_file(
             path,
             self.args.input_format,
             out_file,