-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathdata_text.py
130 lines (109 loc) · 4.96 KB
/
data_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Text preprocessing functions.
"""
import re
from functools import partial
from typing import Callable, List, Optional
from nntrainer.typext import ConstantHolder
# Matches any run of whitespace (spaces, tabs, newlines); used below to collapse it to a single space.
RE_WHITESPACES = re.compile(r"\s+")
class TextPreprocessing(ConstantHolder):
    """
    Enum for text preprocessing functions.

    Each constant is a string descriptor that get_text_preprocessor resolves
    to a configured preprocessing function.
    """
    BERT_NEW = "bert_new"  # [CLS]/[SEP] tokens, ending dots removed, capitalization applied
    BERT_PAPER = "bert_paper"  # [CLS]/[SEP] tokens, ending dots removed, no capitalization
    GPT2 = "gpt2"  # no space inserted before special tokens
    SIMPLE = "simple"  # default preprocessing (capitalize, ensure ending dots)
    NOTHING = "nothing"  # like SIMPLE but without capitalization
    WITH_DOTS = "with_dots"  # ending dot removed, no capitalization, no special tokens
def get_text_preprocessor(func: str) -> Callable[[List[str]], List[str]]:
    """
    Given a string descriptor of the function, return the requested preprocessing function.

    Args:
        func: Function name, one of the constants in TextPreprocessing.

    Returns:
        Text preprocessing function mapping a paragraph (list of sentence
        strings) to the list of preprocessed sentences.
        Note: the previous annotation Callable[[str], str] was wrong —
        preprocess_paragraph takes and returns a list of strings.

    Raises:
        NotImplementedError: If the descriptor is unknown.
    """
    if func == TextPreprocessing.BERT_PAPER:
        # original implementation without dots and capitalization
        return partial(preprocess_paragraph, begin_paragraph_token="[CLS]", end_sentence_token="[SEP]",
                       remove_ending_dot=True, replace_inside_dots=True, capitalize=False)
    if func == TextPreprocessing.BERT_NEW:
        # new BERT implementation, no dots, with casing (capitalize defaults to True)
        return partial(preprocess_paragraph, begin_paragraph_token="[CLS]", end_sentence_token="[SEP]",
                       remove_ending_dot=True, replace_inside_dots=True)
    if func == TextPreprocessing.GPT2:
        # no space before tokens — presumably because GPT-2's BPE encodes leading spaces itself; confirm
        return partial(preprocess_paragraph, add_space_before_token=False)
    if func == TextPreprocessing.SIMPLE:
        return preprocess_paragraph
    if func == TextPreprocessing.NOTHING:
        return partial(preprocess_paragraph, capitalize=False)
    if func == TextPreprocessing.WITH_DOTS:
        return partial(preprocess_paragraph, remove_ending_dot=True, replace_inside_dots=True, capitalize=False)
    raise NotImplementedError(f"Text Processing '{func}' unknown")
def preprocess_paragraph(
        paragraph: List[str], begin_sentence_token: Optional[str] = None, end_sentence_token: Optional[str] = None,
        begin_paragraph_token: Optional[str] = None, end_paragraph_token: Optional[str] = None,
        add_space_before_token: bool = True,
        remove_ending_dot: bool = False, replace_inside_dots: bool = False, capitalize: bool = True) -> List[str]:
    """
    Preprocess a paragraph (list of sentence strings) into a list of cleaned sentences.

    Per sentence: whitespace is collapsed, ending dots are removed or enforced,
    capitalization is optionally applied (including after the first inner ". "),
    inner sentence boundaries can be replaced by separator tokens, and
    begin/end tokens for sentences and the paragraph are attached.

    Args:
        paragraph: List of sentence strings; each must be non-empty after stripping.
        begin_sentence_token: Token prepended to every sentence, if given.
        end_sentence_token: Token appended to every sentence, if given.
        begin_paragraph_token: Token prepended to the first sentence, if given.
        end_paragraph_token: Token appended to the last sentence, if given.
        add_space_before_token: Put a space before appended end tokens.
        remove_ending_dot: Strip a single trailing dot (ellipses are kept);
            otherwise a trailing dot is enforced.
        replace_inside_dots: Replace inner ". " boundaries with the
            begin/end sentence tokens (no-op when no such tokens are given).
        capitalize: Capitalize sentence starts (also lowercases the rest,
            via str.capitalize — behavior kept from the original).

    Returns:
        List of preprocessed sentence strings, one per input sentence.
        Note: the previous -> str annotation was wrong; the function has
        always returned the list.

    Raises:
        ValueError: If a sentence is empty after whitespace normalization.
    """
    new_paragraph: List[str] = []
    space_before_token = " " if add_space_before_token else ""

    # Token string inserted where a dot separated two sentences inside one string,
    # e.g. "[SEP] [CLS]" for BERT-style processing. None disables the replacement.
    between_sentence_token = None
    if end_sentence_token is not None or begin_sentence_token is not None:
        between_sentence_token = (f"{'' if end_sentence_token is None else f'{end_sentence_token} '}"
                                  f"{'' if begin_sentence_token is None else f'{begin_sentence_token}'}")

    for num_sentence, sentence in enumerate(paragraph):
        # collapse whitespace runs to single spaces and strip the ends
        # (re caches the compiled pattern, so this is not recompiled per call)
        sentence = re.sub(r"\s+", " ", sentence).strip()
        if not sentence:
            # raise instead of assert so the check survives `python -O`
            raise ValueError(f"Empty sentence at position {num_sentence} in paragraph.")

        if remove_ending_dot:
            # remove a single trailing dot, but keep multiple dots (ellipsis) intact
            if sentence[-1] == "." and len(sentence) > 1 and sentence[-2] != ".":
                sentence = sentence[:-1]
        else:
            # make sure the sentence ends with a dot
            if sentence[-1] != ".":
                sentence += "."

        if capitalize:
            sentence = sentence.capitalize()
        sentence = sentence.strip()

        # dots inside sentences can happen in some datasets
        if capitalize:
            # capitalize the first alphanumeric character after the first inner ". ";
            # bound check BEFORE indexing fixes an IndexError in the original when
            # only non-alphanumeric characters followed the inner dot (e.g. "a. ...")
            find_pos = sentence.find(". ")
            if find_pos > -1:
                find_pos += 1
                while find_pos < len(sentence):
                    if sentence[find_pos].isalnum():
                        # str.capitalize also lowercases the remainder, as before
                        sentence = sentence[:find_pos] + sentence[find_pos:].capitalize()
                        break
                    find_pos += 1
        if replace_inside_dots and between_sentence_token is not None:
            # replace inner sentence boundaries with the separator tokens,
            # keeping the dot unless ending dots are being removed
            sentence = sentence.replace(". ", f"{'' if remove_ending_dot else '.'} {between_sentence_token} ")

        # rebuild the sentence word by word, dropping empty fragments
        new_words = []
        if begin_paragraph_token is not None and num_sentence == 0:
            new_words.append(begin_paragraph_token)
        if begin_sentence_token is not None:
            new_words.append(begin_sentence_token)
        for word in sentence.split(" "):
            word = word.strip()
            if word == "":
                continue
            new_words.append(f" {word}")
        if end_sentence_token is not None:
            new_words.append(f"{space_before_token}{end_sentence_token}")
        if end_paragraph_token is not None and num_sentence == len(paragraph) - 1:
            new_words.append(f"{space_before_token}{end_paragraph_token}")
        new_paragraph.append("".join(new_words).strip())
    return new_paragraph