-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUntitled
25 lines (22 loc) · 845 Bytes
/
Untitled
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Sample record used to exercise the cleaning function below.
# NOTE(review): the original statement was not valid Python (a bare name
# followed by a double-quoted string spanning two lines). It is kept here
# verbatim as a single string, presumably a raw `label,"text"` row --
# confirm that the label prefix is meant to be part of the cleaned text.
# The name `input` shadows the builtin but is kept because the call at the
# bottom of the file refers to it.
input = '''fear,"Every time I imagine that someone I love or I could contact a á
serious illness, even death."'''
def clean_str_vn(string):
    """Normalize a raw text string into space-separated, lower-cased tokens.

    Steps, in order:

    1. Replace each of the symbols ~ ` @ # $ % ^ & * - + with a space.
    2. Drop a stand-alone single ASCII letter that directly precedes a
       space-separated period, collapse runs of spaced periods (". . ")
       into one, and replace every whitespace-delimited period with the
       marker "#".
    3. Tighten " : " to ":" and pad ",", "!", "(", ")" and "?" with spaces
       so that they become separate tokens.
    4. Collapse runs of whitespace, strip both ends and lower-case.

    The padded punctuation is emitted without backslashes, and the hyphen
    in step 1 is matched literally rather than forming a character range.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Imported locally so the function is self-contained; no module-level
    # import of `re` is visible in this file.
    import re

    # The hyphen is escaped: an unescaped "*-+" would be read as the range
    # '*'..'+' and leave '-' untouched.
    string = re.sub(r"[~`@#$%^&*\-+]", " ", string)

    # A leading space is prepended so that the whitespace-anchored patterns
    # below can also match at the very start of the text.
    marked = re.sub(r"\s[A-Za-z]\s\.", " .", " " + string)
    while ". . " in marked:
        marked = re.sub(r"\.\s\.\s", ". ", marked)
    string = re.sub(r"\s\.\s", " # ", marked)

    string = re.sub(r" : ", ":", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # Replacement strings are plain text: a backslash here would be copied
    # into the output verbatim.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
# Run the cleaner on the sample record. The only cleaning function defined
# in this file is `clean_str_vn`; the name `clean_string` does not exist.
clean_str_vn(input)