-
Notifications
You must be signed in to change notification settings - Fork 10
/
preprocess_char.py
executable file
·59 lines (43 loc) · 2 KB
/
preprocess_char.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re, os
from tqdm import tqdm
import codecs
import sys
def preprocess_dataset(use_vocab=False, use_lower=False, use_blacklist=False,
                       blacklist_file='blacklist.txt', blacklist_threshold=100):
    """Build the character-level dataset from the raw LaTeX papers.

    Every entry of ``dataset_generation/papers`` is read, decoded as UTF-8
    (undecodable bytes are dropped), stripped of LaTeX comment lines,
    optionally lower-cased, and written under the same name into ``dataset/``.
    The ``dataset`` directory is created when it does not exist yet.

    Args:
        use_vocab: when true, return the per-character occurrence counts
            gathered over all processed papers.
        use_lower: when true, lower-case every line before it is counted
            and written.
        use_blacklist: when true, write every character that occurs fewer
            than ``blacklist_threshold`` times to ``blacklist_file``.
        blacklist_file: path of the blacklist file to write.
        blacklist_threshold: characters counted strictly below this value
            are blacklisted.

    Returns:
        A dict mapping each character to its count when ``use_vocab`` is
        true, otherwise ``None``.
    """
    if not os.path.exists('./dataset'):
        os.mkdir('./dataset')

    vocab = {}
    # Character counts feed both the returned vocabulary and the blacklist.
    count_chars = use_vocab or use_blacklist
    # A LaTeX comment line: optional leading whitespace followed by '%'.
    # Compiled once here instead of once per line.
    comment_line = re.compile(r'^\s*%')

    def add_to_vocab(string):
        """Add every character of *string* to the running counts."""
        for char in string:
            vocab[char] = vocab.get(char, 0) + 1

    def preprocess_paper(raw_text):
        """Return *raw_text* without comment lines, each kept line newline-terminated."""
        pieces = []
        for line in raw_text.split('\n'):
            if comment_line.match(line):
                continue
            # An empty line is kept as an explicit newline. Compare by value:
            # identity checks against a string literal are not reliable.
            if line == '':
                line = '\n'
            if use_lower:
                line = line.lower()
            if count_chars:
                add_to_vocab(line)
            pieces.append(line + '\n')
        return ''.join(pieces)

    def create_blacklist(vocab):
        """Write the characters rarer than the threshold to ``blacklist_file``.

        Entries are joined with '|' so the file can be used as a regex
        alternation. Characters are written as-is (regex metacharacters are
        not escaped), so the result may need manual adjustment.
        """
        blacklist = [char for char in vocab if vocab[char] < blacklist_threshold]
        with open(blacklist_file, 'wb') as handle:
            handle.write('|'.join(blacklist).encode('utf-8'))

    for file_name in tqdm(os.listdir('dataset_generation/papers'), ascii=True):
        with open('dataset_generation/papers/%s' % file_name, 'rb') as source:
            paper_text = source.read().decode('utf-8', 'ignore')
        output = preprocess_paper(paper_text)
        with open('dataset/%s' % file_name, 'wb') as target:
            target.write(output.encode('utf-8'))
    print('>>Char based dataset is created...')

    # The blacklist needs the counts of the whole corpus, so it is written
    # only after every paper has been processed.
    if use_blacklist:
        create_blacklist(vocab)
    if use_vocab:
        return vocab
    return None
def read_blacklist(file_name):
    """Load a blacklist file and return its entries as a list.

    The file is read as bytes, decoded as UTF-8, and split on the '|'
    separator. An empty file therefore yields a list holding one empty
    string.
    """
    with open(file_name, 'rb') as handle:
        raw_bytes = handle.read()
    decoded = raw_bytes.decode('utf-8')
    return decoded.split('|')