forked from fajri91/sum_liputan6
-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_preprocessing.py
80 lines (67 loc) · 2.34 KB
/
1_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# coding: utf-8
import json
import glob
import json
import re, os
# Regex matching a parenthesised group and capturing the text between the
# parentheses.
# NOTE(review): not referenced anywhere in the visible part of this file.
regex_bracket = r"\(([^)]+)\)"
# Punctuation characters, including a literal backslash.  The backslash is
# spelled as an explicit "\\" so the string value is unchanged while avoiding
# the invalid "\[" escape sequence (a SyntaxWarning on recent Python versions).
# NOTE(review): not referenced anywhere in the visible part of this file;
# confirm whether the backslash member is intentional.
punctuation = '.,!?\'\\[]();"'
# Whitespace-delimited words for which the tokenizer in clean_article()
# produced no tokens; filled in during processing and printed at the end of
# the script.
unknown = set()
def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` span removed.

    The match is non-greedy, so each span ends at the first ``>`` that
    follows a ``<``.  Because ``.`` does not match a newline here, a span
    broken across lines is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
# Tokenizer: either a run of word characters plus the listed symbols, or a
# single punctuation mark.  Compiled once here instead of once per word.
_TOKEN_PATTERN = re.compile(r"[\w'\%\&\-\/\=\+\*$£]+|[\[\]().,!?\:;\"\“\”]")
# Tokens that close a sentence.
_SENTENCE_ENDINGS = ('.', '!', '?')


def clean_article(article):
    """Strip markup from *article* and split it into tokenized sentences.

    The text is passed through ``cleanhtml``, newlines are turned into
    spaces, and the result is split on single spaces.  Each non-empty piece
    is normalised (en dash to hyphen, ``__`` and ``--`` removed) and then
    tokenized.  A sentence is closed whenever the last token of a piece is
    ``.``, ``!`` or ``?``.

    Args:
        article: Raw text, possibly containing ``<...>`` markup.

    Returns:
        A list of sentences, each a list of token strings.  If trailing
        tokens remain that do not end in ``.``, ``!`` or ``?``, a ``.``
        token is appended before they are added as the final sentence.

    Side effects:
        Pieces that produce no tokens at all are added to the module-level
        ``unknown`` set and contribute nothing to the output.
    """
    article = cleanhtml(article)
    article = article.replace('\n', ' ')
    sentences = []
    words = []
    for word in article.split(' '):
        word = word.replace('–', '-')
        word = word.replace('__', '').replace('--', '')
        # NOTE(review): as written, both arguments are the same ASCII double
        # quote, so this call changes nothing.  Presumably it once normalised
        # a typographic quote -- confirm against the upstream file.
        word = word.replace('"', '"')
        word = word.strip()
        if not word:
            continue
        tokens = _TOKEN_PATTERN.findall(word)
        if not tokens:
            # Nothing recognisable in this piece: record it explicitly
            # instead of relying on an IndexError caught by a bare except.
            unknown.add(word)
            continue
        words += tokens
        if tokens[-1] in _SENTENCE_ENDINGS:
            sentences.append(words)
            words = []
    if words:
        # Make sure the final, unterminated sentence still ends with a
        # sentence-closing token.
        if words[-1][-1] not in _SENTENCE_ENDINGS:
            words.append('.')
        sentences.append(words)
    return sentences
def get_string(sentences):
    """Flatten *sentences* and join all of their tokens with single spaces.

    Args:
        sentences: An iterable of token sequences.

    Returns:
        One string containing every token, in order, separated by a space.
    """
    return ' '.join(token for sentence in sentences for token in sentence)
def process(PATH, DST):
    """Clean every raw JSON document matched by *PATH* and write it to *DST*.

    Each input file is expected to be a JSON object with the keys ``id``,
    ``url``, ``content`` and ``summary``.  Documents whose content has at
    most 30 whitespace-separated words, or whose summary has at most 10,
    are skipped.  For the rest, content and summary are tokenized with
    ``clean_article`` and written to ``<DST>/<id>.json`` with the keys
    ``id``, ``url``, ``clean_article`` and ``clean_summary``.

    The document id is printed when the cleaned text has fewer
    whitespace-separated tokens than the raw text, so such documents can be
    inspected.

    Args:
        PATH: Glob pattern selecting the raw JSON files.
        DST: Output directory; created if missing.  A trailing separator
            is optional.
    """
    os.makedirs(DST, exist_ok=True)
    for file in glob.glob(PATH):
        # Close the handle deterministically and decode as UTF-8 rather
        # than the platform default.
        with open(file, encoding='utf-8') as raw_file:
            data = json.load(raw_file)
        article = data['content']
        summary = data['summary']
        article_len = len(article.split())
        summary_len = len(summary.split())
        if article_len <= 30 or summary_len <= 10:
            # Too short to be useful; skip.
            continue
        article_arr = clean_article(article)
        summary_arr = clean_article(summary)
        article_v2 = get_string(article_arr).split()
        summary_v2 = get_string(summary_arr).split()
        if len(article_v2) < article_len or len(summary_v2) < summary_len:
            # Tokenization produced fewer tokens than the raw word count.
            print(str(data['id']))
        clean_data = {
            'id': data['id'],
            'url': data['url'],
            'clean_article': article_arr,
            'clean_summary': summary_arr,
        }
        # os.path.join keeps the output inside DST whether or not DST ends
        # with a path separator.
        out_path = os.path.join(DST, str(clean_data['id']) + '.json')
        with open(out_path, 'w', encoding='utf-8') as json_file:
            json.dump(clean_data, json_file)
# Clean each dataset split in turn: (glob of raw files, output directory).
for raw_glob, clean_dir in (
    ('data/raw/train/*', 'data/clean/train/'),
    ('data/raw/dev/*', 'data/clean/dev/'),
    ('data/raw/test/*', 'data/clean/test/'),
):
    process(raw_glob, clean_dir)
# Show the words collected in `unknown` during processing.
print(unknown)