-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_process.py
25 lines (21 loc) · 979 Bytes
/
data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os,json
def preprocess(data_name):
    """Strip punctuation from every JSON file of one dataset split, in place.

    Each entry in the directory ``data_set/<data_name>`` (relative to the
    current working directory) is loaded as JSON. Its ``'source'`` and
    ``'summary'`` values are joined into a single string with no separator,
    the characters `` ` . | : - , ! ? ' `` are removed, and the result is
    written back to the same path, overwriting the original file.

    Args:
        data_name: Name of the sub-directory of ``data_set`` to process
            (the script calls this with 'train', 'test' and 'dev').

    Returns:
        None. The files are modified on disk.

    Raises:
        OSError: If the directory or one of its entries cannot be opened.
        json.JSONDecodeError: If an entry does not contain valid JSON.
        KeyError: If a file lacks the 'source' or 'summary' key.
    """
    # A single translation table removes all unwanted characters in one pass
    # and is shared by both fields, instead of two copies of a replace chain.
    strip_table = str.maketrans('', '', "`.|:-,!?'")

    pathraw = os.path.join('data_set', data_name)
    for filename in os.listdir(pathraw):
        path = os.path.join(pathraw, filename)

        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'r', encoding='utf8') as f:
            data = json.load(f)

        # ''.join() accepts either a list of strings or a plain string.
        for field in ('source', 'summary'):
            data[field] = ''.join(data[field]).translate(strip_table)

        # Write with the same encoding used for reading. json.dump escapes
        # non-ASCII characters by default, so the bytes on disk are ASCII.
        with open(path, 'w', encoding='utf8') as f:
            json.dump(data, f)
# Run the punctuation cleanup over each dataset split directory under
# data_set/: train first, then test, then dev.
for _split in ('train', 'test', 'dev'):
    preprocess(_split)