forked from ouyangsizhuo/2021Spring_CRF_AGACtask1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathto_xml_needed.py
41 lines (40 loc) · 1.38 KB
/
to_xml_needed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
'''
Author:zhoukaiyin
Time: 2018-01-16
'''
# This script reformats the downloaded and renamed XML files into the layout required by the CRF pipeline.
from bs4 import BeautifulSoup as bs
import codecs
import os
import sys
import glob
# Standard-library XML escaping helpers used by build_label_xml below.
from xml.sax.saxutils import escape, quoteattr


def build_label_xml(drug, text):
    """Return the TAC2017_ADR ``<Label>`` document wrapping *text*.

    Args:
        drug: Value written to the ``drug`` attribute of ``<Label>``.
        text: Plain text placed inside the single ``adverse reactions``
            section.

    Returns:
        The complete XML document as one string, ending with a newline.
    """
    # NOTE: ampersands are dropped rather than escaped. That is what this
    # script has always done, so it is kept to avoid changing the text that
    # downstream steps see. '<' and '>' are escaped so the file stays
    # well-formed XML (the previous replacements for them were no-ops).
    body = escape(text.replace('&', ''))
    return ''.join([
        '<?xml version="1.0" encoding="UTF-8"?>' + '\n',
        # quoteattr adds the surrounding quotes and escapes the value.
        '<Label drug=' + quoteattr(drug) + ' track="TAC2017_ADR">' + '\n',
        ' <Text>' + '\n',
        ' <Section name="adverse reactions" id="S1">',
        ' ' + body,
        '\n' + ' </Section>' + '\n',
        ' </Text>' + '\n',
        '</Label>' + '\n',
    ])


def convert_file(src, out_dir):
    """Convert one input file into ``<out_dir>/<stem>.xml``.

    The stem is the part of the file name before its first dot; it is used
    both as the output file name and as the ``drug`` attribute.

    Args:
        src: Path of the input file, read as UTF-8.
        out_dir: Existing directory that receives the output file.

    Returns:
        True when an output file was written, False when the input has no
        ``<document>`` element (nothing is written in that case).
    """
    drug = os.path.basename(src).split('.')[0]
    with codecs.open(src, 'r', encoding='utf8') as rf:
        markup = rf.read()

    # Parse before opening the output file so that a failure here does not
    # leave an empty .xml file behind.
    document = bs(markup, 'lxml').document
    if document is None:
        return False
    text = document.text.replace('\n', ' ')

    target = os.path.join(out_dir, drug + '.xml')
    with codecs.open(target, 'w', encoding='utf8') as wf:
        wf.write(build_label_xml(drug, text))
    return True


def main(argv=None):
    """Convert every file in the input directory into the output directory.

    Args:
        argv: ``[input_dir, output_dir]``; defaults to ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: to_xml_needed.py <input_dir> <output_dir>")
    inppath, outpath = args[0], args[1]

    # os.makedirs avoids spawning a shell, copes with spaces in the path and
    # creates missing parent directories.
    os.makedirs(outpath, exist_ok=True)

    for path in sorted(glob.glob(os.path.join(inppath, "*"))):
        # glob also returns sub-directories; opening one raises
        # IsADirectoryError on POSIX and PermissionError on Windows.
        if not os.path.isfile(path):
            continue
        print(os.path.basename(path))
        try:
            written = convert_file(path, outpath)
        except PermissionError as err:
            # Still best-effort, but no longer silent.
            print("skipped {}: {}".format(path, err), file=sys.stderr)
            continue
        if not written:
            print("skipped {}: no <document> element".format(path),
                  file=sys.stderr)


if __name__ == "__main__":
    main()