-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMedline Parse Example.py
83 lines (72 loc) · 3.08 KB
/
Medline Parse Example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from bs4 import BeautifulSoup
#Use BeautifulSoup to clean up html files
#BeautifulSoup documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
html_doc = '/Users/christopherpan 1/Desktop/TOPIC MODELING PROJECT/OLD/medlineplus.gov/ency/article/000423.htm'
data = ''
with open(html_doc, 'r') as f:
data = f.read().replace('\n', '')
#New lines appear as \n in the html file so this removes all of them
soup = BeautifulSoup(data, 'html.parser')
#Create "soup" to work on
#print(soup.prettify) prints what the "soup" looks like
def get_plain_text(soup):
for el in soup.find_all('li'):
el.insert(0, '123456')
el.append('123456')
for el in soup.find_all('p'):
el.insert(0, '123456')
el.append('123456')
for el in soup.find_all('h2'):
el.insert(0, '123456')
el.append('123456')
#123456's are inserted and then replaced later on by \n so that the final text file looks like the actual page
elements1 = soup.find_all('div', id = 'section-Ref')
for element in elements1:
element.decompose()
elements2 = soup.find_all('div', id = 'section-version')
for element in elements2:
element.decompose()
elements3 = soup.find_all(class_ = 'adam-info')
for element in elements3:
element.decompose()
elements4 = soup.find_all(class_ = 'videobox')
for element in elements4:
element.decompose()
elements5 = soup.find_all(class_ = 'section img-thumb')
for element in elements5:
element.decompose()
#element.decompose() removes unnecessary elements
#For example, all elements with the tag 'videobox' are videos
#You have to look through the page source (or soup.prettify) to find the tags of unnecessary elements
plain_text = ''
#For Medline articles, class_ = 'main-single' contains the content of the text
for el in soup.find_all(class_ = 'main-single'):
plain_text += el.get_text()
plain_text = plain_text.replace(u'\xa0', ' ')
final_text = ''
if 'References' in plain_text:
i = 0
while i < (plain_text.index('References')):
final_text += plain_text[i]
i += 1
#In Medline, I did not want the References part of the text so I removed it
else:
final_text = plain_text
return final_text
print(soup.title.string)
#Title of the article
a = (repr(get_plain_text(soup)).replace('\\','')) #repr turns it into a string
#Replaces are specific to Medline
b = a.replace('.', '. ')
c = b.replace('123456', '\n')
d = c.replace('123456', '\n')
print(d[1:len(d)-1]) #Remove ' at beginning and end of string
#After running ^ on all files, run this on all text files to remove all empty lines or white spaces at the beginning of the line (This is separate code)
import os
with open('/Users/christopherpan 1/Desktop/TOPIC MODELING PROJECT/FINAL DATA/Medline/000001.htm.txt', 'r') as filen:
complete = os.path.join('/Users/christopherpan 1/Desktop/TOPIC MODELING PROJECT/', 'test.txt')
f = open(complete, 'w')
for i in filen.readlines():
if not i.strip():
continue
f.write(i)