-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlscrape.py
53 lines (35 loc) · 1.17 KB
/
htmlscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import csv
import sys

import requests
from lxml import html
# Fetch the DistroWatch main page that the ranking columns are read from.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the real page (which would silently yield three empty lists below).
main_page = requests.get('http://distrowatch.com', timeout=30)
main_page.raise_for_status()
tree = html.fromstring(main_page.content)

# Read the three ranking columns, selected by cell class:
#   th.phr1 -> text of each cell (rank)
#   td.phr2 -> href of each link inside the cell; note that `names` therefore
#              holds link targets, not the visible link text
#   td.phr3 -> text of each cell (score)
ranks = tree.xpath('//th[@class="phr1"]/text()')
names = tree.xpath('//td[@class="phr2"]//a/@href')
scores = tree.xpath('//td[@class="phr3"]/text()')

# Echo the scraped columns so a run can be sanity-checked by eye.
print(ranks)
print(names)
print(scores)
# Scrape the detail page of every ranked distribution and append one CSV row
# per distribution to OUTPUT_PATH.

# Number of trailing first-column text nodes kept from each detail page.
# NOTE(review): 61 is inherited from the original script, which discarded
# everything before the last 61 entries as "useless data" -- confirm that it
# still matches the current page layout.
TAIL_FIELD_COUNT = 61
OUTPUT_PATH = '/home/roger/Documents/python/distrowatchdata.csv'
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled request

# One inner list of kept fields per processed distribution.
distros_info = []
# Number of distributions written so far (stays 0 if there is nothing to do).
count = 0

# zip() stops at the shortest column, so report a mismatch explicitly instead
# of dropping entries without a trace.
if not len(ranks) == len(names) == len(scores):
    sys.stderr.write(
        'warning: rank/name/score column lengths differ ('
        + str(len(ranks)) + '/' + str(len(names)) + '/' + str(len(scores))
        + '); extra entries are skipped\n'
    )

for count, (rank, name, score) in enumerate(zip(ranks, names, scores), start=1):
    # `name` is the href scraped from the ranking table; it is used verbatim
    # as the value of the `distribution` query parameter.
    url = 'http://distrowatch.com/table.php?distribution=' + name
    distro_page = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Do not record the contents of an HTTP error page as distro data.
    distro_page.raise_for_status()
    tree2 = html.fromstring(distro_page.content)
    distro_data = tree2.xpath('//tr/td[1]/text()')

    # Keep only the trailing entries. A slice is used on purpose: when a page
    # has fewer than TAIL_FIELD_COUNT entries, an index loop starting at
    # len - 61 would begin at a negative index and wrap around, duplicating
    # entries from the end of the list.
    distro_info = distro_data[-TAIL_FIELD_COUNT:]
    distros_info.append(distro_info)

    # Append immediately so that rows already scraped survive a later failure.
    # csv.writer quotes fields that contain commas, quotes or newlines, which
    # plain string concatenation would let corrupt the row. The trailing ''
    # and the '\n' terminator keep the historical layout of this file (every
    # row ends with an empty column), so new rows line up with existing ones.
    with open(OUTPUT_PATH, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow([rank, name, score] + distro_info + [''])

    print('done: ' + str(count))