-
Notifications
You must be signed in to change notification settings - Fork 0
/
SSD_inserter.py
186 lines (158 loc) · 7.99 KB
/
SSD_inserter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import mysql.connector
import configparser as cf
from selenium import webdriver
from time import sleep
from random import randint
from selenium.webdriver.common.by import By
from datetime import datetime
def _is_plain_sql_identifier(name):
    """Return True if ``name`` is a plain, optionally dotted, SQL identifier.

    Only letters, digits, ``_`` and ``$`` are accepted in each dot-separated
    part, so the name can be spliced into SQL text without quoting.
    """
    if not isinstance(name, str) or not name:
        return False
    return all(
        part and all(ch.isalnum() or ch in "_$" for ch in part)
        for part in name.split(".")
    )


# function to check if an entry has already been inserted into the table
def checkIfExists(cursor, table_name, column_name, value):
    """Return True when ``table_name`` has a row whose ``column_name`` equals ``value``.

    Args:
        cursor: DB-API cursor; only ``execute`` and ``fetchone`` are used.
        table_name: Table to search. Interpolated into the SQL text, so it
            must be a plain identifier (``db.table`` is also accepted).
        column_name: Column to compare. Same restriction as ``table_name``.
        value: Value to look for; passed as a bound ``%s`` parameter.

    Returns:
        bool: True if the reported row count is greater than zero.

    Raises:
        ValueError: If ``table_name`` or ``column_name`` is not a plain
            identifier.
    """
    # Identifiers cannot be bound as query parameters, so they are validated
    # before being formatted into the statement.
    for identifier in (table_name, column_name):
        if not _is_plain_sql_identifier(identifier):
            raise ValueError(f"unsafe SQL identifier: {identifier!r}")

    query = f"SELECT COUNT(*) FROM {table_name} WHERE {column_name} = %s"
    cursor.execute(query, (value,))
    result = cursor.fetchone()
    # Treat a missing result row as "not found" instead of failing on None[0].
    return result is not None and result[0] > 0
# Spec-page section heading -> {row header -> ``ssd`` column storing that row}.
_SSD_SECTION_FIELDS = {
    "Solid-State-Drive": {
        "Capacity:": "capacity",
    },
    "NAND Flash": {
        "Type:": "nand_type",
        "Technology:": "nand_technology",
        "Capacity:": "nand_capacity",
    },
    "Physical": {
        "Form Factor:": "form_factor",
        "Interface:": "interface",
        "Protocol:": "protocol",
    },
    "Controller": {
        "Architecture:": "controller",
    },
    "DRAM Cache": {
        "Capacity:": "dram",
    },
    "Performance": {
        "Sequential Read:": "read_speed",
        "Sequential Write:": "write_speed",
        "Random Read:": "random_read",
        "Random Write:": "random_write",
        "Endurance:": "endurance",
    },
}

# Columns of the ``ssd`` table, in the order they appear in the INSERT.
_SSD_COLUMNS = (
    "capacity", "interface", "read_speed", "write_speed", "endurance",
    "dram", "model", "protocol", "form_factor", "controller", "nand_type",
    "nand_capacity", "nand_technology", "random_read", "random_write",
)


def _first_text(parent, css_selector):
    """Return the text of the first match of ``css_selector`` under ``parent``, or None."""
    matches = parent.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0].text if matches else None


def parseSsdPage(url, cursor):
    """Load one drive's spec page and insert its specs into the ``ssd`` table.

    Uses the module-level ``driver`` to load and query the page and the
    module-level ``conn`` to commit the insert. Every spec that is not found
    on the page is stored as the string ``"Unknown"``. Nothing is inserted
    when the model name is already present in the table.

    Args:
        url: Address of the drive's spec page.
        cursor: DB-API cursor used for the duplicate check and the INSERT.

    Returns:
        None.
    """
    # first we initialize the values in case they are not found on the page
    specs = {column: "Unknown" for column in _SSD_COLUMNS}

    driver.get(url)
    # Fixed wait after navigation before the page is queried.
    # NOTE(review): presumably also throttles requests to the site - confirm
    # before shortening it or replacing it with an explicit wait.
    sleep(30)

    # get all the information on the page into the sections variable
    sections = driver.find_elements(By.CSS_SELECTOR, "section.details")
    model = driver.find_element(By.CLASS_NAME, "drivename").text
    specs["model"] = model

    # check if this model is already in the sql table
    if checkIfExists(cursor, "ssd", "model", model):
        print(f"{model} already in table")
        return

    # loop through the information and put all relevant specs into specs
    for section in sections:
        heading = _first_text(section, "h1")
        wanted = _SSD_SECTION_FIELDS.get(heading)
        if not wanted:
            # Section without a heading, or one we do not store anything from.
            continue
        for row in section.find_elements(By.CSS_SELECTOR, "tr"):
            # Rows lacking a header or value cell are skipped rather than
            # raising NoSuchElementException and aborting the whole page.
            column = wanted.get(_first_text(row, "th"))
            if column is None:
                continue
            cell_text = _first_text(row, "td")
            if cell_text is not None:
                specs[column] = cell_text

    # insert into the table; column list and values come from the same tuple
    # so they cannot get out of step
    placeholders = ", ".join(["%s"] * len(_SSD_COLUMNS))
    query = f"INSERT INTO ssd ({', '.join(_SSD_COLUMNS)}) VALUES ({placeholders})"
    values = tuple(specs[column] for column in _SSD_COLUMNS)
    cursor.execute(query, values)
    conn.commit()
    print(f"{model} inserted into database")
#target attributes: /capacity, /interface, /random_read, /random_write /read_speed(sequential read), /write_speed,
# endurance, /dram, model, /protocol, /form_factor, /controller(architecture)
#/nand_type, /nand_capacity, /nand_technology,
# Crawl driver: search the spec index for every two-character combination
# and store each drive found via parseSsdPage.
SPEC_INDEX_URL = 'https://www.techpowerup.com/ssd-specs/'
SEARCH_BAR_SELECTOR = ".js-search-input.search-input"
# Combinations whose running index is <= this value are skipped.
# NOTE(review): looks like a hand-edited resume point from an earlier,
# interrupted run - confirm, and set to -1 to crawl every combination.
RESUME_AFTER = 476
# Back-off after any failure before the crawl is restarted (20 minutes).
RETRY_DELAY_SECONDS = 1200

driver = webdriver.Chrome()
# Open a website
driver.get(SPEC_INDEX_URL)
sleep(15)
# find the search bar
search_bar = driver.find_element(By.CSS_SELECTOR, SEARCH_BAR_SELECTOR)
# the approach: type two letters into search bar (aa, ab, ac, etc) and then
# visit each entry and insert based on info from page
# NOTE(review): '0' is not part of the alphabet - confirm that is intended.
alphabet = list("abcdefghijklmnopqrstuvwxyz123456789")

while True:
    conn = None
    succeeded = False
    try:
        config = cf.ConfigParser()
        config.read('config.ini')
        db_host = config.get('Database', 'HOST')
        db_user = config.get('Database', 'USER')
        db_password = config.get('Database', 'PASSWORD')
        db_name = config.get('Database', 'DATABASE')
        conn = mysql.connector.connect(
            host=db_host,
            user=db_user,
            password=db_password,
            database=db_name,
        )
        cursor = conn.cursor()

        counter = 0
        for first in alphabet:
            for second in alphabet:
                if counter > RESUME_AFTER:
                    # we need to reload the page and relocate the search bar
                    # each time because we will be loading new pages
                    driver.get(SPEC_INDEX_URL)
                    search_bar = driver.find_element(By.CSS_SELECTOR, SEARCH_BAR_SELECTOR)
                    search_bar.send_keys(f"{first}{second}")
                    sleep(5)

                    # Collect every href before visiting any of them:
                    # parseSsdPage navigates the same driver away from this
                    # result list.
                    drive_links = []
                    for title in driver.find_elements(By.CSS_SELECTOR, ".drive-title"):
                        for anchor in title.find_elements(By.TAG_NAME, 'a'):
                            if anchor.text.strip():
                                drive_links.append(anchor.get_attribute("href"))

                    for href in drive_links:
                        parseSsdPage(href, cursor)
                counter += 1
                print(f"counter is {counter}")
        succeeded = True
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop
        # the script instead of triggering a 20-minute wait.
        now = datetime.now()
        print(f"Error at {now.strftime('%H:%M:%S')}. Waiting for 20 minutes")
        print(f"Cause: {exc!r}")
    finally:
        # Each attempt opens its own connection; release it before waiting
        # or exiting so retries do not pile up open connections.
        if conn is not None:
            try:
                conn.close()
            except Exception as close_exc:
                print(f"Could not close database connection: {close_exc!r}")

    if succeeded:
        # A full pass finished without error: stop instead of starting over.
        break
    sleep(RETRY_DELAY_SECONDS)

driver.quit()