This repository has been archived by the owner on Jan 27, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
47 lines (43 loc) · 1.61 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#This script scrapes Encyclopaedia Denhac (the denhac wiki) for the current button-to-pop (soda) mapping
#Logic
##Get the web page at http://denhac.org/wiki/index.php?title=Soda_Machine
##Parse it for the key value pair of button:sodaType
##Export it as JSON to a flat file with specified path
import requests,json
from bs4 import BeautifulSoup
def _parse_soda_table(html):
    """Extract the button -> soda mapping from the wiki page's HTML.

    Args:
        html: The raw HTML text of the Soda_Machine wiki page.

    Returns:
        A dict mapping the text of the first cell of each two-cell table
        row to the text of its second cell.

    Raises:
        RuntimeError: If the page contains no table with class "wikitable".
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "wikitable"})
    if table is None:
        raise RuntimeError('No <table class="wikitable"> found in the page')

    buttons = {}
    for row in table.find_all("tr"):
        cells = row.find_all("td")
        # Only rows with exactly two <td> cells form a key/value pair;
        # anything else (e.g. a row made of <th> cells) is skipped.
        if len(cells) != 2:
            continue
        # get_text() drops the markup itself, so cells with attributes or
        # nested tags no longer leak HTML into the output.
        button, soda = (cell.get_text().replace("\n", "") for cell in cells)
        buttons[button] = soda
    return buttons


def main():
    """Scrape the soda machine wiki page and save the mapping as JSON.

    Fetches the wiki page, parses its "wikitable" into a button -> soda
    dict, prints the resulting JSON and writes it to ``testOut.json`` in
    the current working directory.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
        RuntimeError: If the page has no "wikitable" table.
    """
    output_path = "testOut.json"
    remote_url = "http://denhac.org/wiki/index.php?title=Soda_Machine"

    # Without a timeout requests may block indefinitely on a dead server.
    page = requests.get(remote_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()

    soda_json = json.dumps(_parse_soda_table(page.text), ensure_ascii=False)

    # ensure_ascii=False may emit non-ASCII characters, so pin the file
    # encoding rather than relying on the platform default.
    with open(output_path, "w", encoding="utf-8") as out_file:
        print(soda_json)
        out_file.write(soda_json)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()