-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_warnings.py
104 lines (86 loc) · 3.08 KB
/
scrape_warnings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import urllib.request
from urllib.request import urlopen
import re
from collections import defaultdict
import csv
base_url = "https://punishments.mindbuzz.com.au/warnings.php?page="
header = {'User-Agent': 'Mozilla/5.0'}
pattern_3 = '<a href="info.php\?type=warn&id=(\d+)">(.*?)</a></td><td>'
# open and decode the webpage
def fetch_page(url):
req = urllib.request.Request(url=url, headers=header)
print("opening...")
page = urlopen(req)
print("opened!")
bytes = page.read()
raw_html = bytes.decode("utf-8")
print("decoded")
return raw_html
# retrieve the dehta (but remove all html goop {<>})
def get_reason(raw_html):
match_results = re.findall(pattern_3, raw_html, re.IGNORECASE)
reasons_with_ids = [(match[0], re.sub("<.*?>", "", match[1])) for match in match_results]
return reasons_with_ids
# fooorrrmmmatttt
def format_output(reasons_with_ids):
formatted_output = []
for i in range(0, len(reasons_with_ids), 4):
id = reasons_with_ids[i][0]
who_was_warned = reasons_with_ids[i][1]
warned_by = reasons_with_ids[i+1][1]
reason = reasons_with_ids[i+2][1]
time = reasons_with_ids[i+3][1]
expired = "yes" if "(Expired)" in time else "no"
time = time.replace(" (Expired)", "")
formatted_output.append({
"ID": id,
"Who was warned": who_was_warned,
"Warned by": warned_by,
"Reason": reason,
"Time": time,
"Expired?": expired
})
return formatted_output
# clear all warnings
all_warnings = []
# start doing stuff!
page_number = 1
while True:
# increment the URL by 1 and fetch
url = base_url + str(page_number)
raw_html = fetch_page(url)
# extract the reasons
reasons_with_ids = get_reason(raw_html)
if not reasons_with_ids: # if none, stop
break
# format
all_warnings.extend(format_output(reasons_with_ids))
# increase URL number
page_number += 1
# tally warnings
warns_given = defaultdict(int)
warns_received = defaultdict(int)
for warning in all_warnings:
warns_given[warning["Warned by"]] += 1
warns_received[warning["Who was warned"]] += 1
# create and write to the csv file
with open("warnings_summary.csv", "w", newline='') as csvfile:
csv_writer = csv.writer(csvfile)
# tally all the warnings
csv_writer.writerow(["Player", "Number of warns given", "Number of warns received"])
for player in set(warns_given.keys()).union(set(warns_received.keys())):
csv_writer.writerow([player, warns_given[player], warns_received[player]])
csv_writer.writerow([])
csv_writer.writerow(["ID", "Who was warned", "Warned by", "Reason", "Time", "Expired?"])
# do the list of all the warnings
for item in all_warnings:
csv_writer.writerow([
item["ID"],
item["Who was warned"],
item["Warned by"],
item["Reason"],
item["Time"],
item["Expired?"]
])
print("data written :]")
print("if you see this it means that the code didn't error out... good job!")