mufa_scraper.py
import csv
import re
import time

import bs4
import requests


def scrape_teams(session, parent_league_id):
    """Get all the team IDs, and the sub-league ID for each team.

    Takes parameter parent_league_id, which is the integer ID for the main
    league (e.g. Summer Swiss 2013).
    Returns a list of (team_name, team_id, league_id) tuples.
    """
    html_response = session.get("http://sandlotsports.biz/teams/",
                                params={"leagueid": parent_league_id})
    html_file = html_response.text
    soup = bs4.BeautifulSoup(html_file, 'lxml')
    teams_table = soup.find('table')  # Just get the first table
    # Could replace this with a generator
    table_rows = teams_table.select('tr')  # Select all table rows
    teams_list = []
    # Create rows for a list of teams
    for row in table_rows:
        name_cell = row.find('td', class_='teamName')
        if name_cell is None:
            continue  # Skip rows (e.g. a header row) without a team name cell
        team_name = name_cell.get_text()
        # From the URL of the team's page, we can get the team/league IDs
        url = name_cell.find('a')['href']
        # Get the team ID using a regex
        teamid_regex = re.compile(r'teamid=\d{1,6}', re.IGNORECASE)
        teamid_match = teamid_regex.search(url)
        team_id = ''.join(x for x in teamid_match.group() if x.isdigit())
        # Get the league ID using a regex
        leagueid_regex = re.compile(r'leagueid=\d{1,6}', re.IGNORECASE)
        leagueid_match = leagueid_regex.search(url)
        league_id = ''.join(x for x in leagueid_match.group() if x.isdigit())
        teams_list.append((team_name, team_id, league_id))
    print(html_response.headers)  # Debug output: inspect the response headers
    return teams_list
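

# Note: the regex approach above works, but the same IDs can also be pulled
# out of the link's query string with the standard library. This is an
# optional, hypothetical sketch of an equivalent helper; it assumes the team
# links carry "teamid" and "leagueid" query parameters (lowercase), as the
# regexes above expect. It is not called anywhere in this script.
def parse_team_url(url):
    """Return (team_id, league_id) parsed from a team page URL, '' if absent."""
    # Imported locally so this optional helper stays self-contained
    from urllib.parse import urlparse, parse_qs
    query = parse_qs(urlparse(url).query)
    team_id = query.get('teamid', [''])[0]
    league_id = query.get('leagueid', [''])[0]
    return team_id, league_id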


def scrape_scores(session, team_id, league_id):
    """Scrape the game results table for a specific team in a league.

    Writes rows of data to a tab-delimited text file.
    """
    html_response = session.get("http://sandlotsports.biz/teams/",
                                params={"teamid": team_id,
                                        "leagueid": league_id})
    html_file = html_response.text
    # Create a BeautifulSoup object from the file, parsing the HTML
    soup = bs4.BeautifulSoup(html_file, 'lxml')
    try:
        # Get the team name from the header
        team_header = soup.find('h2', id='pageName')
        # Team name comes after the header in the DOM
        team_name = team_header.next_element
    except Exception:
        # If there are problems here, log the team/league for investigation
        with open('mufa_errors.txt', 'a', encoding='utf-8',
                  newline='') as errorf:
            error_writer = csv.writer(errorf, dialect='excel-tab')
            error_writer.writerow((team_id, league_id))
        # Without a team name there is nothing to scrape, so stop here
        return
    # Get the team rating if available
    try:
        # The rating text follows "Self Rating: " (13 characters)
        team_rating = team_header.find('br').next_element[13:]
    except AttributeError:
        team_rating = ''
    # If there's no rating, the site usually appends ' (-)' to the name
    team_name = team_name.replace(' (-)', '')
    try:
        # Get just the <table> tag with scores/games
        games_table = soup.select('table#upcomingGames')[0]
        # Identify where each column will be in the table (this will vary
        # based on the season; for example, Fall has no self-ratings.)
        header_row = [header for header in games_table.find('tr').contents
                      if isinstance(header, bs4.element.Tag)]
        header_dict = {}
        # Strip out the HTML tags from each header, mapping each header name
        # to its column index
        for i, header in enumerate(header_row):
            header_name = (str(header).replace('</br>', '')
                           .replace('<br>', ' ').replace('<br/>', ' ')
                           .replace('<th>', '').replace('</th>', ''))
            header_dict[header_name] = i
        # Create a list of bs4 Tags for table rows
        # Only get the rows with a CSS class, so we don't grab the header row
        table_rows = games_table.find_all('tr', class_=True)
        # In Python 3, we open the file in text (not "binary") mode
        with open('mufa_test.txt', 'a', encoding='utf-8', newline='') as fout:
            # One writer handles every row in this team's table
            mufa_writer = csv.writer(fout, dialect='excel-tab')
            for row in table_rows:
                # Set up a variable for an iterable of the row's contents
                r = row.contents
                # If there's no score, skip the row
                if not ''.join(x for x in r[header_dict['Score']].get_text()
                               if x.isdigit()):
                    print("No score found for " + r[0].get_text())
                    continue
                # Get the game location, stripping out map/diagram links
                # Separate out the park and field names as well
                game_loc = (r[header_dict['Field']].get_text()
                            .replace(' (', '')
                            .replace(')', '').replace("Map", '')
                            .replace("Diagram", ''))
                field_names = ['A', 'B', 'C', 'D', 'E', 'F', 'North',
                               'South', 'East', 'West', '1', '2', '3',
                               '4', '5', '6', '7', '8']
                if game_loc.split()[-1] in field_names and ':' not in game_loc:
                    game_park = ' '.join(game_loc.split()[:-1])
                    game_field = game_loc.split()[-1]
                elif game_loc.split()[-1] in field_names and ':' in game_loc:
                    game_park = ' '.join(game_loc.split()[:-1])
                    game_field = ''
                else:
                    game_park = game_loc
                    game_field = game_loc
                # Add all columns that will always exist
                data_row = [
                    # Get the date, stripped of other text
                    r[header_dict['Date']].get_text()[
                        0:r[header_dict['Date']].get_text().find(' ')],
                    team_id,  # Team ID, which gets passed in to the function
                    league_id,  # League ID, passed in to the function
                    team_name,  # Team name var from the top
                    # Get the opponent; strip their score, which is typically
                    # displayed in parens
                    r[header_dict['Opponent']].get_text()[
                        0:r[header_dict['Opponent']].get_text().find('(') - 1],
                    # Get the result, without the "c" captain's link
                    r[header_dict['Win or Loss']].get_text()[
                        0:r[header_dict['Win or Loss']].get_text().find(' ') - 1],
                    # Get the scores, keeping only the digits (remove odd chars)
                    ''.join(x for x in r[header_dict['Score']].get_text()
                            if x.isdigit()),
                    ''.join(x for x in r[header_dict['Opp. Score']].get_text()
                            if x.isdigit()),
                    # Game location
                    game_loc,
                    game_park,
                    game_field,
                    r[header_dict['Game Time']].get_text(),
                    team_rating,
                ]
                # Append the opponent's rating, which may or may not exist,
                # depending on the season
                try:
                    data_row.append(r[header_dict['Self Rtg']].get_text())
                except KeyError:
                    data_row.append('')
                print("Adding row to file:\n{0}".format(data_row))
                mufa_writer.writerow(data_row)
    except Exception:
        # Log any team/league IDs that caused an error, for investigation
        with open('mufa_errors.txt', 'a', encoding='utf-8',
                  newline='') as errorf:
            error_writer = csv.writer(errorf, dialect='excel-tab')
            error_writer.writerow((team_id, league_id))
    # Add a time delay to be gentle on the server
    time.sleep(15)
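

# For reference, each row written to mufa_test.txt above has, in order: date,
# team ID, league ID, team name, opponent, result, team score, opponent score,
# full location, park, field, game time, the team's self-rating, and (when the
# season provides that column) the opponent's self-rating.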


if __name__ == '__main__':
    with requests.Session() as sess:
        parent_league_id = input("Please enter the parent league ID: ")
        teams_list = scrape_teams(sess, parent_league_id)
        # For each team row in teams_list, call scrape_scores
        for team_name, team_id, league_id in teams_list:
            scrape_scores(sess, team_id, league_id)
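
# A minimal sketch (assuming the output file name used above) of reading the
# scraped results back for a quick check, e.g. from a separate script:
#
#     import csv
#     with open('mufa_test.txt', encoding='utf-8', newline='') as fin:
#         for game in csv.reader(fin, dialect='excel-tab'):
#             print(game)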