-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
110 lines (97 loc) · 7.07 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import pandas as pd
import sqlite3
from tqdm import tqdm # For some sick loading bars 8D
import sys
import statistics
import math
statcastdb = sqlite3.connect('raw2017.db') # Defining statcast database for MLB 2017 season
# Defining all 30 MLB team abbreviations
teams = ['ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CIN',
'CLE', 'COL', 'CWS', 'DET', 'HOU', 'KC',
'LAA', 'LAD', 'MIA', 'MIL', 'MIN', 'NYM',
'NYY', 'OAK', 'PHI', 'PIT', 'SD', 'SEA',
'SF', 'STL', 'TB', 'TEX', 'TOR','WSH']
refresh = False
normalize = False
for arg in sys.argv:
if arg == '-r':
refresh = True
elif arg == '-n':
normalize = True
if refresh: # Checking for flag
print("Scraping database...") # Begin scraping from mlb.com
for year in tqdm(range(2017, 2018), desc = 'Years scraped'): # Currently capping year at 2017
for team in tqdm(teams, desc = 'Teams scraped', leave = False): # Currently iterating through all 30 MLB teams
# Link is concatenated with team and year in ID locations
link = 'https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=&hfC=&hfSea=' + str(year) + '%7C&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&team=' + team + '&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name-event&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&min_abs=0&type=details&'
data = pd.read_csv(link, low_memory=False) # Error if no low_memory
data.rename(columns={'player_name': 'pitcher_name'}, inplace=True) # Rename player_name to denote that it is the pitcher
pd.io.sql.to_sql(data, name='statcast', con=statcastdb, if_exists='append') # Insert to table
statcastdb.commit()
print("Processing database...") # Make adjustments to database
statcastdb.execute("DELETE FROM statcast WHERE rowid NOT IN (SELECT MIN(rowid) FROM statcast GROUP BY sv_id, batter, pitcher)") # I forgot what this does
statcastdb.execute("DELETE FROM statcast WHERE description = 'foul'") # Get rid of fouls
statcastdb.execute("ALTER TABLE statcast ADD batter_name TEXT") # Add column for batter names
statcastdb.execute("UPDATE statcast SET description='hit_into_play' WHERE description='hit_into_play_score'") # Merge hits and hits+RBI's
statcastdb.execute("DELETE FROM statcast WHERE description!='hit_into_play'") # Delete all non-hits
statcastdb.commit()
#statcastdb.close()
print("Database done processing!")
allBatterEntries = statcastdb.execute("SELECT batter FROM statcast").fetchall() # Get all batters
uniqueBatterEntriesSet = set(allBatterEntries) # Filter list of batters such that each batter is there once by making it a set
uniqueBatterEntriesList = list(uniqueBatterEntriesSet) # Convert set to list
playerIDLink = 'http://crunchtimebaseball.com/master.csv' # Set link to scrape
playerID = pd.read_csv(playerIDLink, low_memory=False, encoding='latin-1') # Error if no low_memory
playerIDNumber = list(playerID["mlb_id"]) # List of player ID's
playerIDName = list(playerID["mlb_name"]) # Corresponding list of player names
for batterEntry in tqdm(uniqueBatterEntriesList, desc = 'Player IDs updated', leave = False): # For each batter
try:
row = playerIDNumber.index(batterEntry[0]) # Get row index for batter
# Replace player's ID with player's name
statcastdb.execute("UPDATE statcast SET batter_name=? WHERE batter=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos1_person_id=? WHERE pos1_person_id=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos2_person_id=? WHERE pos2_person_id=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos3_person_id=? WHERE pos3_person_id=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos4_person_id=? WHERE pos4_person_id=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos5_person_id=? WHERE pos5_person_id=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos6_person_id=? WHERE pos6_person_id=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos7_person_id=? WHERE pos7_person_id=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos8_person_id=? WHERE pos8_person_id=?", (playerIDName[row],playerIDNumber[row]))
statcastdb.execute("UPDATE statcast SET pos9_person_id=? WHERE pos9_person_id=?", (playerIDName[row],playerIDNumber[row]))
except:
#print(str("Minor leaguer: " + batterEntry[0]) + " was omitted!")
statcastdb.commit()
statcastdb.execute("CREATE TABLE IF NOT EXISTS statcastNormalized AS SELECT * FROM statcast") # Clones database if the database doesn't already exist
if normalize: # Checking for flag
# Attributes to normalize
normalizeAttributes = ['ax', 'ay', 'az', 'hc_x', 'hc_y', 'launch_angle', 'launch_speed', 'pfx_x', 'pfx_z',
'hit_distance_sc', 'vx0', 'vy0', 'vz0', 'release_pos_x', 'release_pos_y', 'release_pos_z']
for attribute in tqdm(normalizeAttributes, desc = 'Attributes normalized: ', leave = True): # For each attribute
attributeData = statcastdb.execute("SELECT `index`," + attribute + " FROM statcastNormalized").fetchall() # Get list of entries for attribute
attributeDataComplete = statcastdb.execute("SELECT " + attribute + " FROM statcastNormalized WHERE " + attribute + " IS NOT NULL").fetchall() # Get list of entries for attribute that aren't null
attributeDataComplete = list(sum(attributeDataComplete, ())) # Filter list such that each tuple entry is one string entry
median = statistics.median(attributeDataComplete) # Get median value for attribute
mean = 0 # Init mean
for entry in attributeData: # Calculating total value for attribute
if entry[1] != None:
mean += entry[1]
else: # Impute
mean += median
mean /= len(attributeData) # Divide by total to get mean
squaredDifferences = 0 # Init squared differences
missingSquaredDifference = (median - mean) ** 2 # Calculate squared difference since it's the same for all missing values
for entry in attributeData: # Calculating standard deviation
if entry[1] != None: # Calculate squared difference if value is not missing
squaredDifferences += (entry[1] - mean) ** 2 # (value - mean)^2
else: #impute
squaredDifferences += missingSquaredDifference
standardDeviation = math.sqrt(squaredDifferences/len(attributeData)) # Take sqrt of entire thing
missingZScore = (median - mean) / standardDeviation # the Z score for missing values is always the same, so calculate beforehand
for entry in tqdm(attributeData, desc = 'Normalizing entries: ', leave = True): # Calculate and set Z score for each entry
if entry[1] != None: # Calculate Z score if value isn't missing
zScore = (entry[1] - mean) / standardDeviation
else:
zScore = missingSquaredDifference
#print(attribute + " | " + str(zScore) + " | " + str(entry[0]))
statcastdb.execute("UPDATE statcastNormalized SET " + attribute + "=? WHERE `index`=?",(zScore,entry[0])) # Replace value for attribute with the Z-score
statcastdb.commit()