#!/usr/bin/python
"""
Load the initial price data that Sam extracted with DeepDive around 12/25/2014,
merge in the census data Sam provided on 1/8/2015, and build a data set of only
the ads that list two prices ("doubles"), from which an implied fixed cost can
be computed.
"""
import datetime
import json

import numpy as np
import pandas
from sklearn import linear_model  # only used by the commented-out regression at the bottom
def census_lookup(geo_id, table_value, verbose=False):
    """
    Look up one ACS estimate for a geography in the module-level `combined` dict.

    table_value is like 'B01001001', where 'B01001' is the table id
    and '001' is the value id within that table.
    """
    try:
        table = table_value[0:6]
        value = combined['data'][geo_id][table]['estimate'][table_value]
        if verbose:
            print('Info for: %s' % combined['geography'][geo_id]['name'])
            print('Found value: %s' % value)
        return value
    except (KeyError, TypeError):  # geography or table missing from the census extract
        return np.nan
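# Minimal usage sketch (the geo id below is hypothetical; real ids come from
# the rates file's 'geo_id' column, and `combined` must already be loaded):
#   census_lookup('05000US06075', 'B19013001')  # -> median household income, or NaN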
#data = pandas.read_csv('rates_locs.csv')
data = pandas.read_csv('rates_pop.csv', sep='\t')

# Parse 'time_str' values like '30 MINS' or '1 HOUR' into a unit and a count,
# dropping unusable 'DURATION' rows and normalizing everything to minutes.
data['unit'] = data['time_str'].apply(lambda x: x.split(' ')[1])
data = data[data['unit'] != 'DURATION']
data['timeValue'] = data['time_str'].apply(lambda x: x.split(' ')[0])
data.loc[data['unit'] == 'HOURS', 'unit'] = 'HOUR'
data['minutes'] = np.nan
data.loc[data['unit'] == 'MINS', 'minutes'] = data.loc[data['unit'] == 'MINS', 'timeValue'].astype(int)
data.loc[data['unit'] == 'HOUR', 'minutes'] = 60 * data.loc[data['unit'] == 'HOUR', 'timeValue'].astype(int)
data['price_per_hour'] = 60 * data['price'] / data['minutes']
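# E.g. a (hypothetical) row priced at 80 for '30 MINS' gets minutes = 30 and
# price_per_hour = 60 * 80 / 30 = 160.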
# The commented-out lines below deal with the 'price' string problem in
# Sam's rates_locs file from 12/29 (stripping '$', 'roses', etc.).
#data['price'] = data['price'].apply(lambda x: x.replace('$',''))
#data['price'] = data['price'].apply(lambda x: x.replace('roses',''))
#data['price'] = data['price'].apply(lambda x: x.replace('rose',''))
#data['price'] = data['price'].apply(lambda x: x.replace('kisses',''))
#data['price'] = data['price'].apply(lambda x: x.replace('dollars',''))
#data = data[data['price'].apply(lambda x: 'euro' not in x)]
#data['price'] = data['price'].astype(int)
# Begin merging information from census
with open('census.json', 'r') as f:
    combined = json.load(f)
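# Expected shape of census.json, inferred from the keys census_lookup reads:
# {
#   "geography": {"<geo_id>": {"name": "..."}},
#   "data": {"<geo_id>": {"B15003": {"estimate": {"B15003001": <number>, ...}}, ...}}
# }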
tables_list = [
"B15003001", # Total population
"B23025005", # Unemployed
"B23025002", # In Labor Force
#"B19013_001", # Median Income
"B02001001", # Total Population
"B02001002", # Total White Population
"B02001003", # Total Black Population
"B15003002", # No schooling completed
"B15003003", # Nursery school
"B15003004", # Kindergarten
"B15003005", # 1st Grade
"B15003006", # 2nd Grade
"B15003007", # 3rd Grade
"B15003008", # 4th Grade
"B15003009", # 5th Grade
"B15003010", # 6th Grade
"B15003011", # 7th Grade
"B15003012", # 8th Grade
"B15003013", # 9th Grade
"B15003014", # 10th Grade
"B15003015", # 11th Grade
"B15003016", # 12th Grade, no diploma
"B15003017", # High School Diploma
"B15003018", # GED or alternative diploma
"B15003019", # Some college < 1 year
"B15003020", # Some college, 1+ years
"B15003021", # Associates degree
"B15003022", # Bachelors degree
"B15003023", # Masters degree
"B15003023", # Professional degree
"B15003024", # Doctoral degree
"B19013001", # Household Median Income
"B08303001", # Total Commuters
"B08303002", # Less than 5 Minutes
"B08303003", # 5 to 9 minutes
"B08303004", # 10 to 14 minutes
"B08303005", # 15 to 19 minutes
"B08303006", # 20 to 24 minutes
"B08303007", # 25 to 29 minutes
"B08303008", # 30 to 34 minutes
"B08303009", # 35 to 39 minutes
"B08303010", # 40 to 44 minutes
"B08303011", # 45 to 59 minutes
"B08303012", # 60 to 89 minutes
"B08303013", # 90 or more minutes
]
hs_educations = [
"B15003017", # High School Diploma
"B15003018", # GED or alternative diploma
]
some_college_educations = [
"B15003019", # Some college < 1 year
"B15003020", # Some college, 1+ years
"B15003021", # Associates degree
]
college_plus_educations = [
"B15003022", # Bachelors degree
"B15003023", # Masters degree
"B15003023", # Professional degree
"B15003024", # Doctoral degree
]
less_than_hs_educations = [
"B15003002", # No schooling completed
"B15003003", # Nursery school
"B15003004", # Kindergarten
"B15003005", # 1st Grade
"B15003006", # 2nd Grade
"B15003007", # 3rd Grade
"B15003008", # 4th Grade
"B15003009", # 5th Grade
"B15003010", # 6th Grade
"B15003011", # 7th Grade
"B15003012", # 8th Grade
"B15003013", # 9th Grade
"B15003014", # 10th Grade
"B15003015", # 11th Grade
"B15003016", # 12th Grade, no diploma
]
commute_times = [
"B08303002", # Less than 5 Minutes
"B08303003", # 5 to 9 minutes
"B08303004", # 10 to 14 minutes
"B08303005", # 15 to 19 minutes
"B08303006", # 20 to 24 minutes
"B08303007", # 25 to 29 minutes
"B08303008", # 30 to 34 minutes
"B08303009", # 35 to 39 minutes
"B08303010", # 40 to 44 minutes
"B08303011", # 45 to 59 minutes
"B08303012", # 60 to 89 minutes
"B08303013", # 90 or more minutes
]
commute_centers = [
2.5, #"B08303002", # Less than 5 Minutes
7.5, #"B08303003", # 5 to 9 minutes
12.5, #"B08303004", # 10 to 14 minutes
17.5, #"B08303005", # 15 to 19 minutes
22.5, #"B08303006", # 20 to 24 minutes
27.5, #"B08303007", # 25 to 29 minutes
32.5, #"B08303008", # 30 to 34 minutes
37.5, #"B08303009", # 35 to 39 minutes
42.5, #"B08303010", # 40 to 44 minutes
52.5, #"B08303011", # 45 to 59 minutes
72.5, #"B08303012", # 60 to 89 minutes
100, #"B08303013", # 90 or more minutes
]
# One row per unique geography; look up each census value once per geo_id,
# then merge the resulting columns back onto the ad-level rows.
geo_ids = pandas.DataFrame(data.geo_id.unique())
for i in tables_list:
    geo_ids[i] = geo_ids[0].apply(lambda g: census_lookup(g, i))
data = pandas.merge(data, geo_ids, left_on='geo_id', right_on=0)
data['population'] = data["B15003001"] # Population 25 years and over
data['unemployment'] = data["B23025005"]/data["B23025002"] # unemployed / in labor force
data['lt_highschool'] = data[less_than_hs_educations].sum(axis=1)/data['population']
data['highschool'] = data[hs_educations].sum(axis=1)/data['population']
data['some_college'] = data[some_college_educations].sum(axis=1)/data['population']
data['college_plus'] = data[college_plus_educations].sum(axis=1)/data['population']
data['median_income'] = data['B19013001']
data['frac_white'] = data["B02001002"]/data["B02001001"]
data['avg_commute'] = data[commute_times].dot(commute_centers)/data["B08303001"]
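# avg_commute is a weighted average of commute-bin midpoints. E.g. a
# (hypothetical) geography with 100 commuters, 60 in '5 to 9 minutes' and
# 40 in '10 to 14 minutes', gets (60*7.5 + 40*12.5) / 100 = 9.5 minutes.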
# End merging information from census
# Count how many price quotes each ad has.
counts = pandas.DataFrame(data.groupby('ad_id')['ad_id'].count())
counts.rename(columns={'ad_id': 'counts'}, inplace=True)
out = pandas.merge(data, counts, left_on='ad_id', right_index=True)
doubles = out.copy()
# For each ad, attach the lowest/highest price (p1, p2) and the
# shortest/longest duration (m1, m2).
calcs = doubles.groupby('ad_id').agg({'price': ['min', 'max'], 'minutes': ['min', 'max']})
doubles = pandas.merge(doubles, pandas.DataFrame(calcs['price']['min']), left_on='ad_id', right_index=True)
doubles.rename(columns={'min': 'p1'}, inplace=True)
doubles = pandas.merge(doubles, pandas.DataFrame(calcs['price']['max']), left_on='ad_id', right_index=True)
doubles.rename(columns={'max': 'p2'}, inplace=True)
doubles = pandas.merge(doubles, pandas.DataFrame(calcs['minutes']['min']), left_on='ad_id', right_index=True)
doubles.rename(columns={'min': 'm1'}, inplace=True)
doubles = pandas.merge(doubles, pandas.DataFrame(calcs['minutes']['max']), left_on='ad_id', right_index=True)
doubles.rename(columns={'max': 'm2'}, inplace=True)
# Drop duplicated ads and ads quoting two prices for the same duration, then
# solve p = f + marginal_price * m for the two (m, p) quotes: the implied
# fixed cost f is the intercept, marginal_price the per-minute slope.
doubles = doubles[~doubles['ad_id'].duplicated()]
doubles = doubles[doubles['m1'] != doubles['m2']] # avoids dividing by zero below
doubles['f'] = (doubles['p1'] * doubles['m2'] - doubles['m1'] * doubles['p2']) / (doubles['m2'] - doubles['m1'])
doubles['marginal_price'] = (doubles['p2'] - doubles['p1']) / (doubles['m2'] - doubles['m1'])
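# Worked example with hypothetical quotes: p1 = 100 at m1 = 30 and p2 = 160 at
# m2 = 60 give marginal_price = (160 - 100) / (60 - 30) = 2 per minute and
# f = (100*60 - 30*160) / (60 - 30) = 40, i.e. price = 40 + 2 * minutes.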
doubles.to_csv('intercept.csv', index=False)
out.reset_index(drop=True, inplace=True)
out.to_csv('prices_%s.csv' % datetime.datetime.now().strftime('%Y%m%d'), index=False)
# Exploratory pooled regression of price on minutes, kept for reference:
#reg = linear_model.LinearRegression()
#reg.fit(X=data.minutes.values[:, np.newaxis], y=data.price.values[:, np.newaxis])
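# A minimal runnable sketch of that regression, assuming rows with NaN minutes
# are dropped first (the intercept then estimates an average fixed cost and
# the slope an average per-minute price pooled across all ads):
#   fit_df = data[['minutes', 'price']].dropna()
#   reg = linear_model.LinearRegression().fit(fit_df[['minutes']], fit_df['price'])
#   print(reg.intercept_, reg.coef_[0])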