-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpostgeo.py
222 lines (193 loc) · 10.1 KB
/
postgeo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python
"""
This thing assumes you're throwing it at a CSV that ends in .csv or .CSV or similar.
Your full address field -- "1600 Pennsylvania Ave. NW, Washington, D.C., USA" -- MUST be the final column.
"""
from __future__ import print_function
import argparse
import sys
import csv
import os
import time
from geopy.geocoders import GoogleV3
from geopy.exc import GeocoderTimedOut
import creds
# Load the Google geocoding API key from creds.py. The sample creds.py ships
# with a placeholder value, which is treated the same as a missing key.
try:
    GoogleAPIkey = creds.setup['GoogleAPIkey']
    if GoogleAPIkey == "GetYourKeyUsingTheDirectionsAbove":
        raise KeyError
except KeyError:
    print("You need to configure your Google geocoding API key in creds.py. Instructions are there.")
    print("It's not as scary as it sounds, if you've never done this before.")
    # Exit non-zero so shells and calling scripts can tell that setup failed.
    sys.exit(1)

# Load the location of the on-disk geocode cache from creds.py.
try:
    geocachepath = creds.setup['geocachepath']
except KeyError:
    print("You need to figure a setup['geocachepath'] within creds.py to meet the new file format.")
    print("Check the sample on Github.com/PalmBeachPost/postgeo to figure out how to update your version.")
    sys.exit(1)

# Shared geocoder client; each request is given 10 seconds before timing out.
geolocator = GoogleV3(api_key=GoogleAPIkey, timeout=10)

# Seconds to pause between geocoding queries. The command-line entry point
# replaces this with the value of the -t option.
timedelay = 0
def timedisplay(timediff):
    """Format a duration given in seconds as ``H:MM:SS``.

    Fractional seconds are truncated. A negative duration is rendered as
    the magnitude with a leading minus sign (for example ``-0:00:01``)
    instead of the misleading ``-1:59:59`` that floor division would give.

    Args:
        timediff: Duration in seconds, as an int or float.

    Returns:
        The formatted duration string. Hours are not zero-padded and are
        not capped at 24.
    """
    total_seconds = int(abs(timediff))
    # Only show a sign when something non-zero remains after truncation, so
    # that -0.4 seconds prints as "0:00:00" rather than "-0:00:00".
    sign = "-" if (timediff < 0 and total_seconds) else ""
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return "%s%d:%02d:%02d" % (sign, hours, minutes, seconds)
def _open_csv(path, mode):
    """Open ``path`` for use with the ``csv`` module on Python 2 or Python 3.

    Args:
        path: File to open.
        mode: One of "r", "w" or "a".

    Returns:
        An open file object suitable for ``csv.reader`` / ``csv.writer``.
    """
    if sys.version_info[0] < 3:
        # Python 2's csv module expects binary handles when writing, and
        # "rU" gives universal-newline handling when reading.
        return open(path, {"r": "rU", "w": "wb", "a": "ab"}[mode])
    # Python 3's csv module expects text handles opened with newline="".
    return open(path, mode, newline="")


def _write_and_sync(writer, filehandle, row):
    """Write one CSV row and push it to disk immediately.

    Geocoding is slow enough that syncing per row costs nothing noticeable,
    and it keeps finished work on disk if the run is interrupted.
    """
    writer.writerow(row)
    filehandle.flush()
    os.fsync(filehandle.fileno())


def _load_geocache(geocache):
    """Create the cache file from ``geocache`` or merge the file into it.

    If the file at ``geocachepath`` does not exist, it is created with a
    header row plus every entry currently in ``geocache``. Otherwise its
    rows are read into ``geocache``; addresses already present in
    ``geocache`` are not overwritten.
    """
    print("Using " + geocachepath + " file to speed up results.")
    if not os.path.isfile(geocachepath):  # We need to begin a new cache file
        with _open_csv(geocachepath, "w") as cachefilehandle:
            cacheput = csv.writer(cachefilehandle)
            cacheput.writerow(['fulladdy', 'lat', 'long', 'accuracy', 'latlong'])
            for fulladdy in geocache:
                cacheput.writerow([fulladdy] + list(geocache[fulladdy]))
        return
    with _open_csv(geocachepath, "r") as cachefilehandle:
        rows = csv.reader(cachefilehandle)
        next(rows, None)  # Skip header row; tolerate a completely empty file
        for row in rows:
            # Blank or short rows are skipped; first occurrence of a key wins.
            if len(row) > 4 and row[0] not in geocache:
                geocache[row[0]] = (row[1], row[2], row[3], row[4])


def _geocode_with_retry(fulladdy):
    """Geocode one address, retrying once after a five-second pause.

    Apostrophes are stripped from the address before it is sent. If the
    first attempt raises, one more attempt is made; an exception from that
    second attempt propagates to the caller.
    """
    query = fulladdy.replace("'", "")
    try:
        return geolocator.geocode(query)
    except Exception:  # Deliberately broad, but lets Ctrl-C through
        print("GeocoderServiceError (or some other error) occured. Sleeping five seconds to try again.")
        time.sleep(5)
        return geolocator.geocode(query)


def main(geocacheflag):
    """Geocode every row of the input CSV into a sibling ``-geo`` CSV.

    The input file name is read from the module-level ``args.filename``. The
    last column of each row is treated as the full address. Rows that are
    found in the cache or geocoded successfully are written to the output
    file with four extra columns (lat, long, accuracy, latlong). Rows with
    no address, rows the geocoder cannot resolve, and rows that time out are
    reported on stdout and left out of the output.

    If the output file already exists the user is asked (through the
    module-level ``get_input``) whether to overwrite it; any answer other
    than "y" exits the program.

    Args:
        geocacheflag: 1 to read from and append to the cache file at
            ``geocachepath``; any other value disables the file cache.
    """
    # Format: full address as key, value as a tuple of lat, long, accuracy, lat-long
    geocache = {"1600 Pennsylvania Ave. NW, Washington, D.C. 20500":
                ("38.897667", "-77.036482", "Rooftop", "38.897667, -77.036482")}
    inputfilename = args.filename
    rowsprocessed = 0
    lastpercentageprocessed = 0
    # Wall-clock time: the job is dominated by waiting on the network, so CPU
    # time would badly understate elapsed time and the ETA.
    starttime = time.time()

    # Use the base of the original filename, append "-geo", keep the extension.
    basename, extension = os.path.splitext(inputfilename)
    outputfilename = basename + "-geo" + extension
    if os.path.isfile(outputfilename):
        message = "File {} exists, proceeding will overwrite(y or n)? "
        proceed_prompt = get_input(message.format(outputfilename))
        if proceed_prompt.lower() != 'y':
            print('Aborting . . .')
            sys.exit()

    cachefilehandle = None
    cacheput = None
    if geocacheflag == 1:
        _load_geocache(geocache)
        # Keep the cache file open in append mode for the rest of the run.
        cachefilehandle = _open_csv(geocachepath, "a")
        cacheput = csv.writer(cachefilehandle)

    try:
        # First pass over the source data: count the rows for progress output.
        with _open_csv(inputfilename, "r") as inputfilehandle:
            totalrows = sum(1 for _ in csv.reader(inputfilehandle))
        # The header row is not geocoded, so leave it out of the total.
        totalrows = max(totalrows - 1, 0)
        print(str(totalrows) + " rows to be processed.")

        with _open_csv(outputfilename, "w") as outputfile:
            put = csv.writer(outputfile)
            with _open_csv(inputfilename, "r") as inputfilehandle:
                rows = csv.reader(inputfilehandle)
                headers = next(rows, None)
                if headers is None:
                    print("No rows found in " + inputfilename)
                    return
                headers.extend(["lat", "long", "accuracy", "latlong"])
                put.writerow(headers)

                for row in rows:
                    rowsprocessed += 1
                    # Last column of the file; a blank CSV line yields an empty row.
                    fulladdy = row[-1] if row else ""

                    if fulladdy in geocache:
                        print("\tFound in cache: " + fulladdy)
                        row.extend(geocache[fulladdy])
                        _write_and_sync(put, outputfile, row)
                        continue

                    if not fulladdy:
                        print("Dropping row: No address listed in this row: " + str(row))
                        continue

                    try:
                        location = _geocode_with_retry(fulladdy)
                    except GeocoderTimedOut:
                        print("Geocoder service timed out on this row: " + str(row))
                        print("You should probably re-run this on the next pass.")
                        time.sleep(timedelay)
                        continue

                    if location is None:  # The geocoder had no match
                        print("Dropping row: Something went wrong on " + fulladdy)
                        time.sleep(timedelay)
                        continue

                    mylat = str(location.latitude)
                    mylong = str(location.longitude)
                    mylatlong = mylat + ", " + mylong
                    myaccuracy = location.raw["geometry"]["location_type"]
                    newstuff = [mylat, mylong, myaccuracy, mylatlong]
                    row.extend(newstuff)
                    geocache[fulladdy] = tuple(newstuff)

                    # Report progress each time the whole-number percentage advances.
                    percentageprocessed = int(100 * rowsprocessed / totalrows)
                    if percentageprocessed > lastpercentageprocessed:
                        lastpercentageprocessed = percentageprocessed
                        timediff = time.time() - starttime
                        remaining = (timediff / rowsprocessed) * (totalrows - rowsprocessed)
                        print(str(percentageprocessed) + "% processed in " + timedisplay(timediff) + ". ETA: " + timedisplay(remaining) + ".")

                    _write_and_sync(put, outputfile, row)
                    print("Found: " + fulladdy)
                    if geocacheflag == 1:
                        _write_and_sync(cacheput, cachefilehandle, [fulladdy] + newstuff)
                    time.sleep(timedelay)  # Necessary to avoid getting shut out
    finally:
        # Close the cache file even if the run is interrupted or fails.
        if cachefilehandle is not None:
            cachefilehandle.flush()
            cachefilehandle.close()
# Command-line entry point: parse options, pick the cache mode, validate the
# input file name, then hand off to main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="postgeo.py is a command-line geocoder tool.")
    parser.add_argument('filename', metavar='filename', help='CSV file containing addresses to be geocoded')
    parser.add_argument('-c', help='Use geocache.csv file to speed up coding and recoding. Now the default.', action="store_true")
    parser.add_argument('-n', help='Do NOT use geocache.csv file to speed up geocoding and recoding.', action="store_true")
    parser.add_argument('-t', type=float, nargs=1, default=[0.0], action="store", help='Enter a delay between queries measured in seconds, such as 1 or 0.5.')
    try:
        args = parser.parse_args()
    except SystemExit as parse_exit:
        # argparse exits through SystemExit. For -h/--help the help text has
        # already been printed and the status is 0, so let that through.
        if parse_exit.code in (0, None):
            raise
        # For a real parsing error, show the full help and exit non-zero.
        parser.print_help()
        sys.exit(1)

    # Python 2's input() evaluates what is typed; raw_input() is the safe
    # equivalent of Python 3's input().
    if sys.version_info[0] < 3:
        get_input = raw_input
    else:
        get_input = input

    timedelay = args.t[0]

    # The cache is on by default; -n turns it off unless -c is also given.
    if args.n and not args.c:
        geocacheflag = 0
    else:
        geocacheflag = 1
    if geocacheflag == 1:
        print("Speeding up results with cached file " + geocachepath)

    if not args.filename.lower().endswith('.csv'):
        print("File must be of type CSV and end with .csv extension")
        sys.exit(1)
    if not os.path.isfile(args.filename):
        print("File " + args.filename + " not found.")
        sys.exit(1)
    print("Beginning to process " + args.filename)
    main(geocacheflag)