-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgreat_circle_tax_migration_long_lat.R
304 lines (238 loc) · 11.2 KB
/
great_circle_tax_migration_long_lat.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
## Great Circles Tax Migration Long Lat
## to match logic with Nathan Yau, FlowingData
## http://flowingdata.com/2011/05/11/how-to-map-connections-with-great-circles/
library(maps)
library(geosphere)
## POSTGIS 2.0 Notes on preparing the centroids for use in R.
## map data from http://www.census.gov/geo/www/cob/co2000.html
## co99_d00 is a multipolygon and needs to be dumped to polygon
## and centroids taken on those polygons to avoid massive headaches.
## My two step approach, first dump to polygon:
##
## create table us_cnty_poly as
## select
## co99_d00.state,
## co99_d00.county,
## co99_d00.fips,
## (ST_Dump(co99_d00.geom)).geom as geom
## from co99_d00
## group by state,
## co99_d00.county,
## co99_d00.fips,
## co99_d00.geom
## order by co99_d00.state,
## co99_d00.county;
##
## create index idx_us_cnty_poly_gist on us_cnty_poly using gist(geom);
## select populate_geometry_columns('us_cnty_poly'::regclass);
## (can use pgsql2shp and use these to plot county outlines if it doesn't
## make the map look too busy aesthetically)
##
## (next centroids from the polygons)
## create table us_cnty_centroid
## as select state, county, fips,
## ST_Centroid(geom) as centroid,
## ST_AsText(ST_Centroid(geom)) as point_wkt,
## ST_AsEWKB(ST_Centroid(geom)) as cent_b,
## ST_y(ST_Centroid(geom)) as lat,
## ST_x(ST_Centroid(geom)) as long
## from us_cnty_poly;
##
## create index idx_us_cnty_centroid_gist on us_cnty_centroid using gist(centroid);
## select populate_geometry_columns('us_cnty_centroid'::regclass);
## gcIntermediate can consume different types of geometries and I want to play around
## hence the many different geometries in the centroid table
##
## Ship it off to use with R using pgsql2shp from command line
## -- remember to refresh your tables in pgAdmin prior to shipping or
## -- pgsql2shp will just ship a .dbf which won't help maptools
## you'll know you didn't refresh if you see this:
## Done (postgis major version: 2).
## Output shape: Unknown <- this tells you didn't refresh
## Dumping: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX [3118 rows].
##
## rather than this
## Initializing...
## Done (postgis major version: 2).
## Output shape: Point <- ah, points, lovely points
## Dumping: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX [3118 rows].
##
## ship it off
##
##C:\Documents and Settings\Owner>pgsql2shp -f c:/gisdata/us_cnty_centroid -h localhost -p 5434 -u postgres geocoder us_cn
## ty_centroid
## Initializing...
## Done (postgis major version: 2).
## Output shape: Point
## Dumping: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX [3489 rows].
##
## and ship off us_cnty_poly as well
## and now we can play with R
## read in Counties - note colClass that preserves leading 0's in fips
## this is from POSTGIS of - had to fix several 02(state) longitude that were
## missing '-' ie. -178.xxxxxxxxx was 178.xxxxxxxx that snarled antipodal
## comes up further down - fixed in Notepad** on visual inspection
## have to come up with a way to change programmatically.
##
require(maptools)
county_centroid <- readShapeSpatial("c:/gisdata/us_cnty_centroid.shp" )
names(county_centroid)
names(county_centroid) <- c("state", "county", "fips", "point_wkt", "cent_b","lat", "long") ##lower case
##see what we've got so far
str(county_centroid)
## address those factors
## county_centroid$state <- as.character(county_centroid$state)
## county_centroid$county <- as.character(county_centroid$county)
## county_centroid$fips <- as.character(county_centroid$fips)
## county_centroid$point_wkt <- as.character(county_centroid$point_wkt)
## county_centroid$centroid_b <-as.character(county_centroid$centroid_b)
## county_centroid$cent_ewkt <- as.character(county_centroid$cent_ewkt)
## put proj$string
proj4string(county_centroid) <- CRS(as.character("+proj=longlat +ellps=GRS80"))
proj = proj4string(county_centroid) ## just storing this for later
## we're matching on fips so check number of characters
min(nchar(as.character(county_centroid$fips))) ## [1] 5
## read the inflow data 2009_2010
## I took out the file header with Notepad++ and examined the file
## for obvious errors like embedded "
## source - http://www.irs.gov/pub/irs-soi/countyinflow0910.csv
## except for " scrubbing as it comes from IRS
inflow0910 = read.csv("c:/gisdata/census/migration/countyinflow0910.csv",
sep=",",
stringsAsFactors=FALSE,
as.is=TRUE,
colClass=c(rep("character",6),rep("numeric",3)),
col.names=c("st_code_dest","co_code_dest","st_code_orig","co_code_orig",
"st_abbv","co_name","return_num","exmpt_num","aggr_agi")
)
## since we're going to match this data with census maps by fips
## codes - check the IRS fips are giving valid fips code, five characters -
## leading two characters - state code, last three characters - county
require(stringr)
## actually want nchar() for these tests
max(nchar(inflow0910$st_code_dest))## this comes up 2
min(nchar(inflow0910$st_code_dest))## this comes up 1
min(nchar(inflow0910$co_code_orig))## this comes up 1
min(nchar(inflow0910$co_code_dest))## this comes up 1
## so the IRS files strip out leading 0's so we put them back in -
## from names(inflow0910)
## [1] "st_code_dest" "co_code_dest" "st_code_orig" "co_code_orig"
## we have four codes that need left padding with 0's
inflow0910$st_code_dest <- str_pad(c(inflow0910$st_code_dest),
2, side="left", pad="0")
min(nchar(inflow0910$st_code_dest)) #2
## now fix the other three
inflow0910$st_code_orig <- str_pad(c(inflow0910$st_code_orig),
2, side="left", pad="0")
inflow0910$co_code_dest <- str_pad(c(inflow0910$co_code_dest),
3, side="left", pad="0")
inflow0910$co_code_orig <- str_pad(c(inflow0910$co_code_orig),
3, side="left", pad="0")
nrow(inflow0910)
## we've still got something like 30,000 summary records
## that won't match against county centroids and cause
## mismatches in gcIntermediate so we dispatch them with some sqldf
## this will leave a county matches itself but we may find that useful
## this could be achieved with merge but I don't know how yet
require(sqldf)
## take out summary records
inflow0910 <- sqldf("SELECT st_code_dest, co_code_dest, st_code_orig, co_code_orig,
st_abbv, co_name, return_num, exmpt_num, aggr_agi FROM inflow0910 WHERE
st_code_dest NOT LIKE '00' and st_code_orig NOT LIKE '00' and st_code_dest NOT LIKE '01'
and st_code_orig NOT LIKE '01'and st_code_orig NOT LIKE '96' and st_code_orig NOT LIKE '97'
and st_code_orig NOT LIKE '98' and st_code_dest NOT LIKE '96' and st_code_dest NOT LIKE '97'
and st_code_dest NOT LIKE '98' and co_code_dest NOT LIKE '000' and st_abbv NOT LIKE 'DS'
and st_abbv NOT LIKE 'SS' and st_abbv NOT LIKE 'FR' and co_code_orig NOT LIKE '000'")
## compute fips_in_orig fips_in_dest
inflow0910$fips_in_dest <- paste(inflow0910$st_code_dest, inflow0910$co_code_dest, sep="")
inflow0910$fips_in_orig <- paste(inflow0910$st_code_orig, inflow0910$co_code_orig, sep="")
head(inflow0910_sql) ## take a look at it
tail(inflow0910_sql) ## look again
#
# read the outflow data 2009_10
## again took out header and cleaned out " around text in Notepad++
## which wasn't present in the inflow file
## source - http://www.irs.gov/pub/irs-soi/countyoutflow0910.csv
outflow0910 = read.csv("c:/gisdata/census/migration/countyoutflow0910.csv",
sep=",",
stringsAsFactors=FALSE,
as.is=TRUE,
colClass=c(rep("character",6),rep("numeric",3)),
col.names=c("st_code_orig", "co_code_orig", "st_code_dest", "co_code_dest",
"st_abbv", "co_name", "return_num", "exmpt_num", "aggr_agi")
)
## fix these up with padding as above
outflow0910$st_code_dest <- str_pad(c(outflow0910$st_code_dest),2, side="left", pad="0")
outflow0910$st_code_orig <- str_pad(c(outflow0910$st_code_orig),2, side="left", pad="0")
outflow0910$co_code_dest <- str_pad(c(outflow0910$co_code_dest),3, side="left", pad="0")
outflow0910$co_code_orig <- str_pad(c(outflow0910$co_code_orig),3, side="left", pad="0")
## take out summary records
outflow0910 <- sqldf("SELECT st_code_dest, co_code_dest, st_code_orig, co_code_orig,
st_abbv, co_name, return_num, exmpt_num, aggr_agi FROM outflow0910 WHERE
st_code_dest NOT LIKE '00' and st_code_orig NOT LIKE '00' and st_code_dest NOT LIKE '01'
and st_code_orig NOT LIKE '01'and st_code_orig NOT LIKE '96' and st_code_orig NOT LIKE '97'
and st_code_orig NOT LIKE '98' and st_code_dest NOT LIKE '96' and st_code_dest NOT LIKE '97'
and st_code_dest NOT LIKE '98' and co_code_dest NOT LIKE '000' and st_abbv NOT LIKE 'DS'
and st_abbv NOT LIKE 'SS' and st_abbv NOT LIKE 'FR' and co_code_orig NOT LIKE '000'")
head(outflow0910_sql)
tail(outflow0910_sql)
## compute outflow origin FIPS code
outflow0910$fips_out_orig <- paste(outflow0910$st_code_orig, outflow0910$co_code_orig, sep="")
## compute outflow destination FIPS
outflow0910$fips_out_dest <- paste(outflow0910$st_code_dest, outflow0910$co_code_dest, sep="")
## library(maptools)
## map data from http://www.census.gov/geo/www/cob/co2000.html
## state.map <- readShapeSpatial("c:/gisdata/st99_d00.shp")
## county.map <- readShapeSpatial("c:/gisdata/census/co99_d00.shp")
# Color
pal <- colorRampPalette(c("#f2f2f2", "green"))
colors <- pal(100)
pal2 <- colorRampPalette(c("pink", "red"))
colors2 <- pal2(100)
xlim <- c(-171.738281, -56.601563)
ylim <- c(12.039321, 71.856229)
## get rid of cases that aren't running all the way through
movein_0910 <- unique(inflow0910$st_abbv)
movein_0910 <- movein_0910[!grepl("TX$|IL$|MN$|CO$|NM$|AZ$|CA$|WA$|PR$|AK$|NV$|UT$", movein_0910)]
moveout_0910 <- unique(outflow_0910$st_abbv)
moveout_0910 <- moveout_0910[!grepl("TX$|IL$|MN$|CO$|NM$|AZ$|CA$|WA$|PR$|AK$|NV$|UT$", moveout_0910)]
for (i in 1:length(movein_0910)) {
map("world", col="#f2f2f2", fill=TRUE, bg="white", lwd=0.05, xlim=xlim, ylim=ylim)
m_insub <- inflow0910[inflow0910$st_abbv == "NY",] ##options "NY",] or movein_0910[1],]
m_insub <- m_insub[order(m_insub$return_num),]
max_return_num <- max(m_insub$return_num)
for (j in 1:(length)(m_insub$fips_in_orig)) {
message(j)
##for(k in 1:length) (inter)) {
##message(k)
movein_orig <- county_centroid[county_centroid$fips == (m_insub[j,]$fips_in_orig),]
movein_dest <- county_centroid[county_centroid$fips == (m_insub[j,]$fips_in_dest),]
inter <- gcIntermediate(c(movein_orig[1,]$long, movein_orig[1,]$lat), c(movein_dest[1,]$long,
movein_dest[1,]$lat),sp=TRUE, n=100, addStartEnd=TRUE)
lines_in <<- inter
colindex <- round((m_insub[j,]$return_num / (max_return_num/10000)) * length(colors))#per Paul Butler Facebook visualize
lines(inter, col=colors[colindex], lwd=0.6)
}
##}
dev.off()
}
(i in 1:length(moveout_0910)) {
m_outsub <- outflow0910[outflow0910$st_abbv == moveout_0910[i],] ##was "NY",]
m_outsub <- m_outsub[order(m_outsub$return_num),]
max_return_num <- max(m_outsub$return_num)
for (j in 1:length(m_outsub$fips_out_orig)) {
message(j)
moveout_orig <- county_centroid[county_centroid$fips == (m_outsub[j,]$fips_out_orig),]
moveout_dest <- county_centroid[county_centroid$fips == (m_outsub[j,]$fips_out_dest),]
inter <- gcIntermediate(c(moveout_orig[1,]$long, moveout_orig[1,]$lat), c(moveout_dest[1,]$long,
moveout_dest[1,]$lat), sp=TRUE, n=100, addStartEnd=TRUE)
lines_out <<- inter
colindex2 <- round( (m_outsub[j,]$return_num /( max_return_num/10000)) * length(colors2))
##for k in 1:length(inter){
##lines_out <<- inter[k,]$
lines(inter, col=colors2[colindex2], lwd=0.6)
}
dev.off()
}
require(raster)
require(gdistance)