forked from vanatteveldt/learningr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3_organizing.html
528 lines (414 loc) · 28.7 KB
/
3_organizing.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
<!DOCTYPE html>
<!-- saved from url=(0014)about:internet -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta http-equiv="x-ua-compatible" content="IE=9" >
<title>Organizing data in R</title>
<style type="text/css">
/* Page stylesheet for the generated report: screen rules first, then print-only overrides. */

/* Base text settings shared by the page body and table cells. */
body, td {
font-family: sans-serif;
background-color: white;
font-size: 12px;
margin: 8px;
}
/* Monospace font stack for inline code and code blocks. */
tt, code, pre {
font-family: 'DejaVu Sans Mono', 'Droid Sans Mono', 'Lucida Console', Consolas, Monaco, monospace;
}
/* Heading sizes step down from 2.2em (h1) to 0.8em (h6). */
h1 {
font-size:2.2em;
}
h2 {
font-size:1.8em;
}
h3 {
font-size:1.4em;
}
h4 {
font-size:1.0em;
}
h5 {
font-size:0.9em;
}
h6 {
font-size:0.8em;
}
/* Visited links are shown in purple. */
a:visited {
color: rgb(50%, 0%, 50%);
}
/* Code/output boxes: thin border, long lines wrap instead of overflowing. */
pre {
margin-top: 0;
max-width: 95%;
border: 1px solid #ccc;
white-space: pre-wrap;
}
pre code {
display: block; padding: 0.5em;
}
/* Only blocks tagged as R or C++ source get the grey background; untagged output blocks do not. */
code.r, code.cpp {
background-color: #F8F8F8;
}
table, td, th {
border: none;
}
/* Block quotes: grey text with a light bar on the left edge. */
blockquote {
color:#666666;
margin:0;
padding-left: 1em;
border-left: 0.5em #EEE solid;
}
/* Horizontal rules are drawn as a thin dotted top border only. */
hr {
height: 0px;
border-bottom: none;
border-top-width: thin;
border-top-style: dotted;
border-top-color: #999999;
}
/* Print-only overrides. */
@media print {
/* Force black text on a transparent background and drop IE filters. */
* {
background: transparent !important;
color: black !important;
filter:none !important;
-ms-filter: none !important;
}
body {
font-size:12pt;
max-width:100%;
}
a, a:visited {
text-decoration: underline;
}
/* Each <hr> is hidden but forces a page break before it. */
hr {
visibility: hidden;
page-break-before: always;
}
/* Keep code blocks, quotes, table rows and images from being split across pages. */
pre, blockquote {
padding-right: 1em;
page-break-inside: avoid;
}
tr, img {
page-break-inside: avoid;
}
img {
max-width: 100% !important;
}
/* Mirrored page margins: the 20mm margin is on the right of left-hand pages and on the left of right-hand pages (presumably the binding side). */
@page :left {
margin: 15mm 20mm 15mm 10mm;
}
@page :right {
margin: 15mm 10mm 15mm 20mm;
}
/* Require at least 3 lines of a paragraph or heading at the bottom/top of a page. */
p, h2, h3 {
orphans: 3; widows: 3;
}
/* Do not break the page directly after a section heading. */
h2, h3 {
page-break-after: avoid;
}
}
</style>
<!-- Styles for R syntax highlighter -->
<style type="text/css">
/* Colours for the spans emitted by the syntax highlighter inside <pre> blocks.
   Each selector matches one of the class names (cN values) used by the R language
   definition in the script below: operator, paren, literal, number, comment,
   keyword, identifier and string. */
pre .operator,
pre .paren {
color: rgb(104, 118, 135)
}
pre .literal {
color: rgb(88, 72, 246)
}
pre .number {
color: rgb(0, 0, 205);
}
pre .comment {
color: rgb(76, 136, 107);
}
pre .keyword {
color: rgb(0, 0, 255);
}
/* Identifiers are plain black. */
pre .identifier {
color: rgb(0, 0, 0);
}
pre .string {
color: rgb(3, 106, 7);
}
</style>
<!-- R syntax highlighter -->
<script type="text/javascript">
// Minified syntax highlighter (highlight.js core) followed by the C++ and R
// language definitions. The core is kept minified; it is laid out one function
// per line so each part can carry a comment.
//
// Short property names used by the mode objects, as far as this code shows:
//   cN  class name of the emitted <span>       b / e  begin / end pattern (compiled to bR / eR)
//   i   "illegal" pattern (aborts the parse)   c      list of contained sub-modes ("self" = the mode itself)
//   k   keyword table                          r      relevance added to the result when the mode opens
//   dM  root mode of a language                sL     name of a sub-language to delegate to
//   rB/rE/eB/eE  flags controlling whether begin/end lexemes are re-scanned or excluded
var hljs=new function(){
// Escape text for insertion as HTML. The replacements must be the literal
// entities "&amp;" and "&lt;"; replacing "&" with "&" (or "<" with "<") is a
// no-op and lets raw markup through to innerHTML.
function m(p){return p.replace(/&/gm,"&amp;").replace(/</gm,"&lt;")}
// Build a RegExp from source q: always multiline, case-insensitive when the
// language r sets cI, global when p is truthy.
function f(r,q,p){return RegExp(q,"m"+(r.cI?"i":"")+(p?"g":""))}
// Return the CODE child of element r, looking only past leading whitespace text nodes.
function b(r){for(var p=0;p<r.childNodes.length;p++){var q=r.childNodes[p];if(q.nodeName=="CODE"){return q}if(!(q.nodeType==3&&q.nodeValue.match(/\s+/))){break}}}
// Collect the text of node t: BR elements become "\n", newlines inside text
// nodes are dropped when s is truthy, and "\r" is normalised for IE 6-8.
function h(t,s){var p="";for(var r=0;r<t.childNodes.length;r++){if(t.childNodes[r].nodeType==3){var q=t.childNodes[r].nodeValue;if(s){q=q.replace(/\n/g,"")}p+=q}else{if(t.childNodes[r].nodeName=="BR"){p+="\n"}else{p+=h(t.childNodes[r])}}}if(/MSIE [678]/.test(navigator.userAgent)){p=p.replace(/\r/g,"\n")}return p}
// Find a registered language among the class names of s and its parent
// (an optional "language-" prefix is stripped). Returns undefined if none matches.
function a(s){var r=s.className.split(/\s+/);r=r.concat(s.parentNode.className.split(/\s+/));for(var q=0;q<r.length;q++){var p=r[q].replace(/^language-/,"");if(e[p]){return p}}}
// Flatten the element children of q into a list of start/stop events, each
// tagged with its character offset in the text.
function c(q){var p=[];(function(s,t){for(var r=0;r<s.childNodes.length;r++){if(s.childNodes[r].nodeType==3){t+=s.childNodes[r].nodeValue.length}else{if(s.childNodes[r].nodeName=="BR"){t+=1}else{if(s.childNodes[r].nodeType==1){p.push({event:"start",offset:t,node:s.childNodes[r]});t=arguments.callee(s.childNodes[r],t);p.push({event:"stop",offset:t,node:s.childNodes[r]})}}}}return t})(q,0);return p}
// Merge two event streams (y, w) over the plain text x into one HTML string,
// closing and re-opening tags so that the result stays properly nested.
function k(y,w,x){var q=0;var z="";var s=[];function u(){if(y.length&&w.length){if(y[0].offset!=w[0].offset){return(y[0].offset<w[0].offset)?y:w}else{return w[0].event=="start"?y:w}}else{return y.length?y:w}}function t(D){var A="<"+D.nodeName.toLowerCase();for(var B=0;B<D.attributes.length;B++){var C=D.attributes[B];A+=" "+C.nodeName.toLowerCase();if(C.value!==undefined&&C.value!==false&&C.value!==null){A+='="'+m(C.value)+'"'}}return A+">"}while(y.length||w.length){var v=u().splice(0,1)[0];z+=m(x.substr(q,v.offset-q));q=v.offset;if(v.event=="start"){z+=t(v.node);s.push(v.node)}else{if(v.event=="stop"){var p,r=s.length;do{r--;p=s[r];z+=("</"+p.nodeName.toLowerCase()+">")}while(p!=v.node);s.splice(r,1);while(r<s.length){z+=t(s[r]);r++}}}}return z+m(x.substr(q))}
// Compile every registered language once: build the begin/end/illegal regexes,
// flatten keyword tables, and default a missing end pattern to "\\B|\\b".
function j(){function q(x,y,v){if(x.compiled){return}var u;var s=[];if(x.k){x.lR=f(y,x.l||hljs.IR,true);for(var w in x.k){if(!x.k.hasOwnProperty(w)){continue}if(x.k[w] instanceof Object){u=x.k[w]}else{u=x.k;w="keyword"}for(var r in u){if(!u.hasOwnProperty(r)){continue}x.k[r]=[w,u[r]];s.push(r)}}}if(!v){if(x.bWK){x.b="\\b("+s.join("|")+")\\s"}x.bR=f(y,x.b?x.b:"\\B|\\b");if(!x.e&&!x.eW){x.e="\\B|\\b"}if(x.e){x.eR=f(y,x.e)}}if(x.i){x.iR=f(y,x.i)}if(x.r===undefined){x.r=1}if(!x.c){x.c=[]}x.compiled=true;for(var t=0;t<x.c.length;t++){if(x.c[t]=="self"){x.c[t]=x}q(x.c[t],y,false)}if(x.starts){q(x.starts,y,false)}}for(var p in e){if(!e.hasOwnProperty(p)){continue}q(e[p].dM,e[p],true)}}
// highlight(B, C): highlight text C with language B. Returns
// {r: relevance, keyword_count, value: HTML}. If an illegal lexeme is met, or a
// mode is still open at the end of input, the escaped input is returned with zero scores.
function d(B,C){if(!j.called){j();j.called=true}function q(r,M){for(var L=0;L<M.c.length;L++){if((M.c[L].bR.exec(r)||[null])[0]==r){return M.c[L]}}}function v(L,r){if(D[L].e&&D[L].eR.test(r)){return 1}if(D[L].eW){var M=v(L-1,r);return M?M+1:0}return 0}function w(r,L){return L.i&&L.iR.test(r)}function K(N,O){var M=[];for(var L=0;L<N.c.length;L++){M.push(N.c[L].b)}var r=D.length-1;do{if(D[r].e){M.push(D[r].e)}r--}while(D[r+1].eW);if(N.i){M.push(N.i)}return f(O,M.join("|"),true)}function p(M,L){var N=D[D.length-1];if(!N.t){N.t=K(N,E)}N.t.lastIndex=L;var r=N.t.exec(M);return r?[M.substr(L,r.index-L),r[0],false]:[M.substr(L),"",true]}function z(N,r){var L=E.cI?r[0].toLowerCase():r[0];var M=N.k[L];if(M&&M instanceof Array){return M}return false}function F(L,P){L=m(L);if(!P.k){return L}var r="";var O=0;P.lR.lastIndex=0;var M=P.lR.exec(L);while(M){r+=L.substr(O,M.index-O);var N=z(P,M);if(N){x+=N[1];r+='<span class="'+N[0]+'">'+M[0]+"</span>"}else{r+=M[0]}O=P.lR.lastIndex;M=P.lR.exec(L)}return r+L.substr(O,L.length-O)}function J(L,M){if(M.sL&&e[M.sL]){var r=d(M.sL,L);x+=r.keyword_count;return r.value}else{return F(L,M)}}function I(M,r){var L=M.cN?'<span class="'+M.cN+'">':"";if(M.rB){y+=L;M.buffer=""}else{if(M.eB){y+=m(r)+L;M.buffer=""}else{y+=L;M.buffer=r}}D.push(M);A+=M.r}function G(N,M,Q){var R=D[D.length-1];if(Q){y+=J(R.buffer+N,R);return false}var P=q(M,R);if(P){y+=J(R.buffer+N,R);I(P,M);return P.rB}var L=v(D.length-1,M);if(L){var O=R.cN?"</span>":"";if(R.rE){y+=J(R.buffer+N,R)+O}else{if(R.eE){y+=J(R.buffer+N,R)+O+m(M)}else{y+=J(R.buffer+N+M,R)+O}}while(L>1){O=D[D.length-2].cN?"</span>":"";y+=O;L--;D.length--}var r=D[D.length-1];D.length--;D[D.length-1].buffer="";if(r.starts){I(r.starts,"")}return R.rE}if(w(M,R)){throw"Illegal"}}var E=e[B];var D=[E.dM];var A=0;var x=0;var y="";try{var s,u=0;E.dM.buffer="";do{s=p(C,u);var t=G(s[0],s[1],s[2]);u+=s[0].length;if(!t){u+=s[1].length}}while(!s[2]);if(D.length>1){throw"Illegal"}return{r:A,keyword_count:x,value:y}}catch(H){if(H=="Illegal"){return{r:0,keyword_count:0,value:m(C)}}else{throw H}}}
// highlightAuto(t): run every registered language over t and return the result
// with the highest keyword_count + r; the runner-up is attached as second_best.
function g(t){var p={keyword_count:0,r:0,value:m(t)};var r=p;for(var q in e){if(!e.hasOwnProperty(q)){continue}var s=d(q,t);s.language=q;if(s.keyword_count+s.r>r.keyword_count+r.r){r=s}if(s.keyword_count+s.r>p.keyword_count+p.r){r=p;p=s}}if(r.language){p.second_best=r}return p}
// fixMarkup(r, q, p): replace leading tabs with q (if given) and newlines with
// "<br>" (if p is truthy) in highlighted markup r.
function i(r,q,p){if(q){r=r.replace(/^((<[^>]+>|\t)+)/gm,function(t,w,v,u){return w.replace(/\t/g,q)})}if(p){r=r.replace(/\n/g,"<br>")}return r}
// highlightBlock(t, w, r): highlight DOM element t in place. Does nothing when
// no registered language is named in the element's (or its parent's) classes.
function n(t,w,r){var x=h(t,r);var v=a(t);var y,s;if(v){y=d(v,x)}else{return}var q=c(t);if(q.length){s=document.createElement("pre");s.innerHTML=y.value;y.value=k(q,c(s),x)}y.value=i(y.value,w,r);var u=t.className;if(!u.match("(\\s|^)(language-)?"+v+"(\\s|$)")){u=u?(u+" "+v):v}if(/MSIE [678]/.test(navigator.userAgent)&&t.tagName=="CODE"&&t.parentNode.tagName=="PRE"){s=t.parentNode;var p=document.createElement("div");p.innerHTML="<pre><code>"+y.value+"</code></pre>";t=p.firstChild.firstChild;p.firstChild.cN=s.cN;s.parentNode.replaceChild(p.firstChild,s)}else{t.innerHTML=y.value}t.className=u;t.result={language:v,kw:y.keyword_count,re:y.r};if(y.second_best){t.second_best={language:y.second_best.language,kw:y.second_best.keyword_count,re:y.second_best.r}}}
// initHighlighting(): highlight the code child of every <pre> on the page; runs at most once.
function o(){if(o.called){return}o.called=true;var r=document.getElementsByTagName("pre");for(var p=0;p<r.length;p++){var q=b(r[p]);if(q){n(q,hljs.tabReplace)}}}
// initHighlightingOnLoad(): schedule initHighlighting for when the document is
// ready. Without a global `window` (script loaded outside a browser) there is
// nothing to hook into, so return instead of throwing a ReferenceError.
function l(){if(typeof window==="undefined"){return}if(window.addEventListener){window.addEventListener("DOMContentLoaded",o,false);window.addEventListener("load",o,false)}else{if(window.attachEvent){window.attachEvent("onload",o)}else{window.onload=o}}}
// Public API, shared regex sources and reusable modes (strings, comments, numbers).
var e={};this.LANGUAGES=e;this.highlight=d;this.highlightAuto=g;this.fixMarkup=i;this.highlightBlock=n;this.initHighlighting=o;this.initHighlightingOnLoad=l;this.IR="[a-zA-Z][a-zA-Z0-9_]*";this.UIR="[a-zA-Z_][a-zA-Z0-9_]*";this.NR="\\b\\d+(\\.\\d+)?";this.CNR="\\b(0[xX][a-fA-F0-9]+|(\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)";this.BNR="\\b(0b[01]+)";this.RSR="!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|\\.|-|-=|/|/=|:|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~";this.ER="(?![\\s\\S])";this.BE={b:"\\\\.",r:0};this.ASM={cN:"string",b:"'",e:"'",i:"\\n",c:[this.BE],r:0};this.QSM={cN:"string",b:'"',e:'"',i:"\\n",c:[this.BE],r:0};this.CLCM={cN:"comment",b:"//",e:"$"};this.CBLCLM={cN:"comment",b:"/\\*",e:"\\*/"};this.HCM={cN:"comment",b:"#",e:"$"};this.NM={cN:"number",b:this.NR,r:0};this.CNM={cN:"number",b:this.CNR,r:0};this.BNM={cN:"number",b:this.BNR,r:0};this.inherit=function(r,s){var p={};for(var q in r){p[q]=r[q]}if(s){for(var q in s){p[q]=s[q]}}return p}
}();
// C++ language definition: keyword/built-in tables plus comment, string, number,
// preprocessor and STL-container modes. "</" is illegal, so markup-like input is rejected.
hljs.LANGUAGES.cpp=function(){var a={keyword:{"false":1,"int":1,"float":1,"while":1,"private":1,"char":1,"catch":1,"export":1,virtual:1,operator:2,sizeof:2,dynamic_cast:2,typedef:2,const_cast:2,"const":1,struct:1,"for":1,static_cast:2,union:1,namespace:1,unsigned:1,"long":1,"throw":1,"volatile":2,"static":1,"protected":1,bool:1,template:1,mutable:1,"if":1,"public":1,friend:2,"do":1,"return":1,"goto":1,auto:1,"void":2,"enum":1,"else":1,"break":1,"new":1,extern:1,using:1,"true":1,"class":1,asm:1,"case":1,typeid:1,"short":1,reinterpret_cast:2,"default":1,"double":1,register:1,explicit:1,signed:1,typename:1,"try":1,"this":1,"switch":1,"continue":1,wchar_t:1,inline:1,"delete":1,alignof:1,char16_t:1,char32_t:1,constexpr:1,decltype:1,noexcept:1,nullptr:1,static_assert:1,thread_local:1,restrict:1,_Bool:1,complex:1},built_in:{std:1,string:1,cin:1,cout:1,cerr:1,clog:1,stringstream:1,istringstream:1,ostringstream:1,auto_ptr:1,deque:1,list:1,queue:1,stack:1,vector:1,map:1,set:1,bitset:1,multiset:1,multimap:1,unordered_set:1,unordered_map:1,unordered_multiset:1,unordered_multimap:1,array:1,shared_ptr:1}};return{dM:{k:a,i:"</",c:[hljs.CLCM,hljs.CBLCLM,hljs.QSM,{cN:"string",b:"'\\\\?.",e:"'",i:"."},{cN:"number",b:"\\b(\\d+(\\.\\d*)?|\\.\\d+)(u|U|l|L|ul|UL|f|F)"},hljs.CNM,{cN:"preprocessor",b:"#",e:"$"},{cN:"stl_container",b:"\\b(deque|list|queue|stack|vector|map|set|bitset|multiset|multimap|unordered_map|unordered_set|unordered_multiset|unordered_multimap|array)\\s*<",e:">",k:a,r:10,c:["self"]}]}}}();
// R language definition. Order matters: the first mode whose begin pattern
// matches a lexeme wins. hljs.IMMEDIATE_RE is not defined by the core above, so
// those `e` values are undefined and the compiler falls back to its default end
// pattern "\\B|\\b" (the mode closes right after its begin lexeme).
// The "..." keyword pattern must stay on one line: a line break inside the
// string literal after its first backslash turns the pattern into ".\.\."
// (any character followed by two dots).
hljs.LANGUAGES.r={dM:{c:[hljs.HCM,{cN:"number",b:"\\b0[xX][0-9a-fA-F]+[Li]?\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\b\\d+(?:[eE][+\\-]?\\d*)?L\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\b\\d+\\.(?!\\d)(?:i\\b)?",e:hljs.IMMEDIATE_RE,r:1},{cN:"number",b:"\\b\\d+(?:\\.\\d*)?(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\.\\d+(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"keyword",b:"(?:tryCatch|library|setGeneric|setGroupGeneric)\\b",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\.\\.\\.",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\.\\.\\d+(?![\\w.])",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\b(?:function)",e:hljs.IMMEDIATE_RE,r:2},{cN:"keyword",b:"(?:if|in|break|next|repeat|else|for|return|switch|while|try|stop|warning|require|attach|detach|source|setMethod|setClass)\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"literal",b:"(?:NA|NA_integer_|NA_real_|NA_character_|NA_complex_)\\b",e:hljs.IMMEDIATE_RE,r:10},{cN:"literal",b:"(?:NULL|TRUE|FALSE|T|F|Inf|NaN)\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"identifier",b:"[a-zA-Z.][a-zA-Z0-9._]*\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"operator",b:"<\\-(?!\\s*\\d)",e:hljs.IMMEDIATE_RE,r:2},{cN:"operator",b:"\\->|<\\-",e:hljs.IMMEDIATE_RE,r:1},{cN:"operator",b:"%%|~",e:hljs.IMMEDIATE_RE},{cN:"operator",b:">=|<=|==|!=|\\|\\||&&|=|\\+|\\-|\\*|/|\\^|>|<|!|&|\\||\\$|:",e:hljs.IMMEDIATE_RE,r:0},{cN:"operator",b:"%",e:"%",i:"\\n",r:1},{cN:"identifier",b:"`",e:"`",r:0},{cN:"string",b:'"',e:'"',c:[hljs.BE],r:0},{cN:"string",b:"'",e:"'",c:[hljs.BE],r:0},{cN:"paren",b:"[[({\\])}]",e:hljs.IMMEDIATE_RE,r:0}]}};
// Highlight all code blocks once the page has loaded.
hljs.initHighlightingOnLoad();
</script>
</head>
<body>
<pre><code>## (C) (cc by-sa) Wouter van Atteveldt, file generated juni 06 2014
</code></pre>
<blockquote>
<p>Note on the data used in this howto:
This data can be downloaded from <a href="http://piketty.pse.ens.fr/files/capital21c/en/xls/">http://piketty.pse.ens.fr/files/capital21c/en/xls/</a>,
but the Excel format is a bit difficult to parse as it is meant to be human readable, with multiple header rows etc.
For that reason, I've extracted csv files for some interesting tables that I've uploaded to
<a href="http://vanatteveldt.com/uploads/rcourse/data">http://vanatteveldt.com/uploads/rcourse/data</a></p>
</blockquote>
<h1>Organizing data in R</h1>
<p>This hands-on demonstrates reading, writing, and manipulating data in R.
As before, we will continue using the data from Piketty's 'Capital in the 21st Century' </p>
<pre><code class="r">download.file("http://vanatteveldt.com/wp-content/uploads/rcourse/data/income_toppercentile.csv",
destfile = "income_toppercentile.csv")
income = read.csv("income_toppercentile.csv")
</code></pre>
<h2>Saving and loading data</h2>
<p>So far, we've used the <code>read.csv</code> command to read data from a CSV file.
As can be guessed, there is also a <code>write.csv</code> command that writes data into a CSV file:</p>
<pre><code class="r">write.csv(income, file = "test.csv")
test = read.csv("test.csv")
head(test)
</code></pre>
<pre><code>## X Year Canada Australia New.Zealand Denmark Italy Holland Spain France
## 1 1 1900 NA NA NA NA NA NA NA NA
## 2 2 1901 NA NA NA NA NA NA NA NA
## 3 3 1902 NA NA NA NA NA NA NA NA
## 4 4 1903 NA NA NA 0.162 NA NA NA NA
## 5 5 1904 NA NA NA NA NA NA NA NA
## 6 6 1905 NA NA NA NA NA NA NA NA
## US
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
</code></pre>
<p>A new column was created because by default <code>write.csv</code> also writes the row numbers
(you can check this by opening test.csv in Excel).
Since this row number column has no header, it is given the variable name <code>X</code>.
You can suppress this by adding <code>row.names=F</code> to the write.csv function:</p>
<pre><code class="r">write.csv(income, file = "test.csv", row.names = F)
</code></pre>
<p>On European computers, Excel produces (and expects) CSV files to be delimited with semicolons rather than commas by default,
using the comma as a decimal separator (instead of period).
To facilitate this, R provides a pair of functions <code>read.csv2</code>/<code>write.csv2</code> that use this format. </p>
<p>If you open a CSV file using the wrong function, you will only see a single column with all the values in it.
For example, if we use <code>read.csv2</code> to open the file we just created we get the following:</p>
<pre><code class="r">d = read.csv2("test.csv")
head(d)
</code></pre>
<pre><code>## Year.Canada.Australia.New.Zealand.Denmark.Italy.Holland.Spain.France.US
## 1 1900,NA,NA,NA,NA,NA,NA,NA,NA,NA
## 2 1901,NA,NA,NA,NA,NA,NA,NA,NA,NA
## 3 1902,NA,NA,NA,NA,NA,NA,NA,NA,NA
## 4 1903,NA,NA,NA,0.162,NA,NA,NA,NA,NA
## 5 1904,NA,NA,NA,NA,NA,NA,NA,NA,NA
## 6 1905,NA,NA,NA,NA,NA,NA,NA,NA,NA
</code></pre>
<p>The bottom line is: when using CSV data, always check your results, and use the 'European' version of the commands when appropriate.</p>
<p>Apart from writing csv files, R can also write to a native file format, which has the advantage of correctly storing all types of data (including numbers and date columns) and of storing multiple variables in one file.</p>
<p>For example, the following code stores the <code>income</code> data frame and a new <code>x</code> variable in a file called <code>mydata.rdata</code>:</p>
<pre><code class="r">x = 12
save(income, x, file = "mydata.rdata")
</code></pre>
<p>Now, you can clear the data from your environment, using the Clear button in RStudio or by issuing the somewhat cryptic command <code>rm(list=ls())</code></p>
<pre><code class="r">rm(list = ls())
income
</code></pre>
<pre><code>## Error: object 'income' not found
</code></pre>
<p>And if you load the file, the variables will appear again:</p>
<pre><code class="r">load("mydata.rdata")
head(income)
</code></pre>
<pre><code>## Year Canada Australia New.Zealand Denmark Italy Holland Spain France US
## 1 1900 NA NA NA NA NA NA NA NA NA
## 2 1901 NA NA NA NA NA NA NA NA NA
## 3 1902 NA NA NA NA NA NA NA NA NA
## 4 1903 NA NA NA 0.162 NA NA NA NA NA
## 5 1904 NA NA NA NA NA NA NA NA NA
## 6 1905 NA NA NA NA NA NA NA NA NA
</code></pre>
<p>Note that you do not load the file into a specific variable, as the file can contain multiple variables.
The load command will automatically create those variables with their original names. </p>
<h2>Subsetting data</h2>
<p>The data we have downloaded into <code>income</code> contains income series from 1900 to 2010 for a number of countries.
We can use square brackets <code>[rows, columns]</code> to subset this dataset, for example to select only the first 10 rows or to only select the US and French data. </p>
<pre><code class="r">income[1:10, ]
</code></pre>
<pre><code>## Year Canada Australia New.Zealand Denmark Italy Holland Spain France US
## 1 1900 NA NA NA NA NA NA NA NA NA
## 2 1901 NA NA NA NA NA NA NA NA NA
## 3 1902 NA NA NA NA NA NA NA NA NA
## 4 1903 NA NA NA 0.162 NA NA NA NA NA
## 5 1904 NA NA NA NA NA NA NA NA NA
## 6 1905 NA NA NA NA NA NA NA NA NA
## 7 1906 NA NA NA NA NA NA NA NA NA
## 8 1907 NA NA NA NA NA NA NA NA NA
## 9 1908 NA NA NA 0.165 NA NA NA NA NA
## 10 1909 NA NA NA NA NA NA NA NA NA
</code></pre>
<pre><code class="r">subset = income[, c("US", "France")]
head(subset)
</code></pre>
<pre><code>## US France
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
</code></pre>
<p>A more common use case is that we want to select based on specific criteria.
Suppose that we are now only interested in the series for the US and France after 1945 (i.e., from 1946 onwards).
We can place an expression in the rows selector to subset the data like that:</p>
<pre><code class="r">subset = income[income$Year > 1945, c("Year", "US", "France")]
head(subset)
</code></pre>
<pre><code>## Year US France
## 47 1946 0.133 0.092
## 48 1947 0.120 0.092
## 49 1948 0.122 0.088
## 50 1949 0.117 0.090
## 51 1950 0.128 0.090
## 52 1951 0.118 0.090
</code></pre>
<h2>Calculating columns</h2>
<p>We saw earlier that you can store the result of a calculation in a new variable.
You can also create a new column by storing the result of a calculation in a column.
For example, we could create a column for the average of US and French inequality:</p>
<pre><code class="r">subset$average = (subset$US + subset$France)/2
head(subset)
</code></pre>
<pre><code>## Year US France average
## 47 1946 0.133 0.092 0.1125
## 48 1947 0.120 0.092 0.1060
## 49 1948 0.122 0.088 0.1050
## 50 1949 0.117 0.090 0.1035
## 51 1950 0.128 0.090 0.1090
## 52 1951 0.118 0.090 0.1040
</code></pre>
<p>It is also possible to replace part of a column.
For example, we can set the average to NA when the French value is lower than 0.09 like so:</p>
<pre><code class="r">subset$average[subset$France < 0.09] = NA
head(subset)
</code></pre>
<pre><code>## Year US France average
## 47 1946 0.133 0.092 0.1125
## 48 1947 0.120 0.092 0.1060
## 49 1948 0.122 0.088 NA
## 50 1949 0.117 0.090 0.1035
## 51 1950 0.128 0.090 0.1090
## 52 1951 0.118 0.090 0.1040
</code></pre>
<p>What you are doing there is in fact assigning <code>NA</code> to a subset of the column, selected using the France column.
Becoming good at R for a large part means becoming good at using the subsetting and assignment operations,
so take some time to understand and play around with this code.</p>
<h2>Dealing with Missing Values</h2>
<p>Finally, a useful function is <code>is.na</code>. This function is true when its argument is NA (i.e., missing):</p>
<pre><code class="r">is.na(subset$average)
</code></pre>
<pre><code>## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [45] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [56] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
</code></pre>
<p>As you can see, it is true for the third row and for most rows past the 23rd.
In fact, an expression like <code>subset$average > 3</code> also returns such a vector of logical values:</p>
<pre><code class="r">subset$US > 0.11
</code></pre>
<pre><code>## [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [45] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [56] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
</code></pre>
<p>This result is <code>TRUE</code> for those years where the income inequality in the US is larger than .11.
Just as we can use <code>subset$France < 0.09</code> to selectively replace certain cells, we can do so with <code>is.na</code>:</p>
<pre><code class="r">subset$average[is.na(subset$average)] = 0
head(subset)
</code></pre>
<pre><code>## Year US France average
## 47 1946 0.133 0.092 0.1125
## 48 1947 0.120 0.092 0.1060
## 49 1948 0.122 0.088 0.0000
## 50 1949 0.117 0.090 0.1035
## 51 1950 0.128 0.090 0.1090
## 52 1951 0.118 0.090 0.1040
</code></pre>
<p>This command tells R to replace every cell in the average column where the average is missing with zero.
Since sometimes NA values are really zero, this is quite a useful command.
We can also use this to remove NA rows, similar to the na.omit command used earlier but more flexible.
Let's first introduce our NA's again:</p>
<pre><code class="r">subset$average[subset$France < 0.09] = NA
head(subset)
</code></pre>
<pre><code>## Year US France average
## 47 1946 0.133 0.092 0.1125
## 48 1947 0.120 0.092 0.1060
## 49 1948 0.122 0.088 NA
## 50 1949 0.117 0.090 0.1035
## 51 1950 0.128 0.090 0.1090
## 52 1951 0.118 0.090 0.1040
</code></pre>
<p>And now use <code>!is.na</code> to select certain rows in the data frame (an exclamation mark (read as NOT) inverts a selection)</p>
<pre><code class="r">subset.nomissing = subset[!is.na(subset$average), ]
head(subset.nomissing)
</code></pre>
<pre><code>## Year US France average
## 47 1946 0.133 0.092 0.1125
## 48 1947 0.120 0.092 0.1060
## 50 1949 0.117 0.090 0.1035
## 51 1950 0.128 0.090 0.1090
## 52 1951 0.118 0.090 0.1040
## 53 1952 0.108 0.092 0.1000
</code></pre>
<p>As you can see, row 49 is gone. Note the trailing comma in the subset command. Although we only want to select on rows (and not on columns), we still need to place a comma after the row selection to complete the <code>[rows, columns]</code> pattern.</p>
<p>In fact, you can also use selections on a whole data frame, allowing you to replace all values under a certain condition.</p>
<pre><code class="r">subset[subset < 0.11] = NA
head(subset, n = 10)
</code></pre>
<pre><code>## Year US France average
## 47 1946 0.133 NA 0.1125
## 48 1947 0.120 NA NA
## 49 1948 0.122 NA NA
## 50 1949 0.117 NA NA
## 51 1950 0.128 NA NA
## 52 1951 0.118 NA NA
## 53 1952 NA NA NA
## 54 1953 NA NA NA
## 55 1954 NA NA NA
## 56 1955 0.111 NA NA
</code></pre>
<p>Note that here the trailing comma is not given since the selection is based on the whole data set, not just on certain rows.
Similarly, the is.na function can be used to globally replace NA values in a data frame:</p>
<pre><code class="r">subset[is.na(subset)] = 0
head(subset, n = 10)
</code></pre>
<pre><code>## Year US France average
## 47 1946 0.133 0 0.1125
## 48 1947 0.120 0 0.0000
## 49 1948 0.122 0 0.0000
## 50 1949 0.117 0 0.0000
## 51 1950 0.128 0 0.0000
## 52 1951 0.118 0 0.0000
## 53 1952 0.000 0 0.0000
## 54 1953 0.000 0 0.0000
## 55 1954 0.000 0 0.0000
## 56 1955 0.111 0 0.0000
</code></pre>
<h1>Good practice: self-contained scripts</h1>
<p>Using R is programming, and one of the most important parts of programming is managing your source code.
An important thing to realize is that your code will be written only once, but read many times over.
Spending twice as much time to make the code well organized and more readable might feel like wasting time,
but you (or your colleagues/students) will be very happy when you are reading it again.
Especially since in research code is often left alone for a number of months until it is time to review an article,
it is very important to make sure that you (and ideally: the readers/reviewers of the article) can understand the code.</p>
<p>There are no simple rules for writing readable code, and sometimes what is readable to one is quite cryptic to the other.
However, here are four tips that I can offer and that I expect you to incorporate in your assignments:</p>
<ol>
<li>Use descriptive variable names. Use <code>income</code> (or better: <code>income.top.percent</code>) rather than <code>i</code>. </li>
<li>Use comments where needed, especially to explain decisions, assumptions, and possible problems.
In R, every line starting with <code>#</code> is a comment, i.e. the line is completely skipped by R.</li>
<li>Often, when doing an analysis you're not quite sure where you are going to end up, so you write a lot of code that turns out not to be needed. When your analysis is done, take a moment to reorganize the code, remove redundancies, et cetera. It is often best to just start a new file and copy paste the relevant bits (add comments where needed). Assume that your code will also be reviewed, even if it is not, because you are sure to read it again later and wonder why/how you did certain things. </li>
<li>Finally, try to write what I term 'self contained scripts'. The script should start with some kind of data gathering commands such as <code>download.file</code> or <code>read.csv</code>, and end with your analyses. You should be able to clear your environment and run the code from top to bottom and arrive at the same results. In fact, when cleaning up my code I often do just that: clean up part of the code, clear all, re-run, and check the results. This is also important for reproducibility, as being able to run the whole code and get the same results is the only guarantee that that code in fact produced these results. </li>
</ol>
<p>We will come across some tools to make these things easier such as defining your own functions and working with knitr, but the most important thing is to accept that your code is part of your product and you should take the time to polish it a bit.</p>
</body>
</html>