This repository has been archived by the owner on Sep 18, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 370
/
Copy pathblock011_write-your-own-function-01.html
451 lines (396 loc) · 18.8 KB
/
block011_write-your-own-function-01.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />
<title>Write your own R functions, part 1</title>
<script src="libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="libs/bootstrap-3.3.5/css/bootstrap.min.css" rel="stylesheet" />
<script src="libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="libs/navigation-1.1/tabsets.js"></script>
<link href="libs/highlightjs-9.12.0/default.css" rel="stylesheet" />
<script src="libs/highlightjs-9.12.0/highlight.js"></script>
<script>
// Google Analytics bootstrap snippet.
// Called with i=window, s=document, o='script', g=the analytics.js URL, r='ga';
// a and m are not passed and serve only as local variables.
// It records the global name under window.GoogleAnalyticsObject, defines
// window.ga (unless already defined) as a stub that pushes each call's
// arguments onto ga.q, stores the current time in ga.l, and finally inserts
// an async <script> element for analytics.js before the first <script> in
// the document.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
// Issue the 'create' command for property UA-68219208-1, then a 'send pageview'.
ga('create', 'UA-68219208-1', 'auto');
ga('send', 'pageview');
</script>
<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
pre:not([class]) {
background-color: white;
}
</style>
<script type="text/javascript">
// Configure and start highlight.js, but only when the library is present.
(function (highlighter) {
  if (!highlighter) {
    return;
  }
  highlighter.configure({languages: []});
  highlighter.initHighlightingOnLoad();
  // When the document has already finished loading, also schedule an
  // explicit highlighting pass on the next timer tick.
  if (document.readyState === "complete") {
    window.setTimeout(function () {
      highlighter.initHighlighting();
    }, 0);
  }
})(window.hljs);
</script>
<style type="text/css">
/* Fixed font sizes for each heading level. */
h1 {
font-size: 34px;
}
/* h1 elements with class "title" are set larger than other h1 elements. */
h1.title {
font-size: 38px;
}
h2 {
font-size: 30px;
}
h3 {
font-size: 24px;
}
h4 {
font-size: 18px;
}
h5 {
font-size: 16px;
}
h6 {
font-size: 12px;
}
/* Left-align header cells of .table tables unless they carry an align attribute. */
.table th:not([align]) {
text-align: left;
}
</style>
<link rel="stylesheet" href="libs/local/main.css" type="text/css" />
<link rel="stylesheet" href="libs/local/nav.css" type="text/css" />
<link rel="stylesheet" href="//netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css" type="text/css" />
</head>
<body>
<style type = "text/css">
/* Cap the main content column at 940px and centre it horizontally. */
.main-container {
max-width: 940px;
margin-left: auto;
margin-right: auto;
}
/* Code inherits the surrounding text colour and gets a faint grey tint. */
code {
color: inherit;
background-color: rgba(0, 0, 0, 0.04);
}
/* Images never exceed their container's width; height scales to match. */
img {
max-width:100%;
height: auto;
}
.tabbed-pane {
padding-top: 12px;
}
.html-widget {
margin-bottom: 20px;
}
/* NOTE(review): this removes the focus outline from code-folding buttons with
   no replacement in this block, which hides keyboard focus; confirm a visible
   focus style is supplied elsewhere. */
button.code-folding-btn:focus {
outline: none;
}
</style>
<div class="container-fluid main-container">
<!-- tabsets -->
<style type="text/css">
/* Container for a dropdown-style tabset: a white, bordered, rounded box that
   is at least 44px tall and scrolls vertically once it exceeds 500px. */
.tabset-dropdown > .nav-tabs {
display: inline-table;
max-height: 500px;
min-height: 44px;
overflow-y: auto;
background: white;
border: 1px solid #ddd;
border-radius: 4px;
}
.tabset-dropdown > .nav-tabs > li.active:before {
content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
border-right: 1px solid #ddd;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
content: "";
border: none;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
border-right: 1px solid #ddd;
}
/* The active tab is always shown. */
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
/* Tab links: no border, rounded corners, in every link state listed. */
.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
border: none;
display: inline-block;
border-radius: 4px;
}
/* When the list carries .nav-tabs-open, every tab is shown, stacked vertically. */
.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
display: block;
float: none;
}
/* Default: tabs are hidden. The .active and .nav-tabs-open rules above have
   higher specificity, so they win even though this rule comes last. */
.tabset-dropdown > .nav-tabs > li {
display: none;
}
</style>
<script>
// When the DOM is ready, call the page's buildTabsets with the argument "TOC".
$(function () {
  window.buildTabsets("TOC");
});
// When the DOM is ready, make a click on any tab item of a dropdown-style
// tabset toggle the 'nav-tabs-open' class on that item's parent list.
$(function () {
  var dropdownItems = $('.tabset-dropdown > .nav-tabs > li');
  dropdownItems.on('click', function () {
    var tabList = $(this).parent();
    tabList.toggleClass('nav-tabs-open');
  });
});
</script>
<!-- code folding -->
<header>
<div class="nav">
<a class="nav-logo" href="index.html">
<img src="static/img/stat545-logo-s.png" alt="STAT 545 home" width="70" height="70"/>
</a>
<ul>
<li class="home"><a href="index.html">Home</a></li>
<li class="faq"><a href="faq.html">FAQ</a></li>
<li class="syllabus"><a href="syllabus.html">Syllabus</a></li>
<li class="topics"><a href="topics.html">Topics</a></li>
<li class="people"><a href="people.html">People</a></li>
</ul>
</div>
</header>
<div class="fluid-row" id="header">
<h1 class="title toc-ignore">Write your own R functions, part 1</h1>
</div>
<div id="TOC">
<ul>
<li><a href="#what-and-why">What and why?</a></li>
<li><a href="#load-the-gapminder-data">Load the Gapminder data</a></li>
<li><a href="#max---min">Max - min</a></li>
<li><a href="#get-something-that-works">Get something that works</a></li>
<li><a href="#turn-the-working-interactive-code-into-a-function">Turn the working interactive code into a function</a></li>
<li><a href="#test-your-function">Test your function</a></li>
<li><a href="#check-the-validity-of-arguments">Check the validity of arguments</a></li>
<li><a href="#wrap-up-and-whats-next">Wrap-up and what’s next?</a></li>
<li><a href="#resources">Resources</a></li>
</ul>
</div>
<div id="what-and-why" class="section level3">
<h3>What and why?</h3>
<p>My goal here is to reveal the <strong>process</strong> a long-time useR employs for writing functions. I also want to illustrate why the process is the way it is. Merely looking at the finished product, e.g. source code for R packages, can be extremely deceiving. Reality is generally much uglier … but more interesting!</p>
<p>Why are we covering this now, smack in the middle of data aggregation? Powerful machines like <code>dplyr</code>, <code>purrr</code>, and the built-in <code>apply</code> family of functions, are ready and waiting to apply your purpose-built functions to various bits of your data. If you can express your analytical wishes in a function, these tools will give you great power.</p>
</div>
<div id="load-the-gapminder-data" class="section level3">
<h3>Load the Gapminder data</h3>
<p>As usual, load the Gapminder data.</p>
<pre class="r"><code>library(gapminder)
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...</code></pre>
</div>
<div id="max---min" class="section level3">
<h3>Max - min</h3>
<p>Say you’ve got a numeric vector, and you want to compute the difference between its max and min. <code>lifeExp</code> or <code>pop</code> or <code>gdpPercap</code> are great examples of a typical input. You can imagine wanting to get this statistic after we slice up the Gapminder data by year, country, continent, or combinations thereof.</p>
</div>
<div id="get-something-that-works" class="section level3">
<h3>Get something that works</h3>
<p>First, develop some working code for interactive use, using a representative input. I’ll use Gapminder’s life expectancy variable.</p>
<p>R functions that will be useful: <code>min()</code>, <code>max()</code>, <code>range()</code>. <strong>Read their documentation.</strong></p>
<pre class="r"><code>## get to know the functions mentioned above
min(gapminder$lifeExp)
## [1] 23.599
max(gapminder$lifeExp)
## [1] 82.603
range(gapminder$lifeExp)
## [1] 23.599 82.603
## some natural solutions
max(gapminder$lifeExp) - min(gapminder$lifeExp)
## [1] 59.004
with(gapminder, max(lifeExp) - min(lifeExp))
## [1] 59.004
range(gapminder$lifeExp)[2] - range(gapminder$lifeExp)[1]
## [1] 59.004
with(gapminder, range(lifeExp)[2] - range(lifeExp)[1])
## [1] 59.004
diff(range(gapminder$lifeExp))
## [1] 59.004</code></pre>
<p>Internalize this “answer” because our informal testing relies on you noticing departures from this.</p>
<div id="skateboard-perfectly-formed-rear-view-mirror" class="section level4">
<h4>Skateboard >> perfectly formed rear-view mirror</h4>
<p>This image <a href="http://blog.fastmonkeys.com/?utm_content=bufferc2d6e&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer">widely attributed to the Spotify development team</a> conveys an important point.</p>
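<p><img src="img/spotify-howtobuildmvp.gif" alt="Cartoon about building a minimum viable product: deliver a working skateboard first and evolve it toward a car, rather than delivering unusable car parts one at a time" /></p>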
<p>Build that skateboard before you build the car or some fancy car part. A limited-but-functioning thing is very useful. It also keeps the spirits high.</p>
<p>This is related to the valuable <a href="http://c2.com/cgi/wiki?TelescopeRule">Telescope Rule</a>:</p>
<blockquote>
<p>It is faster to make a four-inch mirror then a six-inch mirror than to make a six-inch mirror.</p>
</blockquote>
</div>
</div>
<div id="turn-the-working-interactive-code-into-a-function" class="section level3">
<h3>Turn the working interactive code into a function</h3>
<p>Add NO new functionality! Just write your very first R function.</p>
<pre class="r"><code>max_minus_min &lt;- function(x) max(x) - min(x)
max_minus_min(gapminder$lifeExp)
## [1] 59.004</code></pre>
<p>Check that you’re getting the same answer as you did with your interactive code. Test it eyeball-o-metrically at this point.</p>
</div>
<div id="test-your-function" class="section level3">
<h3>Test your function</h3>
<div id="test-on-new-inputs" class="section level4">
<h4>Test on new inputs</h4>
<p>Pick some new artificial inputs where you know (at least approximately) what your function should return.</p>
<pre class="r"><code>max_minus_min(1:10)
## [1] 9
max_minus_min(runif(1000))
## [1] 0.997364</code></pre>
<p>I know that 10 minus 1 is 9. I know that random uniform [0, 1] variates will be between 0 and 1. Therefore max - min should be less than 1. If I take LOTS of them, max - min should be pretty close to 1.</p>
<p>It is intentional that I tested on integer input as well as floating point. Likewise, I like to use valid-but-random data for this sort of check.</p>
</div>
<div id="test-on-real-data-but-different-real-data" class="section level4">
<h4>Test on real data but <em>different</em> real data</h4>
<p>Back to the real world now. Two other quantitative variables are lying around: <code>gdpPercap</code> and <code>pop</code>. Let’s have a go.</p>
<pre class="r"><code>max_minus_min(gapminder$gdpPercap)
## [1] 113282
max_minus_min(gapminder$pop)
## [1] 1318623085</code></pre>
<p>Either check these results “by hand” or apply the “does that even make sense?” test.</p>
</div>
<div id="test-on-weird-stuff" class="section level4">
<h4>Test on weird stuff</h4>
<p>Now we try to break our function. Don’t get truly diabolical (yet). Just make the kind of mistakes you can imagine making at 2am when, 3 years from now, you rediscover this useful function you wrote. Give your function inputs it’s not expecting.</p>
<pre class="r"><code>max_minus_min(gapminder) ## hey sometimes things "just work" on data.frames!
## Error in FUN(X[[i]], ...): only defined on a data frame with all numeric variables
max_minus_min(gapminder$country) ## factors are kind of like integer vectors, no?
## Error in Summary.factor(structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, : 'max' not meaningful for factors
max_minus_min("eggplants are purple") ## i have no excuse for this one
## Error in max(x) - min(x): non-numeric argument to binary operator</code></pre>
<p>How happy are you with those error messages? You must imagine that some entire <strong>script</strong> has failed and that you were hoping to just <code>source()</code> it without re-reading it. If a colleague or future you encountered these errors, do you run screaming from the room? How hard is it to pinpoint the usage problem?</p>
</div>
<div id="i-will-scare-you-now" class="section level4">
<h4>I will scare you now</h4>
<p>Here are some great examples STAT545 students devised during class where the function <strong>should break but it does not.</strong></p>
<pre class="r"><code>max_minus_min(gapminder[c('lifeExp', 'gdpPercap', 'pop')])
## [1] 1318683072
max_minus_min(c(TRUE, TRUE, FALSE, TRUE, TRUE))
## [1] 1</code></pre>
<p>In both cases, R’s eagerness to make sense of our requests is unfortunately successful. In the first case, a data.frame containing just the quantitative variables is eventually coerced into a numeric vector. We can compute max minus min, even though it makes absolutely no sense at all. In the second case, a logical vector is converted to zeroes and ones, which might merit an error or at least a warning.</p>
</div>
</div>
<div id="check-the-validity-of-arguments" class="section level3">
<h3>Check the validity of arguments</h3>
<p>For functions that will be used again – which is not all of them! – it is good to check the validity of arguments. This implements a rule from <a href="http://www.faqs.org/docs/artu/ch01s06.html">the Unix philosophy</a>:</p>
<blockquote>
<p>Rule of Repair: When you must fail, fail noisily and as soon as possible.</p>
</blockquote>
<div id="stopifnot" class="section level4">
<h4>stopifnot</h4>
<p><code>stopifnot()</code> is the entry level solution. I use it here to make sure the input <code>x</code> is a numeric vector.</p>
<pre class="r"><code>mmm &lt;- function(x) {
stopifnot(is.numeric(x))
max(x) - min(x)
}
mmm(gapminder)
## Error in mmm(gapminder): is.numeric(x) is not TRUE
mmm(gapminder$country)
## Error in mmm(gapminder$country): is.numeric(x) is not TRUE
mmm("eggplants are purple")
## Error in mmm("eggplants are purple"): is.numeric(x) is not TRUE
mmm(gapminder[c('lifeExp', 'gdpPercap', 'pop')])
## Error in mmm(gapminder[c("lifeExp", "gdpPercap", "pop")]): is.numeric(x) is not TRUE
mmm(c(TRUE, TRUE, FALSE, TRUE, TRUE))
## Error in mmm(c(TRUE, TRUE, FALSE, TRUE, TRUE)): is.numeric(x) is not TRUE</code></pre>
<p>And we see that it catches all of the self-inflicted damage we would like to avoid.</p>
</div>
<div id="if-then-stop" class="section level4">
<h4>if then stop</h4>
<p><code>stopifnot()</code> doesn’t provide a very good error message. The next approach is very widely used. Put your validity check inside an <code>if()</code> statement and call <code>stop()</code> yourself, with a custom error message, in the body.</p>
<pre class="r"><code>mmm2 &lt;- function(x) {
if(!is.numeric(x)) {
stop('I am so sorry, but this function only works for numeric input!\n',
'You have provided an object of class: ', class(x)[1])
}
max(x) - min(x)
}
mmm2(gapminder)
## Error in mmm2(gapminder): I am so sorry, but this function only works for numeric input!
## You have provided an object of class: tbl_df</code></pre>
<p>In addition to a gratuitous apology, the error raised also contains two more pieces of helpful info:</p>
<ul>
<li><em>Which</em> function threw the error.</li>
<li>Hints on how to fix things: expected class of input vs actual class.</li>
</ul>
<p>If it is easy to do so, I highly recommend this template: “you gave me THIS, but I need THAT”.</p>
</div>
<div id="sidebar-non-programming-uses-for-assertions" class="section level4">
<h4>Sidebar: non-programming uses for assertions</h4>
<p>Another good use of this pattern is to leave checks behind in data analytical scripts. Consider our repetitive use of Gapminder in this course. Every time we load it, we inspect it, hoping to see the usual stuff. If we were loading from file (vs. a stable data package), we might want to formalize our expectations about the number of rows and columns, the names and flavors of the variables, etc. This would alert us if the data suddenly changed, which can be a useful wake-up call in scripts that you re-run <em>ad nauseam</em> on auto-pilot or non-interactively.</p>
</div>
</div>
<div id="wrap-up-and-whats-next" class="section level3">
<h3>Wrap-up and what’s next?</h3>
<p>Here’s the function we’ve written so far:</p>
<pre class="r"><code>mmm2
## function(x) {
## if(!is.numeric(x)) {
## stop('I am so sorry, but this function only works for numeric input!\n',
## 'You have provided an object of class: ', class(x)[1])
## }
## max(x) - min(x)
## }</code></pre>
<p>What we’ve accomplished:</p>
<ul>
<li>we’ve written our first function</li>
<li>we are checking the validity of its input, argument <code>x</code></li>
<li>we’ve done a good amount of informal testing</li>
</ul>
<p>Where to next? In <a href="block011_write-your-own-function-02.html">part 2</a>, we generalize this function to take differences in other quantiles and learn how to set default values for arguments.</p>
</div>
<div id="resources" class="section level3">
<h3>Resources</h3>
<p>Packages for runtime assertions (the last 3 seem to be under more active development than <code>assertthat</code>):</p>
<ul>
<li><code>assertthat</code> on <a href="https://cran.r-project.org/web/packages/assertthat/index.html">CRAN</a> and <a href="https://github.com/hadley/assertthat">GitHub</a> <em>the Hadleyverse option</em></li>
<li><code>ensurer</code> on <a href="https://cran.r-project.org/web/packages/ensurer/index.html">CRAN</a> and <a href="https://github.com/smbache/ensurer">GitHub</a> <em>general purpose, pipe-friendly</em></li>
<li><code>assertr</code> on <a href="https://cran.r-project.org/web/packages/assertr/index.html">CRAN</a> and <a href="https://github.com/tonyfischetti/assertr">GitHub</a> <em>explicitly data pipeline oriented</em></li>
<li><code>assertive</code> on <a href="https://cran.r-project.org/web/packages/assertive/index.html">CRAN</a> and <a href="https://bitbucket.org/richierocks/assertive">Bitbucket</a> <em>rich set of built-in functions</em></li>
</ul>
<p>Hadley Wickham’s book <a href="http://adv-r.had.co.nz">Advanced R</a></p>
<ul>
<li>Section on <a href="http://adv-r.had.co.nz/Exceptions-Debugging.html#defensive-programming">defensive programming</a></li>
</ul>
</div>
<div class="footer">
This work is licensed under the <a href="http://creativecommons.org/licenses/by-nc/3.0/">CC BY-NC 3.0 Creative Commons License</a>.
</div>
</div>
<script>
// Apply Bootstrap's table classes to pandoc-generated tables.
// A table qualifies when it is the direct parent of a <thead> that is the
// direct parent of a <tr class="header">.
function bootstrapStylePandocTables() {
  var headerRows = $('tr.header');
  var pandocTables = headerRows.parent('thead').parent('table');
  pandocTables.addClass('table table-condensed');
}
// Run the styling pass once the DOM is ready.
$(function () {
  bootstrapStylePandocTables();
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
// Create a <script> element for MathJax at runtime and append it to <head>.
(function () {
  var mathJaxUrl = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
  var loader = document.createElement("script");
  loader.type = "text/javascript";
  loader.src = mathJaxUrl;
  var headElement = document.getElementsByTagName("head")[0];
  headElement.appendChild(loader);
})();
</script>
</body>
</html>