-
Notifications
You must be signed in to change notification settings - Fork 2
/
Rexamples.Rnw
158 lines (102 loc) · 3.99 KB
/
Rexamples.Rnw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
%\VignetteIndexEntry{Analysis of bead-summary data}
%\VignettePackage{beadarray}
%\VignetteEngine{knitr::knitr}
% To compile this document
% library('knitr'); rm(list=ls()); knit('Rexamples.Rnw')
\documentclass[12pt]{article}
\newcommand{\usecase}{\textit{\textbf{Use Case: }}}
<<knitr, echo=FALSE, results="hide">>=
# Load knitr so that document-wide chunk defaults can be configured below.
library("knitr")
# Defaults applied to every later chunk: leave code formatting untouched,
# render figures as 10 x 6 PNGs at the point where they are produced
# (fig.show = "asis" is the documented spelling of that option), show and
# evaluate all code, and keep messages and warnings out of the output.
opts_chunk$set(
  tidy = FALSE, dev = "png", fig.show = "asis",
  fig.width = 10, fig.height = 6,
  message = FALSE, eval = TRUE, warning = FALSE, echo = TRUE
)
@
<<style, eval=TRUE, echo=FALSE, results="asis">>=
# Insert the BiocStyle LaTeX set-up; results="asis" writes it into the
# document verbatim instead of as an output block.
BiocStyle::latex()
@
\usepackage{ifthen}
\newboolean{includethis}
\setboolean{includethis}{true}
\newcommand{\ifinclude}[1]{\ifthenelse{\boolean{includethis}}{#1}{}}
\title{Reading and exploring data using R - Examples}
\author{Mark Dunning}
\begin{document}
\maketitle
\tableofcontents
\section{UK Birth Rates}
These data were obtained from \href{http://www.theguardian.com/news/datablog/2011/jun/08/life-expectancy-uk-data-health}{a Guardian blog of June 2011}
\subsection{Reading the data}
We will read the file into R. As a rule-of-thumb, if the extension of the file is \textit{.tsv}, then the \Rfunction{read.delim} function can be used. Assuming that the file is read without error, we should print the first few lines of the file using \Rfunction{head} to check that the contents are as we expect. We can also check the dimensions of the data frame that we have created.
<<>>=
# Read the tab-delimited file into a data frame called birth
birth <- read.delim("data/UKBirthRate.tsv")
# Check the import: first few rows, then the dimensions,
# a per-column summary and the structure (column types) of the data frame
head(birth)
dim(birth)
summary(birth)
str(birth)
@
\subsection{Visualisation}
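Let's assume that we want to visualise how the number of male births (the second column of the data frame) has changed from year to year (the first column).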
<<>>=
# Default scatter plot: second column on the y-axis against the first on the x-axis
plot(birth[, 1], birth[, 2])
@
\subsubsection{Changing the plot character}
<<>>=
# pch selects the plotting symbol; 16 is a solid filled circle
plot(birth[, 1], birth[, 2], pch = 16)
@
\subsubsection{Changing the plot type}
<<>>=
# type = "l" joins the observations with a line instead of drawing points
plot(birth[, 1], birth[, 2], type = "l")
@
\subsubsection{Adding colours}
A complete list of colours can be seen \href{http://www.stat.columbia.edu/\~tzheng/files/Rcolor.pdf}{here}
<<>>=
# col sets the colour of the line, here given by name
plot(birth[, 1], birth[, 2], type = "l", col = "steelblue")
@
\subsubsection{Modifying the labels and title}
<<>>=
# xlab and ylab replace the default axis labels; main adds a title
plot(
  birth[, 1], birth[, 2],
  type = "l",
  xlab = "Year", ylab = "Births", main = "Male birth rate"
)
@
\subsubsection{Adding extra lines and points}
<<>>=
# Start a new plot with the second column drawn as a line ...
plot(
  birth[, 1], birth[, 2],
  type = "l", col = "steelblue",
  xlab = "Year", ylab = "Births"
)
# ... then add the third column to the existing axes in a darker blue
lines(birth[, 1], birth[, 3], col = "midnightblue")
@
\subsubsection{Adding a legend}
<<>>=
# Same two-line plot as before
plot(
  birth[, 1], birth[, 2],
  type = "l", col = "steelblue",
  xlab = "Year", ylab = "Births"
)
lines(birth[, 1], birth[, 3], col = "midnightblue")
# Key in the top-left corner; colours are listed in the same order as the labels
legend(
  "topleft",
  legend = c("Male", "Female"),
  fill = c("steelblue", "midnightblue")
)
@
\subsubsection{Adding text and points}
<<>>=
# Two-line plot with legend, as built up in the previous steps
plot(
  birth[, 1], birth[, 2],
  type = "l", col = "steelblue",
  xlab = "Year", ylab = "Births"
)
lines(birth[, 1], birth[, 3], col = "midnightblue")
legend(
  "topleft",
  legend = c("Male", "Female"),
  fill = c("steelblue", "midnightblue")
)

# Row numbers whose fourth column is not an empty string
# (presumably the years that carry an event annotation)
events <- which(birth[, 4] != "")

# Mark those rows on both lines with solid points
points(birth[events, 1], birth[events, 2], pch = 16)
points(birth[events, 1], birth[events, 3], pch = 16)

# Vertical guide line at each of those x positions
abline(v = birth[events, 1])

# Print the fourth-column text at height y = 80, rotated by 45 degrees
text(birth[events, 1], 80, birth[events, 4], srt = 45)
@
\subsection{Statistical analysis}
\section{Rail station usage}
These data were obtained from \href{http://www.theguardian.com/news/datablog/2009/jul/02/rail-transport-travelleisure}{a Guardian blog of July 2009}
<<>>=
# Read the comma-separated file, keeping text columns as character vectors
# rather than converting them to factors
stations <- read.csv(
  "data/Station use - 2011-12.csv",
  stringsAsFactors = FALSE
)
# Look at the first few rows to check the import
head(stations)
@
Suppose we are interested in which stations are most / least busy. A measure of this is given in the column 'X1112.Entries...Exits'. However, this name is a bit messy, so we will create a new variable with the same data.
<<>>=
# Copy the column into a new one with a shorter, easier name
stations$Total <- stations$X1112.Entries...Exits
@
Now try to find the station with the most entries and exits using the \Rfunction{max} and \Rfunction{which.max} functions. Does the answer look correct to you? What has gone wrong?
<<>>=
# Largest value of Total as the column is currently stored
max(stations$Total)
@
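The problem arises because the numbers in the original file had comma separators in them, so R read the column as text rather than as numbers and \Rfunction{max} compared the values as character strings. Removing the commas with \Rfunction{gsub} and converting the result with \Rfunction{as.numeric} gives the answer we expect.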
<<>>=
# Remove every comma from Total, then convert the cleaned text to numbers
stations$Total <- as.numeric(
  gsub(",", "", stations$Total)
)
# Largest numeric value ...
max(stations$Total)
# ... and the full row of the station that has it
stations[which.max(stations$Total), ]
@
\end{document}