-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path20_format_predictors.R
224 lines (184 loc) · 8.42 KB
/
20_format_predictors.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#' ---
#' title: "Preparation of predictor data "
#' author: "Paul Czechowski"
#' date: "October 22nd, 2016"
#' output: pdf_document
#' toc: true
#' highlight: zenburn
#' bibliography: ./references.bib
#' ---
#'
#' # Preface
#'
#' This code is tested using a raw R terminal. Path names are
#' defined relative to the project directory. This code commentary is included
#' in the R code itself and can be rendered at any stage using
#' `rmarkdown::render ("./20_format_predictors.R")`. Please check the session info
#' at the end of the document for further notes on the coding environment.
#'
#' # Prerequisites
#'
#' * `./10_import_data.R` was run, or the objects to be read in are available in
#' a folder tree `Zenodo` in the project parent directory.
#'
#' # Environment preparation
#'
#' ## Packages loading and cleaning of work-space. Functions may also load packages.
#+ message=FALSE, results='hide'
library("DataCombine") # "FindReplace()": string replacement in data frames
library("data.table")  # "setnames()": variable renaming
rm(list = ls())        # start from an empty R environment
# the working directory defaults to the current directory and does not need
# to be set
#' ## Setting locations for data import and export
#'
#' This script uses the objects generated by `10_import_data.R` that are located
#' in the `Zenodo` directory tree. It will also write to that location.
# input: predictor object written by "10_import_data.R"
path_predictors_in  <- "./Zenodo/R_Objects/10_predictors.Rdata"
# output: work-space image and formatted predictor object of this script
path_workspace      <- "./Zenodo/R_Objects/20_workspace.Rdata"
path_predictors_out <- "./Zenodo/R_Objects/20_predictors.Rdata"
#' # Formatting the abiotic predictor data
#'
#' ## Data import
#'
#' Soil geochemical and X-Ray diffraction data is imported using basic R functionality
# restores the data frame "predictors" that was generated through
# "10_import_data.R"
load(file = path_predictors_in)
#' ## Minor Adjustments, removal of superfluous sampling locations
#'
#' There may be some duplicate entries in the data which need to be removed:
# keep only the first row of every sample identifier
predictors <- predictors[!duplicated(predictors$Sample), , drop = FALSE]
#' The `rownames()` are numerical so far, but should be something more meaningful.
#' Hence, the samples are re-labelled with data of one column of the original data
#' frame.
row.names(predictors) <- predictors$Sample
#' X-ray diffraction data from Duanne White contained minor rounding errors due to `Excel`
#' particularities. The rounding errors are corrected by slightly re-scaling
#' those values here.
# X-ray diffraction variables are identified by their "x_" prefix; the pattern
# is anchored so that columns merely containing "x_" are not picked up
xrd_cols <- grep("^x_", colnames(predictors))
# check prior to adjusting
rowSums(predictors[xrd_cols])
# adjusting: every value is divided by the total of its row, so that the
# diffraction values of each sample add up to 1 (a vector with one element per
# row is applied to each column in turn)
predictors[xrd_cols] <- predictors[xrd_cols] / rowSums(predictors[xrd_cols])
# check after adjusting
rowSums(predictors[xrd_cols])
# garbage collection
rm(xrd_cols)
#' The data also contains sample information from samples that are not included
#' in the current project. These samples will be removed from the data.
# "%in%" never yields NA: samples with a missing location are kept unchanged,
# whereas a comparison with "!=" would turn them into rows consisting of NA only
predictors <- predictors[!(predictors$Location %in% "Reinbolt_Hills"), ]
#' ## Renaming area identifiers and sample identifiers
#'
#' The long sampling location strings are an annoyance during plotting, and
#' are shortened here.
#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors) -->
# "FindReplace()" needs a data frame for replacing things
# each long location name ("from") is paired with its abbreviation ("to")
area_codes <- data.frame(
  from = c("Mount_Menzies", "Mawson_Escarpment", "Lake_Terrasovoje"),
  to = c("MM", "ME", "LT")
)
# the predictor data frame is overwritten with the abbreviated locations; only
# exact matches are replaced and the whole data frame is returned
predictors <- FindReplace(
  predictors, "Location", area_codes,
  from = "from", to = "to", exact = TRUE, vector = FALSE
)
# garbage collection
rm(area_codes)
#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors) -->
#' The `.PCM` at the end of the plate position is also not needed and will likely
#' be annoying much later. This is cut out here.
# the last four characters of every sample identifier are removed
# NOTE(review): the cut is made regardless of what these characters are -
# confirm that every identifier ends with the four-character suffix
predictors$Sample <- sub(".{4}$", "", predictors$Sample)
#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors) -->
#' In fact, the sample names can be used as the row names, and then be deleted.
row.names(predictors) <- predictors$Sample
predictors[["Sample"]] <- NULL
#' ## Removal of unused variables
#'
#' Also several other variables not needed for this analysis are dropped.
#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors) -->
# all columns are kept except for
#   CSBP_id, XRD.id: meta-data for sample sorting and assignment
#   o_Mites, o_Moss, o_Salts, o_Moisture, c_Texture: spotty observation data,
#     needs improved encoding
predictors <- predictors[!(colnames(predictors) %in% c(
  "CSBP_id", "XRD.id",
  "o_Mites", "o_Moss", "o_Salts", "o_Moisture", "c_Texture"
))]
#' ## Setting and naming of variables used for analysis
#'
#' Now, the variable names are changed to four letter abbreviations, to aid
#' later analysis. Also the types will be set properly. This does not need to be
#' written as a function, because this will only be done once.
# mapping of the original variable names (left) to their four letter
# abbreviations (right)
abbreviations <- c(
  Location       = "AREA",
  Genes          = "GENE",
  c_Ammonium     = "AMMN",
  c_Nitrate      = "NITR",
  c_Phosphorus   = "PHOS",
  c_Potassium    = "POTA",
  c_Sulphur      = "SLPH",
  c_Org_Carbon   = "CARB",
  c_Conductivity = "COND",
  c_pH_CaCl2     = "PHCC",
  c_pH_H2O       = "PHHO",
  x_Quartz       = "QUTZ",
  x_Feltspar     = "FDSP",
  x_Titanite     = "TTAN",
  x_Pyr_Amp_Gar  = "PRAG",
  x_Micas        = "MICA",
  x_Dolomite     = "DOLO",
  x_Kao_Chlor    = "KAOC",
  x_Calcite      = "CALC",
  x_Chlorite     = "CHLR",
  g_Latitude     = "LATI",
  g_Longitude    = "LONG",
  g_Elevation    = "ELEV",
  s_Slope        = "SLPE",
  s_Aspect       = "ASPT",
  t_Soil_Temp    = "SPTT",
  low_age        = "LAGE",
  high_age       = "HAGE",
  c_Gravel       = "GRVL",
  ATP            = "MATP"
)
# "setnames()" renames by reference, to avoid errors
setnames(predictors, old = names(abbreviations), new = unname(abbreviations))
# garbage collection
rm(abbreviations)
#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors) -->
# correct variable types to make everything easier subsequently
# categorical variables
predictors$AREA <- as.factor(predictors$AREA)
predictors$GENE <- as.factor(predictors$GENE)
# all remaining analysis variables are numeric
numeric_vars <- c(
  "GRVL", "AMMN", "NITR", "PHOS", "POTA", "SLPH", "CARB", "COND", "PHCC",
  "PHHO", "QUTZ", "FDSP", "TTAN", "PRAG", "MICA", "DOLO", "KAOC", "CALC",
  "CHLR", "LATI", "LONG", "ELEV", "SLPE", "ASPT", "SPTT", "MATP", "LAGE",
  "HAGE"
)
# "as.numeric()" applied to a factor returns the internal level codes and not
# the recorded values, hence factors are converted via their labels; columns of
# any other type are passed to "as.numeric()" directly
predictors[numeric_vars] <- lapply(predictors[numeric_vars], function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  as.numeric(x)
})
# garbage collection
rm(numeric_vars)
#' ## Checking variables used for analysis
#'
#' The formatted data is shown here.
# structure of the formatted data
str(predictors)
# summary of the formatted data
summary(predictors)
#' Correcting a mistake in marker availability data (excluding one more
#' that is only relevant when analysing COI data as well):
# "GENE" was converted to a factor above; assigning a value that is not among
# the factor levels only raises a warning and stores NA, so the level is
# registered first in case it is missing
if (is.factor(predictors$GENE) && !("18Sonly" %in% levels(predictors$GENE))) {
  levels(predictors$GENE) <- c(levels(predictors$GENE), "18Sonly")
}
# "%in%" never returns NA, the logical vector can select the rows directly
predictors[
  rownames(predictors) %in% c("2.10.E", "2.10.C"),
  "GENE"
] <- "18Sonly"
#' # Write data to disk
#'
#' The data frame created by this script and the work-space image are saved. The
#' number in front of the file name denotes the source script.
# formatted data frame only
save(list = "predictors", file = path_predictors_out)
# complete work-space
save.image(file = path_workspace)
#' # Session info
#'
#' The code and output in this document were tested and generated in the
#' following computing environment:
#+ echo=FALSE
# R version, platform and package versions of the current session
utils::sessionInfo()