-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path40_main_analysis.r
739 lines (604 loc) · 31.1 KB
/
40_main_analysis.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
#' ---
#' title: "Main analysis"
#' author: "Paul Czechowski"
#' date: "December 5th, 2016"
#' output: pdf_document
#' toc: true
#' highlight: zenburn
#' bibliography: ./references.bib
#' ---
#'
#' # Preface
#'
#' This text was re-rendered in the proofing stage, it is slightly newer then
#' the versions available in the released online repositories. This code is
#' tested using a raw R terminal. Path names are defined relative
#' to the project directory. This code commentary is included in the R code
#' itself and can be rendered at any stage using
#' `rmarkdown::render ("40_main_analysis.r")`. Please check the session info
#' at the end of the document for further notes on the coding environment.
#'
#' # Prerequisites to run this analysis
#'
#' * This script is in the parent directory of the repository.
#' * Scripts `10_import_predictors.r`, `20_format_predictors.r`,
#' and `30_format_phyloseq.r`were run.
#' * Script `00_functions.r` is available .
#' * Output of these scripts is available throughout the repository directory
#' tree.
#'
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Environment preparation
#'
#' ## Package loading and cleaning of work-space
#+ message=FALSE, results='hide'
library ("phyloseq") # required for working with phyloseq objects
library ("ggbiplot") # package for simple generation of PCA biplots
library ("ggplot2") # package used for barplot (for themes())
library ("corrplot") # plotting of correlations ...
library ("ggcorrplot") # ... which can be saved to disk
library ("gridExtra") # re-arranging graphical objects
library ("GGally") # scatter plot matrices (geochemical and combined data)
library ("vegan") # CCA and MDS
library ("dplyr") # for easier reading of complex expressions
rm(list=ls()) # clear R environment
# working directory needs to be set manually for
# cross-platform compatibility
#' ## Setting locations for data import and export
#'
#' This script uses the objects generated by `30_format_phyloseq.r` that are located
#' in the `Zenodo` directory tree. It will also write to that location. The
#' number in front of the file name denotes the source script.
#'
#' ### Import locations
#'
#' Path to filtered, CSS abundance-corrected data with curated taxonomy and field
#' measurements.
path_phsq_ob <- file.path ("Zenodo/R_Objects/35_phsq_ob.Rdata",
fsep = .Platform$file.sep)
#' ### Export locations
#'
#' Workspace image for future reference, after plotiing and after analyses.
path_workspace_a <- file.path ("Zenodo/R_Objects/40_010_workspace.Rdata",
fsep = .Platform$file.sep)
path_workspace_b <- file.path ("Zenodo/R_Objects/40_020_workspace.Rdata",
fsep = .Platform$file.sep)
#' Graphics for main manuscript and supplemental information.
path_abnds <- file.path ('Zenodo/R_Output/40-010_abundances.pdf',
fsep = .Platform$file.sep)
path_coord <- file.path ('Zenodo/R_Output/40-020_coordinates.csv',
fsep = .Platform$file.sep)
path_brplts <- file.path ('Zenodo/R_Output/40-030_barplots.pdf',
fsep = .Platform$file.sep)
path_map_pcm <- file.path ('Zenodo/R_Output/40-040_map_pcm.pdf',
fsep = .Platform$file.sep)
path_all_cor <- file.path ('Zenodo/R_Output/40-050_cor_all_vars.pdf',
fsep = .Platform$file.sep)
path_min_cor <- file.path ('Zenodo/R_Output/40-060_cor_min_vars.pdf',
fsep = .Platform$file.sep)
path_sca_tra <- file.path ('Zenodo/R_Output/40-070_sca_tra_all_vars.pdf',
fsep = .Platform$file.sep)
path_vio_tra <- file.path ('Zenodo/R_Output/40-080_vio_tra_all_vars.pdf',
fsep = .Platform$file.sep)
path_pca_var <- file.path ('Zenodo/R_Output/40-090_pca_var_all_vars.pdf',
fsep = .Platform$file.sep)
path_pca_bip <- file.path ('Zenodo/R_Output/40-110_pca_bip_all_vars.pdf',
fsep = .Platform$file.sep)
path_mds_hip <- file.path ('Zenodo/R_Output/40-120_mds_mds_all_vars.pdf',
fsep = .Platform$file.sep)
path_hmp_all <- file.path ('Zenodo/R_Output/40-130_hmp_____all_vars.pdf',
fsep = .Platform$file.sep)
path_regr <- file.path ('Zenodo/R_Output/40-140_regr_age_chem.pdf',
fsep = .Platform$file.sep)
#' ## Loading functions
#'
#' This script uses many functions, and it impractical to have them all in here.
#' They are loaded from `00_functions.R`.
source (file.path ("00_functions.r", fsep = .Platform$file.sep))
#' ## Data import
#'
#' Phylotype data is imported using basic R functionality. Imported are `phyloseq`
#' objects [@McMurdie2013].
load (path_phsq_ob) # object name is "phsq_ob"
#' ## Defining predictor categories
#'
#' These character vectors can be passed to `get_predictors()` to create data
#' frames with customised variable content.
geochems <- c ("AMMN", "NITR", "POTA", "SLPH", "COND", "PHCC", "PHOS", "CARB",
"PHHO")
# "AMMN" and "NITR" with 57% and 47% undefined data
# may be removed, these may hold important
# information. "PHOS" "CARB" may be removed because not
# relevant for MM?
minerals <- c ("QUTZ", "FDSP", "TTAN", "PRAG", "MICA", "DOLO", "KAOC")
# "CALC" with 33 NA's and "CHLR" with 52 NA's
# are removed, otherwise the MM mineral composition
# can't be analysed
location <- c ("AREA")
position <- c ("LONG", "LATI") # for spatial distance matrices
raw_ages <- c ("LAGE", "HAGE") # low and high age estimate
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Describe imported `phyloseq` object
#'
#' ## Invertebrate abundances per sample
#'
#' The following plots are agglomerated on a specified level by calling
#' `agglomerate()`. The first plot shows proportional abundance of invertebrates
#' in rarefied data. In the second plot, all abundances are converted to "1",
#' (via `make_binary ()`)and the rank composition is more visible. Create the
#' plots.:
pl1 <- barplot_samples (agglomerate (phsq_ob, "Class"), "Class")
pl2 <- barplot_samples (make_binary (agglomerate (phsq_ob, "Class")), "Class")
#' Showing the plots.:
#+ message=FALSE, results='hide', warning=FALSE, fig.width=7, fig.height=7, dpi=200, fig.align='center', fig.cap="Invertebrate abundances at selected taxonomic level. (approx. code line 149)"
grid.arrange(pl1, pl2, nrow = 2)
#' Save the plots.:
ggsave (file = path_abnds, plot = arrangeGrob (pl1, pl2, nrow = 2),
dpi = 200, width = 7, height = 7, units = "in")
#' Garbage collection.
rm(pl1, pl2)
#' ## Invertebrate abundances per location
#'
#' Bar-plot taxa by locations. Firstly, building a list which can be used
#' to give and take from plotting function. "[[ ]]" will get the atomic
#' vector rather then a sub-data frame, to give a character vector of length
#' three (3 locations).
areas <- unique (as.character (as.data.frame (sample_data (phsq_ob))[["AREA"]]))
#' An object will be agglomerated on class level to match the previous plots.
phsq_agg <- agglomerate (phsq_ob, "Class")
#' `subset_samples()` and `lapply()` are still not friends, but subsetting
#' per location is still necessary.
phsq_obs <- list (subset_samples (phsq_ob,
sample_data (phsq_ob)[, "AREA"] == areas[[1]]),
subset_samples (phsq_ob,
sample_data (phsq_ob)[, "AREA"] == areas[[2]]),
subset_samples (phsq_ob,
sample_data (phsq_ob)[, "AREA"] == areas[[3]]))
#' I will also filter for empty sample columns, as no empty are columns wanted
#' in plots.
phsq_obs <- lapply (phsq_obs, remove_empty)
#' Looks like `plot_bar()` doesn't like `lapply()` either, as it looks, so I
#' need to create a plot for each location individually.
plots <- list (plot_bar (phsq_obs[[1]], fill = "Class", facet_grid = "Phylum~AREA") +
theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1,
size = 8), strip.text.x = element_text(size = 12),
strip.text.y = element_text(size = 12, angle = 0),
axis.text.y = element_text (size = 8)) +
geom_bar ( aes(color = Class, fill = Class), stat="identity",
position="stack"),
plot_bar (phsq_obs[[3]], fill = "Class", facet_grid = "Phylum~AREA") +
theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1,
size = 8), strip.text.x = element_text(size = 12),
strip.text.y = element_text(size = 12, angle = 0),
axis.text.y = element_text (size = 8)) +
geom_bar ( aes(color = Class, fill = Class), stat="identity",
position="stack"),
plot_bar (phsq_obs[[2]], fill = "Class", facet_grid = "Phylum~AREA") +
theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1,
size = 8), strip.text.x = element_text(size = 12),
strip.text.y = element_text(size = 12, angle = 0),
axis.text.y = element_text (size = 8)) +
geom_bar ( aes(color = Class, fill = Class), stat="identity",
position="stack"))
#' Plots can now be shown, and saved to the output directory. Objects are then
#' discarded.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=7, fig.height=10, dpi=200, fig.align='center', fig.cap="Invertebrate class and phylum composition per sampling location, with CSS'd abundances. Note: This is not the same figure as in the main text. (Approx. code line 209)"
grid.arrange(plots[[1]], plots[[2]], plots[[3]], nrow = 3)
#' Saving plots.:
ggsave (file = path_brplts, plot = arrangeGrob (plots[[1]], plots[[2]],
plots[[3]], nrow = 3),
dpi = 200, width = 7, height = 10, units = "in")
#' Garbage collection.:
rm (plots, phsq_obs, phsq_agg)
#' ## Map in addition to QGIS map
#'
#' Here using an old function from repository `pcm_modelling`, which was minimally
#' adjusted for the current repository. Creating the map, plotting is here, and
#' saving it to file. Garbage collection afterwards.
phsq_map <- map_samples(phsq_ob, col_long = "LONG", col_lat = "LATI",
subtitle = "18S data")
#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center', fig.cap="Sample range of responses and predictors (approx. code line 228)"
plot(phsq_map)
ggsave (path_map_pcm, plot = phsq_map, dpi = 200, width = 5.5, height = 4.5,
units = "in")
rm (phsq_map)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Retrieve and write information from imported `phyloseq` object
#'
#' Expand this section if further information is necessary for writing.
#'
#' ## Mean elevation per sample
#'
#' Get elevation range for locations:
temp <- data.frame (sample_data (phsq_ob)[ , c("ELEV", "AREA")])
#' Output mean elevations per location:
summary (temp [ which (temp$AREA == "MM"), "ELEV" ]) # MM
summary (temp [ which (temp$AREA == "ME"), "ELEV" ]) # ME
summary (temp [ which (temp$AREA == "LT"), "ELEV" ]) # LT
#' ## Write sample coordinates to be used with GIS software
#'
#' Agglomeration beforehand is done only so that the data matches subsequent
#' analyses, just in case agglomerate changes the data (which it shouldn't, but
#' I haven't tested that is doesn't). The `.csv`file is used by
#' in Qgis file `/Zenodo/Qgis/map.qgs` in conjuction with the Quantarctica
#' package. `dplyr` doesn't work here, storing object is necessary.:
coordinates <- sample_data (agglomerate (phsq_ob, "Class"))[ , c ("LONG", "LATI",
"AREA", "GENE")]
#' See previously defined export path:
write.table (as.data.frame (coordinates), file = path_coord, row.names = TRUE,
col.names = TRUE)
#' Garbage collection:
rm (coordinates, temp)
#' # Saving intermediate workspace
#'
#' Saving of temporary work space during coding, so that, code above doesn't
#' have to be run all the time, when changing the tiniest bit downstream.
save.image (path_workspace_a)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Exploratory data analysis
#'
#' ## Isolating data for this analysis
#'
#' Starting, again by getting the data frames. For now, unless `get_list()`
#' is modified, a lot is crammed into vector `pred_cat` which will be returned
#' in `obs`. Can't include `raw_ages` here, since it's rich in `NA`s and these
#' are filtered by `get_list()`.
matr_ana <- get_list (phsq_ob, tax_rank = "Class", pred_cat = c (geochems,
minerals, position), pres_abs = FALSE)
#' Returned as `data.frames` in list are `spc`, `obs`, `grp`, and `gen`.
grp <- matr_ana[["grp"]] # for PCA labels
spc <- matr_ana[["spc"]] # species data
#' `obs` is a mixture of things, which need to be treated separately in the
#' following. Hence some isolation work is necessary. The objects are erased from
#' the input list, as a precaution. ** There is a semicolon here!** Afterwards
#' `matr_ana[["obs"]]` is empty.
posi <- matr_ana[["obs"]] [ , position]; matr_ana[["obs"]] [ , position] <- NULL
chem <- matr_ana[["obs"]] [ , geochems]; matr_ana[["obs"]] [ , geochems] <- NULL
minl <- matr_ana[["obs"]] [ , minerals]; matr_ana[["obs"]] [ , minerals] <- NULL
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' ## Checking soil mineral and soil geochemical data
#'
#' ### Showing the initial state of the mineral and chemical data
#'
#' Isolated and combined mineral and chemical data from the `phyloseq` object,
#' congruent with rows in `spc`, `grp`, and `gen`.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=16, fig.height=16, dpi=200, fig.align='center', fig.cap="Isolated and combined mineral and chemical data, unmodified. (approx. code line 317)"
ggpairs ( data.frame (chem, minl, check.rows = TRUE))
summary ( data.frame (chem, minl, check.rows = TRUE))
str (data.frame (chem, minl, check.rows = TRUE))
#' Violin plots for unmodified chemical data.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Chemical data per location, unmodified. (approx. code line 323)"
add_discretex (chem, grp, dfr_x_name = "AREA") %>%
get_violinplotlist ( . ,"AREA") %>%
marrangeGrob ( . , nrow=3, ncol=3)
#' Violin plots for unmodified mineral data.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Mineral data per location, unmodified. (approx. code line 329)"
add_discretex (minl, grp, dfr_x_name = "AREA") %>%
get_violinplotlist ( . ,"AREA") %>%
marrangeGrob ( . , nrow=3, ncol=3)
#' ### Removing outliers in soil geochemical observations
#'
#' Many ordination approaches are sensitive to outliers, and perhaps this also
#' affects modelling approaches. Outliers are removed here and replaced with the
#' mean. If desired this could perhaps be left out at later stages. This is not
#' done for the rank / compositional mineral data, but only the soil geochemical
#' values.
chem <- remove_outliers (chem)
#' ### Transformations
#'
#' These are helpful for all modelling attempts, PCA, and ANOVA approaches.
#'
#' Centred log ration transform for the mineral data as recommended by [@Ranganathan2011],
#' in order to use this data for PCA and alongside the chemical data. The
#' transformation will remove rows from the data farame.
minl <- data.frame (transform_clr (minl))
#' Yeo-Johnson transformation conducted here after recombining the data frame
#' (and garbage collection).
# merging of data frame and garbage collection
obs <- merge(chem, minl, by = "row.names", all = TRUE); rownames (obs) <-
obs$Row.names; obs$Row.names <- NULL ; rm (chem); rm(minl)
# transformation of combined mineral an chemical data
obs <- transform_any (obs, method = c ("center", "scale", "YeoJohnson", "nzv"))
obs <- obs[complete.cases(obs), ]
#' ### Plotting correlations
#'
#' A simple function call to see precisely, which variables are correlated.
#' Writing correlation plots to disk doesn't work (easily) with `corrplot()` using
#' `ggcorrplot()` instead now (shorten this!).
# getting correlations and p-values
corr_all <- ggcorrplot(cor (obs), hc.order = TRUE, type = "lower",
outline.col = "white", p.mat = cor_pmat(obs),
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"))
corr_min <- ggcorrplot(cor (remove_cocorrelated(obs)), hc.order = TRUE, type = "lower",
outline.col = "white", p.mat = cor_pmat(remove_cocorrelated(obs)),
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"))
# plot out
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=8, dpi=200, fig.align='center', fig.cap="removal of correlations in yj transformed chemical data and clr yj transformed mineral data. (approx. code line 381)"
grid.arrange (corr_all, corr_min , ncol=2)
# write to dik
ggsave (path_all_cor, plot = corr_all, dpi = 200, width = 8, height = 4.5,
units = "in")
ggsave (path_min_cor, plot = corr_min, dpi = 200, width = 8, height = 4.5,
units = "in")
# garbage cleaning
rm (path_all_cor, path_min_cor, corr_all, corr_min)
#' ### Removing the correlated variables
#'
#' In addition to the plot above, the text confirms which
#' variables are kept, and which ones are removed.
obs <- remove_cocorrelated(obs)
#' ### Plotting out transformed mineral and chemical data
#'
#' Plot the data before it goes to into further analyses, only for checking.
#' Initially a scatterplot.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Scatterplot of centred and scaled mineral and chemical data (approx. code line 404)."
# saving this plot for disk-write
ggpairs (obs) # less correlation, less skew?
summary (obs)
# write data to disk (slooooooow)
pdf(path_sca_tra, height = 10, width = 10)
g <- ggpairs(obs)
print(g)
dev.off()
# garbage collection
rm (g)
#' And the violin plots.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Violin plot of scaled mineral and chemical data. Note that several samples have been excluded when transforming merging mineral and chemical data (approx. code line 419)."
add_discretex (obs, grp, dfr_x_name = "AREA") %>%
get_violinplotlist ( . ,"AREA") %>%
marrangeGrob ( . , nrow=5, ncol=3)
# write data to disk (slooooooow)
g <- add_discretex (obs, grp, dfr_x_name = "AREA") %>%
get_violinplotlist ( . ,"AREA") %>%
marrangeGrob ( . , nrow=5, ncol=3)
ggsave (path_vio_tra, plot = g, dpi = 200, width = 9, height = 9,
units = "in")
# garbage collection
rm (g)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' ## Principal component analysis of observations
#'
#' ### Getting the principal components
#'
#' Principal components are retrieved for the `obs` data frame with the mineral
#' analysis values.
pcs <- prcomp (obs, center = FALSE, scale = FALSE)
#' ### Testing the principal components
#'
#' Testing the principal components with a variance plot. Here on should only
#' interpret principal components with variances above 1.
#+ message=FALSE, warning=FALSE, fig.width=5, fig.height=3, dpi=200, fig.align='center', fig.cap="PCA variance plots of centred and scaled mineral data (approx. code line 452)"
plot_pcvars(pcs)
# write to disk
ggsave (path_pca_var, plot = last_plot (), path = NULL, scale = 1, width = 7,
height = 3, units = "in", dpi = 300)
#' ### Bi-plot
#'
#' The biplot can be generated, after the variables in `grp` are subset to match
#' the `pcs` object, necessary for correct ovals in the plot.
#+ message=FALSE, warning=FALSE, fig.width=8, fig.height=8, dpi=200, fig.align='center', fig.cap="PCA biplots on clr/vj-transformed mineral data and yj-transformed chemical data. (approx. code line 463)"
get_biplot(pcs, shorten_groups (grp, obs)) # `grp` is needed later without type
# conversion - don't expand this
# expression!
# write to disk
ggsave (path_pca_bip, plot = last_plot (), scale = 1, width = 7,
height = 7, units = "in", dpi = 300)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' ## Checking species information
#'
#' ### Showing the initial state of the species information
#'
#' Here used is the initial abundance data, which is abundance corrected using
#' the CSS algorithm in Qiime [@Paulson2013] _"With CSS, raw counts are divided
#' by the cumulative sum of counts up to a percentile determined using a data-driven
#' approach."_ The data is shown using a scatterplot and a violin plot.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Scatterplot of cumulative sum-scaled species observations (approx. code line 485)."
ggpairs (spc)
summary (spc)
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Violin plot of cumulative sum-scaled species observations (approx. code line 488)."
add_discretex ( as.data.frame(spc), grp, dfr_x_name = "AREA") %>%
get_violinplotlist ( . ,"AREA") %>%
marrangeGrob ( . , nrow=3, ncol=2)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Non-Metric Multidimensional Scaling
#'
#' ## Matching up phylotype and factor information.
#'
#' Species observations have to match the observations.
spc <- cmpl_phylotypes(spc, obs)
#' ## Getting a `metaMDS` object from the species data
#'
#' Calculating a `metaMDS` object on the `spc` data, for now without environmental
#' variables. Distance `jaccard` appears to be recommended in `vegan()`,
#' `binary = TRUE` ensures correct distance calculation on presence / absence
#' data. `try` is set higher, to look longer for solutions. Stress > 0.05 provides
#' an excellent representation in reduced dimensions, > 0.1 is great, > 0.2 is
#' fair, and stress > 0.3 provides a poor representation. Inspect the generated
#' model in the end.
spc_mds <- metaMDS(spc, distance = "bray" , k = 2, try = 1000, trymax = 2000,
noshare = FALSE, wascores = TRUE, trace = 0, plot = FALSE, expand = TRUE,
binary = FALSE)
spc_mds # stress value is 0.04411049
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=8, dpi=200, fig.align='center', fig.cap="NMDS stressplot (approx. code line 519)."
stressplot(spc_mds)
#' ## Fitting environmental vectors
#'
env <- envfit(spc_mds, obs, permutations = 9999)
env
#+ message=FALSE, warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center', fig.cap="NMDS plot of species and mineral data, SLPH with (*) significance here. Blue: Mount Menzies, Red: Lake Terrasovoje, Green: Mawson Escarpment (approx. code line 527)."
par (mfrow = c (1, 1))
ordiplot (spc_mds, display = "sites" )
# ordihull (spc_mds, shorten_groups (grp, obs), col = c ("coral3", "chartreuse4", "cornflowerblue"))
ordiellipse(spc_mds, shorten_groups (grp, obs) , display = "sites", kind = "se",
conf = 0.95, label = T, col = c ("red", "chartreuse4", "cornflowerblue"))
orditorp (spc_mds, display = "species", col="deepskyblue4", air = 0.01, cex = 1.0)
with (obs, ordisurf (spc_mds, SLPH, add = TRUE, col = "darkgoldenrod4" ))
# with (obs, ordisurf (spc_mds, AMMN, add = TRUE, col = "orange"))
# write data to disk (slooooooow)
pdf(path_mds_hip, height = 9, width = 9)
par (mfrow = c (1, 1))
ordiplot (spc_mds, display = "sites" )
# ordihull (spc_mds, shorten_groups (grp, obs), col = c ("coral3", "chartreuse4", "cornflowerblue"))
ordiellipse(spc_mds, shorten_groups (grp, obs) , display = "sites", kind = "se",
conf = 0.95, label = T, col = c ("red", "chartreuse4", "cornflowerblue"))
orditorp (spc_mds, display = "species", col="deepskyblue4", air = 0.01, cex = 1.0)
with (obs, ordisurf (spc_mds, SLPH, add = TRUE, col = "darkgoldenrod4" ))
# with (obs, ordisurf (spc_mds, AMMN, add = TRUE, col = "orange"))
dev.off()
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' ## `adonis` analysis
#'
#' Class beta diversity, expressed as distance `z = (log(2)-log(2*a+b+c)+log(a+b+c))/log(2)`
#' may be function of group means of SLPH.
ad <- adonis (formula = betadiver ( spc, "z") ~ SLPH, data = obs, perm = 9999)
ad
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#' ## Canonical correspondence analysis
#'
#' ### Calculate the CCA
#'
#' Here using also the Sulphur variables, as it was the only one that showed
#' persistent significance across all tests. Show the result.
inv_slph <- cca(spc ~ SLPH , data = obs)
inv_slph
# output of the summary function is very long, I'll mute this
# summary(inv_slph)
#' ### Testing CCA results
#'
#' Permutation testing
anova (inv_slph, perm = 9999)
#' Testing the (single) axis for significance
anova (inv_slph, by="axis", perm = 1000)
#' Type I test
anova (inv_slph, by="term", perm = 1000) # SLPH
#' Type III test
anova(inv_slph, by="margin", perm = 1000) # SLPH
#' Variance inflation factor
vif.cca(inv_slph)
#' Goodness of fit for classes
goodness(inv_slph)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#' ## Regression analyses
#'
#' ### Across all samples
#'
#' Isolate ages for analysis, get_list() prunes undefined values across the whole returned
#' table.
ages <- get_list (phsq_ob, tax_rank = "Class", pred_cat = c (raw_ages, geochems,
minerals), pres_abs = FALSE)
#' Extract observations from list into dataframe to work with them more easily.
ages <- ages[["obs"]];
#' Get the row means across higher and lower estimates, only one column is desired
#' for further analysis.
ages$MNAGE <- -1 * rowMeans (ages[ ,c("HAGE", "LAGE")])
#' #### Plot all regressions
#'
#' Make plotting easier by using `lapply()` to generate plots, for which one
#' needs a vector to loop over.
y_axis <- c("AMMN", "NITR", "POTA", "SLPH", "COND", "PHCC", "PHOS", "CARB",
"PHHO")
#' Create a list of graphical objects for plotting on one page.
regr_list <- lapply (y_axis, plot_regrs, ages)
#' Plots can now be shown, and saved to the output directory. Objects are then
#' discarded.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center', fig.cap="Regressions between all terrrain ages and soil geochemical measurments. (Approx. code line 635)"
marrangeGrob (regr_list, nrow = 3, ncol = 3)
#' Saving plots.:
ggsave (file = path_regr, plot = marrangeGrob (regr_list, nrow = 3, ncol = 3),
dpi = 200, width = 10, height = 10, units = "in")
#' #### Testing all regressions
#'
#' Calculate all regressions, with variables from previous section, and store
#' as list.
regressions <- lapply (y_axis, function(x) {
lm ( substitute (MNAGE ~ i, list(i = as.name(x))), data = ages)
})
#' Return summary from list for more specific results
lapply (regressions, summary )
#' ### Region specific
#'
#' Isolate ages for analysis, get_list() prunes undefined values across the whole returned
#' table.
ages <- get_list (phsq_ob, tax_rank = "Class", pred_cat = c (raw_ages, geochems,
minerals), pres_abs = FALSE)
#' Extract observations from list into dataframe to work with them more easily.
ages <- cbind(ages[["obs"]], ages[["grp"]])
#' Get the row means across higher and lower estimates, only one column is desired
#' for further analysis.
ages$MNAGE <- -1 * rowMeans (ages[ ,c("HAGE", "LAGE")])
#' Split dataframes by location
agesMM = ages[ which (ages$grp == "MM"), ]
agesME = ages[ which (ages$grp == "ME"), ]
agesLT = ages[ which (ages$grp == "LT"), ]
#' #### Plot all regressions
#'
#' Make plotting easier by using `lapply()` to generate plots, for which one
#' needs a vector to loop over.
y_axis <- c("AMMN", "NITR", "POTA", "SLPH", "COND", "PHCC", "PHOS", "CARB",
"PHHO")
#' Create a list of graphical objects for plotting on one page.
regr_listMM <- lapply (y_axis, plot_regrs, agesMM)
regr_listME <- lapply (y_axis, plot_regrs, agesME)
regr_listLT <- lapply (y_axis, plot_regrs, agesLT)
#' Plots can now be shown, and saved to the output directory. Objects are then
#' discarded.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center', fig.cap="Mount Menzies: Regressions between terrrain age and soil geochemical measurements (Approx. code line 689)."
marrangeGrob (regr_listMM, nrow = 3, ncol = 3)
#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center', fig.cap="Mawson Escarpment: Regressions between terrrain age and soil geochemical measurements (Approx. code line 692)."
marrangeGrob (regr_listME, nrow = 3, ncol = 3)
#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center', fig.cap="Lake Terrasovoje: Regressions between terrrain age and soil geochemical measurements (Approx. code line 695)."
marrangeGrob (regr_listLT, nrow = 3, ncol = 3)
#' #### Testing all regressions
#'
#' Calculate all regressions, with variables from previous section, and store
#' as list.
regressionsMM <- lapply (y_axis, function(x) {
lm ( substitute (MNAGE ~ i, list(i = as.name(x))), data = agesMM)
})
regressionsME <- lapply (y_axis, function(x) {
lm ( substitute (MNAGE ~ i, list(i = as.name(x))), data = agesME)
})
regressionsLT <- lapply (y_axis, function(x) {
lm ( substitute (MNAGE ~ i, list(i = as.name(x))), data = agesLT)
})
#' Return summary from list for more specific results
lapply (regressionsMM, summary )
lapply (regressionsME, summary )
lapply (regressionsLT, summary )
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Write data to disk
#'
#' Saved are object created by this script as well as command history and work-space
#' image.
save.image (path_workspace_b) # work-space
#' # Session info
#'
#' The code and output in this document were tested and generated in the
#' following computing environment:
#+ echo=FALSE
sessionInfo()
#' # References