40_main_analysis.r

#' ---
#' title: "Main analysis"
#' author: "Paul Czechowski"
#' date: "December 5th, 2016"
#' output: pdf_document
#' toc: true
#' highlight: zenburn
#' bibliography: ./references.bib
#' ---
#'
#' # Preface
#' 
#' This text was re-rendered in the proofing stage, it is slightly newer then
#' the versions available in the released online repositories. This code is 
#' tested using a raw R terminal. Path names are defined relative 
#' to the project directory. This code commentary is included in the R code 
#' itself and can be rendered at any stage using 
#' `rmarkdown::render ("40_main_analysis.r")`. Please check the session info 
#' at the end of the document for further notes on the coding environment.
#' 
#' # Prerequisites to run this analysis
#' 
#' * This script is in the parent directory of the repository. 
#' * Scripts `10_import_predictors.r`, `20_format_predictors.r`,
#'   and `30_format_phyloseq.r`were run.
#' * Script `00_functions.r` is available .
#' * Output of these scripts is available throughout the repository directory
#'   tree.
#' 
#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#' 
#' # Environment preparation
#'
#' ## Package loading and cleaning of work-space
#+ message=FALSE, results='hide'
library ("phyloseq")   # required for working with phyloseq objects
library ("ggbiplot")   # package for simple generation of PCA biplots
library ("ggplot2")    # package used for barplot (for themes()) 
library ("corrplot")   # plotting of correlations ...
library ("ggcorrplot") # ... which can be saved to disk 
library ("gridExtra")  # re-arranging graphical objects 
library ("GGally")     # scatter plot matrices (geochemical and combined data)
library ("vegan")      # CCA and MDS
library ("dplyr")      # for easier reading of complex expressions
rm(list=ls())          # clear R environment
                       # working directory needs to be set manually for
                       #   cross-platform compatibility

#' ## Setting locations for data import and export
#' 
#' This script uses the objects generated by `30_format_phyloseq.r` that are located
#' in the `Zenodo` directory tree. It will also write to that location. The 
#' number in front of the file name denotes the source script.
#'
#' ### Import locations
#'
#' Path to filtered, CSS abundance-corrected data with curated taxonomy and field
#' measurements.
path_phsq_ob <- file.path ("Zenodo/R_Objects/35_phsq_ob.Rdata",
                           fsep = .Platform$file.sep) 
#' ### Export locations
#'
#' Workspace image for future reference, after plotiing and after analyses.
path_workspace_a    <- file.path ("Zenodo/R_Objects/40_010_workspace.Rdata",
                                fsep = .Platform$file.sep)

path_workspace_b    <- file.path ("Zenodo/R_Objects/40_020_workspace.Rdata",
                                fsep = .Platform$file.sep)

#' Graphics for main manuscript and supplemental information.
path_abnds   <- file.path ('Zenodo/R_Output/40-010_abundances.pdf', 
                          fsep = .Platform$file.sep)
path_coord   <- file.path ('Zenodo/R_Output/40-020_coordinates.csv', 
                          fsep = .Platform$file.sep)
path_brplts  <- file.path ('Zenodo/R_Output/40-030_barplots.pdf', 
                          fsep = .Platform$file.sep)
path_map_pcm <- file.path ('Zenodo/R_Output/40-040_map_pcm.pdf', 
                          fsep = .Platform$file.sep)
path_all_cor <- file.path ('Zenodo/R_Output/40-050_cor_all_vars.pdf', 
                          fsep = .Platform$file.sep)
path_min_cor <- file.path ('Zenodo/R_Output/40-060_cor_min_vars.pdf', 
                          fsep = .Platform$file.sep)
path_sca_tra <- file.path ('Zenodo/R_Output/40-070_sca_tra_all_vars.pdf', 
                          fsep = .Platform$file.sep)
path_vio_tra <- file.path ('Zenodo/R_Output/40-080_vio_tra_all_vars.pdf', 
                          fsep = .Platform$file.sep)
path_pca_var <- file.path ('Zenodo/R_Output/40-090_pca_var_all_vars.pdf', 
                          fsep = .Platform$file.sep)
path_pca_bip <- file.path ('Zenodo/R_Output/40-110_pca_bip_all_vars.pdf', 
                          fsep = .Platform$file.sep)
path_mds_hip <- file.path ('Zenodo/R_Output/40-120_mds_mds_all_vars.pdf', 
                          fsep = .Platform$file.sep)
path_hmp_all <- file.path ('Zenodo/R_Output/40-130_hmp_____all_vars.pdf', 
                          fsep = .Platform$file.sep)
path_regr    <- file.path ('Zenodo/R_Output/40-140_regr_age_chem.pdf', 
                          fsep = .Platform$file.sep)


#' ## Loading functions
#'
#' This script uses many functions, and it impractical  to have them all in here.
#' They are loaded from `00_functions.R`.
source (file.path ("00_functions.r", fsep = .Platform$file.sep))

#' ## Data import
#' 
#' Phylotype data is imported using basic R functionality. Imported are `phyloseq`
#' objects [@McMurdie2013].
load (path_phsq_ob) # object name is "phsq_ob"

#' ## Defining predictor categories
#' 
#' These character vectors can be passed to `get_predictors()` to create data 
#' frames with customised variable content.
geochems <- c ("AMMN", "NITR", "POTA", "SLPH", "COND", "PHCC",  "PHOS", "CARB",
               "PHHO")
                        # "AMMN" and "NITR" with 57% and 47% undefined data
                        #   may be removed, these may hold important 
                        #   information. "PHOS" "CARB" may be removed because not 
                        #   relevant for MM?
minerals <- c ("QUTZ", "FDSP", "TTAN", "PRAG", "MICA", "DOLO", "KAOC")  
                        # "CALC" with 33 NA's and "CHLR" with 52 NA's
                        #   are removed, otherwise the MM mineral composition
                        #   can't be analysed 
location <- c ("AREA")
position <- c ("LONG", "LATI") # for spatial distance matrices
raw_ages <- c ("LAGE", "HAGE") # low and high age estimate

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Describe imported `phyloseq` object 
#' 
#' ## Invertebrate abundances per sample
#'
#' The following plots are agglomerated on a specified level by calling 
#' `agglomerate()`. The first plot shows proportional abundance of invertebrates
#' in rarefied data. In the second plot, all abundances are converted to "1",
#' (via `make_binary ()`)and the rank composition is more visible. Create the 
#' plots.:
pl1 <- barplot_samples (agglomerate (phsq_ob, "Class"), "Class")
pl2 <- barplot_samples (make_binary (agglomerate (phsq_ob, "Class")), "Class")

#' Showing the plots.:
#+ message=FALSE, results='hide', warning=FALSE, fig.width=7, fig.height=7, dpi=200, fig.align='center',  fig.cap="Invertebrate abundances at selected taxonomic level. (approx. code line 149)"
grid.arrange(pl1, pl2, nrow = 2)

#' Save the plots.:
ggsave (file = path_abnds, plot = arrangeGrob (pl1, pl2, nrow = 2), 
        dpi = 200, width = 7, height = 7, units = "in")

#' Garbage collection.
rm(pl1, pl2)

#' ## Invertebrate abundances per location
#'
#' Bar-plot taxa  by locations. Firstly, building a list which can be used 
#' to give and take from plotting function. "[[ ]]" will get the atomic 
#' vector rather then a sub-data frame, to give a character vector of length 
#' three (3 locations).
areas <- unique (as.character (as.data.frame (sample_data (phsq_ob))[["AREA"]]))

#' An object will be agglomerated on class level to match the previous plots. 
phsq_agg <- agglomerate (phsq_ob, "Class")

#' `subset_samples()` and `lapply()` are still not friends, but subsetting 
#' per location is still necessary.
phsq_obs <- list (subset_samples (phsq_ob, 
                                  sample_data (phsq_ob)[, "AREA"] == areas[[1]]),
                  subset_samples (phsq_ob, 
                                  sample_data (phsq_ob)[, "AREA"] == areas[[2]]),
                  subset_samples (phsq_ob, 
                                  sample_data (phsq_ob)[, "AREA"] == areas[[3]]))

#' I will also filter for empty sample columns, as no empty are columns wanted 
#' in plots.
phsq_obs <- lapply (phsq_obs, remove_empty)

#' Looks like `plot_bar()` doesn't like `lapply()` either, as it looks, so I 
#' need to create a plot for each location individually. 
plots <- list (plot_bar (phsq_obs[[1]], fill = "Class", facet_grid = "Phylum~AREA") +
                 theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1,
                 size = 8), strip.text.x = element_text(size = 12), 
                 strip.text.y = element_text(size = 12, angle = 0), 
                 axis.text.y = element_text (size = 8)) +
                 geom_bar ( aes(color = Class, fill = Class), stat="identity",
                 position="stack"),
               plot_bar (phsq_obs[[3]], fill = "Class", facet_grid = "Phylum~AREA") + 
                 theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, 
                 size = 8), strip.text.x = element_text(size = 12), 
                 strip.text.y = element_text(size = 12, angle = 0),
                 axis.text.y = element_text (size = 8)) +
                 geom_bar ( aes(color = Class, fill = Class), stat="identity",
                 position="stack"),
               plot_bar (phsq_obs[[2]], fill = "Class", facet_grid = "Phylum~AREA") +
                 theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1,
                 size = 8), strip.text.x = element_text(size = 12), 
                 strip.text.y = element_text(size = 12, angle = 0),
                 axis.text.y = element_text (size = 8)) +
                 geom_bar ( aes(color = Class, fill = Class), stat="identity",
                 position="stack"))

#' Plots can now be shown, and saved to the output directory. Objects are then
#' discarded.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=7, fig.height=10, dpi=200, fig.align='center',  fig.cap="Invertebrate class and phylum composition per sampling location, with CSS'd abundances. Note: This is not the same figure as in the main text. (Approx. code line 209)"
grid.arrange(plots[[1]], plots[[2]], plots[[3]], nrow = 3)

#' Saving plots.:
ggsave (file = path_brplts, plot = arrangeGrob (plots[[1]], plots[[2]],
                                                plots[[3]], nrow = 3), 
        dpi = 200, width = 7, height = 10, units = "in")

#' Garbage collection.:
rm (plots, phsq_obs, phsq_agg)

#' ## Map in addition to QGIS map 
#'
#' Here using an old function from repository `pcm_modelling`, which was minimally
#' adjusted for the current repository. Creating the map, plotting is here, and
#' saving it to file. Garbage collection afterwards.
phsq_map <- map_samples(phsq_ob, col_long = "LONG", col_lat = "LATI", 
  subtitle = "18S data")

#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center',  fig.cap="Sample range of responses and predictors (approx. code line 228)"
plot(phsq_map)

ggsave (path_map_pcm, plot = phsq_map, dpi = 200, width = 5.5, height = 4.5, 
  units = "in")

rm (phsq_map)

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Retrieve and write information from imported `phyloseq` object 
#'
#' Expand this section if further information is necessary for writing.
#' 
#' ## Mean elevation per sample
#'
#' Get elevation range for locations:
temp <- data.frame (sample_data (phsq_ob)[ , c("ELEV", "AREA")])

#' Output mean elevations per location:
summary (temp [ which (temp$AREA == "MM"),  "ELEV" ]) # MM
summary (temp [ which (temp$AREA == "ME"),  "ELEV" ]) # ME
summary (temp [ which (temp$AREA == "LT"),  "ELEV" ]) # LT

#' ## Write sample coordinates to be used with GIS software 
#'
#' Agglomeration beforehand is done only so that the data matches subsequent
#' analyses, just in case agglomerate changes the data (which it shouldn't, but
#' I haven't tested that is doesn't). The `.csv`file is used by
#' in Qgis file `/Zenodo/Qgis/map.qgs` in conjuction with the Quantarctica
#' package. `dplyr` doesn't work here, storing object is necessary.: 
coordinates <- sample_data (agglomerate (phsq_ob, "Class"))[ , c ("LONG", "LATI",
  "AREA", "GENE")]

#' See previously defined export path:
write.table (as.data.frame (coordinates), file = path_coord, row.names	= TRUE,
  col.names = TRUE)
 
#' Garbage collection:
rm (coordinates, temp)

#' # Saving intermediate workspace 
#'
#' Saving of temporary work space during coding, so that, code above doesn't
#' have to be run all the time, when changing the tiniest bit downstream.
save.image (path_workspace_a)

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Exploratory data analysis
#'
#' ##  Isolating data for this analysis 
#' 
#' Starting, again by getting the data frames. For now, unless `get_list()`
#' is modified, a lot is crammed into vector `pred_cat` which will be returned
#' in `obs`. Can't include `raw_ages` here, since it's rich in `NA`s and these
#' are filtered by `get_list()`.
matr_ana <- get_list (phsq_ob, tax_rank = "Class", pred_cat = c (geochems, 
  minerals, position), pres_abs = FALSE)

#' Returned as `data.frames` in list are `spc`, `obs`, `grp`, and `gen`.
grp <- matr_ana[["grp"]] # for PCA labels
spc <- matr_ana[["spc"]] # species data

#' `obs` is a mixture of things, which need to be treated separately in the
#' following. Hence some isolation work is necessary. The objects are erased from
#' the input list, as a precaution. ** There is a semicolon here!** Afterwards
#' `matr_ana[["obs"]]` is empty. 
posi <- matr_ana[["obs"]] [ , position]; matr_ana[["obs"]] [ , position] <- NULL  
chem <- matr_ana[["obs"]] [ , geochems]; matr_ana[["obs"]] [ , geochems] <- NULL  
minl <- matr_ana[["obs"]] [ , minerals]; matr_ana[["obs"]] [ , minerals] <- NULL

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#' 
#' ##  Checking soil mineral and soil geochemical data
#'
#' ### Showing the initial state of the mineral and chemical data
#'
#' Isolated and combined mineral and chemical data from the `phyloseq` object,
#' congruent with rows in `spc`, `grp`, and `gen`.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=16, fig.height=16, dpi=200, fig.align='center', fig.cap="Isolated and combined mineral and chemical data, unmodified. (approx. code line 317)"
ggpairs ( data.frame (chem, minl, check.rows = TRUE))
summary ( data.frame (chem, minl, check.rows = TRUE))
str (data.frame (chem, minl, check.rows = TRUE))

#' Violin plots for unmodified chemical data.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Chemical data per location, unmodified. (approx. code line 323)"
add_discretex (chem, grp, dfr_x_name = "AREA") %>% 
 get_violinplotlist ( . ,"AREA") %>%
 marrangeGrob ( . , nrow=3, ncol=3)
 
#' Violin plots for unmodified mineral data.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Mineral data per location, unmodified. (approx. code line 329)"
add_discretex (minl, grp, dfr_x_name = "AREA") %>% 
 get_violinplotlist ( . ,"AREA") %>%
 marrangeGrob ( . , nrow=3, ncol=3)

#' ###  Removing outliers in soil geochemical observations
#'
#' Many ordination approaches are sensitive to outliers, and perhaps this also
#' affects modelling approaches. Outliers are removed here and replaced with the
#' mean. If desired this could perhaps be left out at later stages. This is not
#' done for the rank / compositional mineral data, but only the soil geochemical
#' values.
chem <- remove_outliers (chem) 

#' ###  Transformations  
#'
#' These are helpful for all modelling attempts, PCA, and ANOVA approaches. 
#' 
#' Centred log ration transform for the mineral data as recommended by [@Ranganathan2011],
#' in order to use this data for PCA and alongside the chemical data. The 
#' transformation will remove rows from the data farame.
minl  <- data.frame (transform_clr (minl))

#' Yeo-Johnson transformation conducted here after recombining the data frame
#' (and garbage collection). 

# merging of data frame and garbage collection 
obs <- merge(chem, minl, by = "row.names", all = TRUE); rownames (obs) <- 
  obs$Row.names; obs$Row.names <- NULL ; rm (chem); rm(minl)

# transformation of combined mineral an chemical data 
obs <- transform_any (obs, method = c ("center", "scale", "YeoJohnson", "nzv")) 
obs <- obs[complete.cases(obs), ]

#' ### Plotting correlations
#'
#' A simple function call to see precisely, which variables are correlated.
#' Writing correlation plots to disk doesn't work (easily) with `corrplot()` using
#' `ggcorrplot()` instead now (shorten this!).

# getting correlations and p-values
corr_all <- ggcorrplot(cor (obs), hc.order = TRUE, type = "lower",
  outline.col = "white", p.mat = cor_pmat(obs),
  ggtheme = ggplot2::theme_gray,
  colors = c("#6D9EC1", "white", "#E46726"))

corr_min <-  ggcorrplot(cor (remove_cocorrelated(obs)), hc.order = TRUE, type = "lower",
  outline.col = "white", p.mat = cor_pmat(remove_cocorrelated(obs)), 
  ggtheme = ggplot2::theme_gray,
  colors = c("#6D9EC1", "white", "#E46726"))

# plot out
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=8, dpi=200, fig.align='center', fig.cap="removal of correlations  in yj transformed chemical data and clr yj transformed mineral data. (approx. code line 381)"
grid.arrange (corr_all,  corr_min , ncol=2)

# write to dik 
ggsave (path_all_cor, plot = corr_all, dpi = 200, width = 8, height = 4.5, 
  units = "in")
ggsave (path_min_cor, plot = corr_min, dpi = 200, width = 8, height = 4.5,
  units = "in")

# garbage cleaning
rm (path_all_cor, path_min_cor, corr_all, corr_min)

#' ###  Removing the correlated variables 
#' 
#' In addition to the plot above, the text confirms which 
#' variables are kept, and which ones are removed. 
obs <- remove_cocorrelated(obs)

#' ### Plotting out transformed mineral and chemical data
#'
#' Plot the data before it goes to into further analyses, only for checking.
#' Initially a scatterplot.

#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Scatterplot of centred and scaled mineral and chemical data (approx. code line 404)."
# saving this plot for disk-write
ggpairs (obs) # less correlation, less skew?
summary (obs)

# write data to disk (slooooooow)
pdf(path_sca_tra, height = 10, width = 10)
g <- ggpairs(obs)
print(g)
dev.off()

# garbage collection
rm (g)

#' And the violin plots.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Violin plot of scaled mineral and chemical data. Note that several samples have been excluded when transforming merging mineral and chemical data (approx. code line 419)."
add_discretex (obs, grp, dfr_x_name = "AREA") %>% 
 get_violinplotlist ( . ,"AREA") %>%
 marrangeGrob ( . , nrow=5, ncol=3)

# write data to disk (slooooooow)
g <- add_discretex (obs, grp, dfr_x_name = "AREA") %>% 
 get_violinplotlist ( . ,"AREA") %>%
 marrangeGrob ( . , nrow=5, ncol=3)

ggsave (path_vio_tra, plot = g, dpi = 200, width = 9, height = 9,
  units = "in")

# garbage collection
rm (g)

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' ## Principal component analysis of observations
#' 
#' ### Getting the principal components
#' 
#' Principal components are retrieved for the `obs` data frame with the mineral
#' analysis values.
pcs <- prcomp (obs, center = FALSE, scale = FALSE) 

#' ### Testing the principal components
#'
#' Testing the principal components with a variance plot. Here on should only
#' interpret principal components with variances above 1.
#+ message=FALSE, warning=FALSE, fig.width=5, fig.height=3, dpi=200, fig.align='center', fig.cap="PCA variance plots of centred and scaled mineral data (approx. code line 452)"
plot_pcvars(pcs)

# write to disk
ggsave (path_pca_var, plot = last_plot (), path = NULL, scale = 1, width = 7, 
  height = 3, units = "in", dpi = 300)

#' ### Bi-plot
#'
#' The biplot can be generated, after the variables in `grp` are subset to match
#' the `pcs` object, necessary for correct ovals in the plot.  
#+ message=FALSE, warning=FALSE, fig.width=8, fig.height=8, dpi=200, fig.align='center', fig.cap="PCA biplots on clr/vj-transformed mineral data and yj-transformed chemical data. (approx. code line 463)"
get_biplot(pcs, shorten_groups (grp, obs)) # `grp` is needed later without type
                                           #  conversion - don't expand this 
                                           #  expression!

# write to disk
ggsave (path_pca_bip, plot = last_plot (), scale = 1, width = 7, 
  height = 7, units = "in", dpi = 300)

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#' 
#' ##  Checking species information 
#'
#' ### Showing the initial state of the species information 
#' 
#' Here used is the initial abundance data, which is abundance corrected using 
#' the CSS algorithm in Qiime [@Paulson2013] _"With CSS, raw counts are divided 
#' by the cumulative sum of counts up to a percentile determined using a data-driven 
#' approach."_ The data is shown using a scatterplot and a violin plot. 
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Scatterplot of cumulative sum-scaled species observations (approx. code line 485)."
ggpairs (spc)
summary (spc) 
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=12, dpi=200, fig.align='center', fig.cap="Violin plot of cumulative sum-scaled species observations (approx. code line 488)."
add_discretex ( as.data.frame(spc), grp, dfr_x_name = "AREA") %>% 
 get_violinplotlist ( . ,"AREA") %>%
 marrangeGrob ( . , nrow=3, ncol=2)

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Non-Metric Multidimensional Scaling 
#' 
#' ## Matching up phylotype and factor information. 
#'
#' Species observations have to match the observations.
spc <- cmpl_phylotypes(spc, obs)

#' ## Getting a `metaMDS` object from the species data
#' 
#' Calculating a `metaMDS` object on the `spc` data, for now without environmental
#' variables. Distance `jaccard` appears to be recommended in `vegan()`, 
#' `binary = TRUE` ensures correct distance calculation on presence / absence 
#' data. `try` is set higher, to look longer for solutions. Stress > 0.05 provides
#' an excellent representation in reduced dimensions, > 0.1 is great, > 0.2 is 
#' fair, and stress > 0.3 provides a poor representation. Inspect the generated 
#' model in the end. 
spc_mds <- metaMDS(spc, distance = "bray" ,  k = 2, try = 1000, trymax = 2000, 
  noshare = FALSE, wascores = TRUE, trace = 0, plot = FALSE, expand = TRUE, 
  binary = FALSE)

spc_mds # stress value is 0.04411049
#+ message=FALSE, results='hide', warning=FALSE, fig.width=12, fig.height=8, dpi=200, fig.align='center', fig.cap="NMDS stressplot (approx. code line 519)."
stressplot(spc_mds)

#' ## Fitting environmental vectors 
#' 
env <- envfit(spc_mds, obs, permutations = 9999)
env

#+ message=FALSE, warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center', fig.cap="NMDS plot of species and mineral data, SLPH with (*) significance  here. Blue: Mount Menzies, Red: Lake Terrasovoje, Green: Mawson Escarpment (approx. code line 527)."
par (mfrow = c (1, 1))
ordiplot (spc_mds, display = "sites" )
# ordihull (spc_mds, shorten_groups (grp, obs), col = c ("coral3", "chartreuse4", "cornflowerblue"))
ordiellipse(spc_mds, shorten_groups (grp, obs) , display = "sites", kind = "se", 
  conf = 0.95, label = T,  col = c ("red", "chartreuse4", "cornflowerblue"))
orditorp (spc_mds, display = "species", col="deepskyblue4", air = 0.01, cex = 1.0)
with (obs, ordisurf (spc_mds, SLPH, add = TRUE, col = "darkgoldenrod4" ))
# with (obs, ordisurf (spc_mds, AMMN, add = TRUE, col = "orange"))

# write data to disk (slooooooow)
pdf(path_mds_hip, height = 9, width = 9)
par (mfrow = c (1, 1))
ordiplot (spc_mds, display = "sites" )
# ordihull (spc_mds, shorten_groups (grp, obs), col = c ("coral3", "chartreuse4", "cornflowerblue"))
ordiellipse(spc_mds, shorten_groups (grp, obs) , display = "sites", kind = "se", 
  conf = 0.95, label = T,  col = c ("red", "chartreuse4", "cornflowerblue"))
orditorp (spc_mds, display = "species", col="deepskyblue4", air = 0.01, cex = 1.0)
with (obs, ordisurf (spc_mds, SLPH, add = TRUE, col = "darkgoldenrod4" ))
# with (obs, ordisurf (spc_mds, AMMN, add = TRUE, col = "orange"))
dev.off()

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' ## `adonis` analysis
#'
#' Class beta diversity, expressed as distance `z = (log(2)-log(2*a+b+c)+log(a+b+c))/log(2)`
#' may be function of group means of SLPH. 
ad <- adonis (formula =  betadiver ( spc, "z") ~  SLPH, data = obs, perm = 9999)
ad

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->


#' ## Canonical correspondence analysis
#'
#' ### Calculate the CCA
#' 
#' Here using also the Sulphur variables, as it was the only one that showed 
#' persistent significance across all tests. Show the result.
inv_slph <- cca(spc ~ SLPH , data = obs)
inv_slph

# output of the summary function is very long, I'll mute this 
# summary(inv_slph)

#' ### Testing CCA results
#' 
#' Permutation testing 
anova (inv_slph, perm = 9999)

#' Testing the (single) axis for significance
anova (inv_slph, by="axis", perm = 1000)

#' Type I test
anova (inv_slph, by="term", perm = 1000) # SLPH

#' Type III test
anova(inv_slph, by="margin", perm = 1000) # SLPH

#' Variance inflation factor
vif.cca(inv_slph)

#' Goodness of fit for classes
goodness(inv_slph)


#' <!-- #################################################################### -->


#' <!-- #################################################################### -->


#' ## Regression analyses
#' 
#' ### Across all samples
#'
#' Isolate ages for analysis, get_list() prunes undefined values across the  whole returned
#' table.
ages <- get_list (phsq_ob, tax_rank = "Class", pred_cat = c (raw_ages, geochems, 
  minerals), pres_abs = FALSE)

#' Extract observations from list into dataframe to work with them more easily.
ages <- ages[["obs"]]; 
 
#'  Get the row means across higher and lower estimates, only one column is desired
#' for further analysis.
ages$MNAGE <- -1 * rowMeans (ages[ ,c("HAGE", "LAGE")])

#' #### Plot all regressions
#' 
#' Make plotting easier by using `lapply()` to generate plots, for which one
#' needs a vector to loop over.
y_axis <- c("AMMN", "NITR", "POTA", "SLPH", "COND", "PHCC", "PHOS", "CARB", 
               "PHHO")

#' Create a list of graphical objects for plotting on one page.
regr_list <- lapply (y_axis, plot_regrs, ages)

#' Plots can now be shown, and saved to the output directory. Objects are then
#' discarded.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center',  fig.cap="Regressions between all terrrain ages and soil geochemical measurments. (Approx. code line 635)"
marrangeGrob (regr_list, nrow = 3, ncol = 3)

#' Saving plots.:
ggsave (file = path_regr, plot = marrangeGrob (regr_list, nrow = 3, ncol = 3), 
        dpi = 200, width = 10, height = 10, units = "in")

#' #### Testing all regressions
#'
#' Calculate all regressions, with variables from previous section, and store
#' as list. 
regressions <- lapply (y_axis, function(x) {
    lm ( substitute (MNAGE ~ i, list(i = as.name(x))), data = ages)
    })

#' Return summary from list for more specific results
lapply (regressions, summary )


#' ### Region specific
#'
#' Isolate ages for analysis, get_list() prunes undefined values across the  whole returned
#' table.
ages <- get_list (phsq_ob, tax_rank = "Class", pred_cat = c (raw_ages, geochems, 
  minerals), pres_abs = FALSE)

#' Extract observations from list into dataframe to work with them more easily.
ages <- cbind(ages[["obs"]], ages[["grp"]]) 
 
#'  Get the row means across higher and lower estimates, only one column is desired
#' for further analysis.
ages$MNAGE <- -1 * rowMeans (ages[ ,c("HAGE", "LAGE")])

#' Split dataframes by location 

agesMM = ages[ which (ages$grp == "MM"), ]
agesME = ages[ which (ages$grp == "ME"), ]
agesLT = ages[ which (ages$grp == "LT"), ]

#' #### Plot all regressions
#' 
#' Make plotting easier by using `lapply()` to generate plots, for which one
#' needs a vector to loop over.
y_axis <- c("AMMN", "NITR", "POTA", "SLPH", "COND", "PHCC", "PHOS", "CARB", 
               "PHHO")

#' Create a list of graphical objects for plotting on one page.
regr_listMM <- lapply (y_axis, plot_regrs, agesMM)
regr_listME <- lapply (y_axis, plot_regrs, agesME)
regr_listLT <- lapply (y_axis, plot_regrs, agesLT)


#' Plots can now be shown, and saved to the output directory. Objects are then
#' discarded.
#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center',  fig.cap="Mount Menzies: Regressions between terrrain age and soil geochemical measurements (Approx. code line 689)."
marrangeGrob (regr_listMM, nrow = 3, ncol = 3)

#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center',  fig.cap="Mawson Escarpment: Regressions between terrrain age and soil geochemical measurements (Approx. code line 692)."
marrangeGrob (regr_listME, nrow = 3, ncol = 3)

#+ message=FALSE, results='hide', warning=FALSE, fig.width=10, fig.height=10, dpi=200, fig.align='center',  fig.cap="Lake Terrasovoje: Regressions between terrrain age and soil geochemical measurements (Approx. code line 695)."
marrangeGrob (regr_listLT, nrow = 3, ncol = 3)

#' #### Testing all regressions
#'
#' Calculate all regressions, with variables from previous section, and store
#' as list. 
regressionsMM <- lapply (y_axis, function(x) {
    lm ( substitute (MNAGE ~ i, list(i = as.name(x))), data = agesMM)
    })

regressionsME <- lapply (y_axis, function(x) {
    lm ( substitute (MNAGE ~ i, list(i = as.name(x))), data = agesME)
    })

regressionsLT <- lapply (y_axis, function(x) {
    lm ( substitute (MNAGE ~ i, list(i = as.name(x))), data = agesLT)
    })

#' Return summary from list for more specific results
lapply (regressionsMM, summary )
lapply (regressionsME, summary )
lapply (regressionsLT, summary )

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Write data to disk 
#'
#' Saved are object created by this script as well as command history and work-space
#' image.
save.image (path_workspace_b) # work-space

#' # Session info
#' 
#' The code and output in this document were tested and generated in the 
#' following computing environment:
#+ echo=FALSE
sessionInfo()

#' # References