20_format_predictors.R

#' ---
#' title: "Preparation of predictor data "
#' author: "Paul Czechowski"
#' date: "October 22nd, 2016"
#' output:
#'   pdf_document:
#'     toc: true
#'     highlight: zenburn
#' bibliography: ./references.bib
#' ---
#'
#' # Preface
#' 
#' This code is tested using a raw R terminal. Path names are 
#' defined relative to the project directory. This code commentary is included 
#' in the R code itself and can be rendered at any stage using 
#' `rmarkdown::render ("./20_format_predictors.R")`. Please check the session info
#' at the end of the document for further notes on the coding environment.
#'
#' # Prerequisites
#' 
#' * `./10_import_data.R` was run, or the objects to be read in are available in
#' a folder tree `Zenodo` in the project parent directory.
#'
#' # Environment preparation
#'
#' ## Packages loading and cleaning of work-space. Functions may also load packages.
#+ message=FALSE, results='hide'

# Attach the two packages this script calls into.
library ("DataCombine")  # here used for string replacement in data frames (FindReplace())
library ("data.table")   # here used for variable renaming (setnames())
# NOTE(review): rm(list=ls()) deletes every visible object in the environment
#   the script is evaluated in, including objects of whoever sources it. It
#   does not detach packages, so the library() calls above stay in effect.
#   Presumably this is done so that the save.image() call at the end of the
#   script only stores objects created from here on - confirm before removing.
rm(list=ls())            # clear R environment
                         #   the working directory is not changed by this script;
                         #   all paths below are relative to it

#' ## Setting locations for data import and export
#' 
#' This script uses the objects generated by `10_import_data.R` that are located
#' in the `Zenodo` directory tree. It will also write to that location.

# Input: R object file holding the predictor data from "10_import_data.R".
path_predictors_in <- "./Zenodo/R_Objects/10_predictors.Rdata"

# Outputs: the formatted predictor data frame and the work-space image.
path_predictors_out <- "./Zenodo/R_Objects/20_predictors.Rdata"
path_workspace <- "./Zenodo/R_Objects/20_workspace.Rdata"

#' # Formatting the abiotic predictor data
#' 
#' ## Data import
#' 
#' Soil geochemical and X-Ray diffraction data is imported using base R functionality.

# Restore the objects stored by "10_import_data.R". load() returns the names
# of everything it restored, so the presence of the expected data frame is
# verified right away instead of surfacing later as an unrelated error.
loaded_objects <- load(path_predictors_in)
if (!("predictors" %in% loaded_objects) || !is.data.frame(predictors)) {
  stop("No data frame named 'predictors' found in ", path_predictors_in,
       call. = FALSE)
}

# garbage collection, keeps the helper out of the saved work-space image
rm(loaded_objects)

#' ## Minor Adjustments, removal of superfluous sampling locations 
#' 
#' There may be some duplicate entries in the data which need to be removed:

# keep only the first row for every sample identifier
predictors <- predictors[which(!duplicated(predictors$Sample)), ]

#' The `rownames()` are numerical so far, but should be something more meaningful.
#' Hence, the samples are re-labelled with data of one column of the original data
#' frame.

# label each row with its sample identifier
row.names(predictors) <- predictors$Sample

#' X-ray diffraction data from Duanne White contained minor rounding errors due to `Excel` 
#' particularities. The rounding errors are corrected by slightly re-scaling
#' those values here.

# Locate the X-ray diffraction columns. The pattern is anchored so that only
# names starting with "x_" are selected, not names merely containing "x_".
xrd_cols <- grep("^x_", colnames(predictors))

# check prior to adjusting: row totals before re-scaling
rowSums(predictors[xrd_cols])

# adjusting: divide every row by its own total so that each row sums to 1.
# The vector of row totals has one entry per row and is recycled down each
# column, which avoids coercing the data frame to a matrix and transposing.
predictors[xrd_cols] <- predictors[xrd_cols] / rowSums(predictors[xrd_cols])

# check after adjusting: every row total should now be 1
rowSums(predictors[xrd_cols])

# garbage collection, keeps the helper out of the saved work-space image
rm(xrd_cols)

#' The data also contains sample information from samples that are not included
#' in the current project. These samples will be removed from the data.

# Keep every row whose location is not "Reinbolt_Hills". `%in%` never returns
# NA, so rows with a missing location are retained unchanged; with `!=` the
# comparison would be NA and such rows would come back filled entirely with NA.
predictors <- predictors[!(predictors$Location %in% "Reinbolt_Hills"), ]

#' ## Renaming area identifiers and sample identifiers 
#' 
#' The long sampling location strings are an annoyance during plotting, and
#' are shortened here. 

#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors)  -->

# "FindReplace()" needs a data frame for replacing things 
# lookup table: long location string ("from") and its two-letter code ("to")
area_codes <- data.frame(
  from = c("Mount_Menzies", "Mawson_Escarpment", "Lake_Terrasovoje"),
  to = c("MM", "ME", "LT")
)

# Replace whole-string matches in the "Location" column and get the complete
# data frame back (exact = TRUE, vector = FALSE).
predictors <- FindReplace(
  predictors, "Location", area_codes,
  from = "from", to = "to", exact = TRUE, vector = FALSE
)

# the lookup table is no longer needed
rm(area_codes)

#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors)  -->

#' The `.PCM` at the end of the plate position is also not needed and will likely
#' be annoying much later. It is cut off here.

# Remove the last four characters of every sample identifier.
# NOTE(review): the pattern strips ANY four trailing characters, not only a
#   literal ".PCM" - confirm that every identifier carries that suffix.
predictors$Sample <- sub(".{4}$", "", predictors$Sample)

#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors)  -->

#' In fact, the sample names can be used as the row names, and then be deleted.
# use the shortened identifiers as row labels ...
row.names(predictors) <- predictors$Sample
# ... and drop the now redundant column
predictors[["Sample"]] <- NULL

#' ## Removal of unused variables  
#' 
#' Also several other variables not needed for this analysis are dropped.

#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors)  -->

# Keep every column except the ones listed; names that are absent from the
# data frame are simply ignored.
predictors <- predictors[!(names(predictors) %in% c(
  # meta-data for sample sorting and assignment
  "CSBP_id", "XRD.id",
  # spotty observation data, needs improved encoding
  "o_Mites", "o_Moss", "o_Salts", "o_Moisture", "c_Texture"
))]

#' ## Setting and naming of variables used for analysis  
#' 
#' Now, the variable names are changed to four letter abbreviations, to aid 
#' later analysis. Also the types will be set properly. This does not need to be
#' written as a function, because this will only be done once.

# Lookup of original column name (vector name) to four-letter abbreviation
# (vector value), one pair per line.
rename_map <- c(
  "Location"       = "AREA",
  "Genes"          = "GENE",
  "c_Ammonium"     = "AMMN",
  "c_Nitrate"      = "NITR",
  "c_Phosphorus"   = "PHOS",
  "c_Potassium"    = "POTA",
  "c_Sulphur"      = "SLPH",
  "c_Org_Carbon"   = "CARB",
  "c_Conductivity" = "COND",
  "c_pH_CaCl2"     = "PHCC",
  "c_pH_H2O"       = "PHHO",
  "x_Quartz"       = "QUTZ",
  "x_Feltspar"     = "FDSP",
  "x_Titanite"     = "TTAN",
  "x_Pyr_Amp_Gar"  = "PRAG",
  "x_Micas"        = "MICA",
  "x_Dolomite"     = "DOLO",
  "x_Kao_Chlor"    = "KAOC",
  "x_Calcite"      = "CALC",
  "x_Chlorite"     = "CHLR",
  "g_Latitude"     = "LATI",
  "g_Longitude"    = "LONG",
  "g_Elevation"    = "ELEV",
  "s_Slope"        = "SLPE",
  "s_Aspect"       = "ASPT",
  "t_Soil_Temp"    = "SPTT",
  "low_age"        = "LAGE",
  "high_age"       = "HAGE",
  "c_Gravel"       = "GRVL",
  "ATP"            = "MATP"
)

# "setnames()" renames by reference, so the data frame is not re-assigned
setnames(predictors, old = names(rename_map), new = unname(rename_map))

# the lookup is no longer needed
rm(rename_map)

#' <!-- just checking here -->
#' <!-- View(predictors) -->
#' <!-- dim(predictors)  -->
 
# correct variable types to make everything easier subsequently

# grouping variables
predictors$AREA <- as.factor(predictors$AREA)
predictors$GENE <- as.factor(predictors$GENE)

# all remaining analysis variables are measurements and must be numeric
numeric_vars <- c(
  "GRVL", "AMMN", "NITR", "PHOS", "POTA", "SLPH", "CARB", "COND", "PHCC",
  "PHHO", "QUTZ", "FDSP", "TTAN", "PRAG", "MICA", "DOLO", "KAOC", "CALC",
  "CHLR", "LATI", "LONG", "ELEV", "SLPE", "ASPT", "SPTT", "MATP", "LAGE",
  "HAGE"
)

# as.numeric() applied to a factor returns the internal level codes, not the
# values the labels spell out. A column that arrived as a factor is therefore
# converted via its labels; every other column is converted directly, which
# leaves already numeric values untouched.
predictors[numeric_vars] <- lapply(
  predictors[numeric_vars],
  function(column) {
    if (is.factor(column)) {
      as.numeric(as.character(column))
    } else {
      as.numeric(column)
    }
  }
)

# garbage collection, keeps the helper out of the saved work-space image
rm(numeric_vars)

#' ## Checking variables used for analysis  
#' 
#' The formatted data is shown here.

# Column types and leading values. Note that both overviews are printed
# before the marker availability correction that follows further below.
utils::str(predictors)

# per-column summary
summary(object = predictors)

#' Correcting a mistake in marker availability data (excluding one more
#' that is only relevant when analysing COI data as well):
# "GENE" is a factor at this point. Assigning a value that is not among its
# levels would store NA and only raise a warning, so the level is registered
# first if it is missing. Existing levels and their order are left untouched.
if (is.factor(predictors$GENE) && !("18Sonly" %in% levels(predictors$GENE))) {
  levels(predictors$GENE) <- c(levels(predictors$GENE), "18Sonly")
}

# `%in%` already yields a logical row selector, no which() needed
predictors[rownames(predictors) %in% c("2.10.E", "2.10.C"), "GENE"] <- "18Sonly"

#' # Write data to disk 
#'
#' Saved are the object created by this script and the work-space image.
#' The number in front of the file name denotes the source script.
# formatted predictor data frame on its own
save(list = "predictors", file = path_predictors_out)
# everything currently in the work-space
save.image(file = path_workspace)

#' # Session info
#' 
#' The code and output in this document were tested and generated in the 
#' following computing environment:
#+ echo=FALSE
# R version, platform and attached packages of this run
utils::sessionInfo()