diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100755
index 0000000..6cf1f1f
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,10 @@
+Package: mdatools
+Title: Multivariate Data Analysis
+Version: 0.2.0
+Date: 2013-04-29
+Author: Sergey Kucheryavskiy
+Maintainer: Sergey Kucheryavskiy
+Description: The package provides functions for preprocessing, exploration and analysis of multivariate data.
+Suggests:
+Depends:
+License: GPL-2
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..9c9f9ac
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1 @@
+exportPattern("^[^\\.]")
diff --git a/R/.DS_Store b/R/.DS_Store
new file mode 100755
index 0000000..5008ddf
Binary files /dev/null and b/R/.DS_Store differ
diff --git a/R/classres.R b/R/classres.R
new file mode 100644
index 0000000..e41eabe
--- /dev/null
+++ b/R/classres.R
@@ -0,0 +1,39 @@
+# methods for classification results #
+classres = function(res, ...) UseMethod("classres")
+
+classres.getPredictionPerformance = function(res)
+{
+   fn = NULL
+   fp = NULL
+   tp = NULL
+   tn = NULL
+   sensitivity = NULL
+   specificity = NULL
+   precision = NULL
+   f1 = NULL
+
+   if (!is.null(res$creference))
+   {
+      fn = sum((res$creference == 1) & (res$cpredictions == 0))
+      fp = sum((res$creference == 0) & (res$cpredictions == 1))
+      tp = sum((res$creference == 1) & (res$cpredictions == 1))
+      tn = sum((res$creference == 0) & (res$cpredictions == 0))
+
+      sensitivity = tp / (tp + fn)
+      specificity = tn / (tn + fp)
+      # tp / (tp + fp) is precision; F1 is the harmonic mean of precision and sensitivity
+      precision = tp / (tp + fp)
+      f1 = 2 * sensitivity * precision / (sensitivity + precision)
+   }
+
+   res$fn = fn
+   res$fp = fp
+   res$tp = tp
+   res$tn = tn
+   res$sensitivity = sensitivity
+   res$specificity = specificity
+   res$precision = precision
+   res$f1 = f1
+
+   return (res)
+}
\ No newline at end of file
diff --git a/R/crossval.R b/R/crossval.R
new file mode 100644
index 0000000..30c50b3
--- /dev/null
+++ b/R/crossval.R
@@ -0,0 +1,22 @@
+# function returns indices for cross-validation loop
+crossval = function(nobj, nseg = NULL)
+{
+   if (is.null(nseg))
+   {
+      if (nobj < 24) { nseg = nobj }
+      else if (nobj < 40) { nseg = 8 }
+      else { nseg = 4 }
+   }
+   else if (nseg == 1)
+   {
+      nseg = nobj
+   }
+
+   seglen = ceiling(nobj / nseg)
+   fulllen = seglen * nseg
+
+   idx = c(sample(1:nobj), rep(NA, fulllen - nobj))
+   idx = matrix(idx, nrow = nseg, byrow = T)
+
+   return (idx)
+}
\ No newline at end of file
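Editor's note: a small sketch of what crossval produces (actual values depend on the random shuffle; the call below is illustrative):

    idx = crossval(10, nseg = 3)      # 3 x 4 matrix: shuffled indices 1..10, padded with NA
    for (i in 1:nrow(idx))
       print(na.exclude(idx[i, ]))    # objects held out in segment i

The same pattern (na.exclude over the rows of idx) is how pca.crossval and pls.crossval consume this matrix further below.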
diff --git a/R/ldecomp.R b/R/ldecomp.R
new file mode 100755
index 0000000..9d8e161
--- /dev/null
+++ b/R/ldecomp.R
@@ -0,0 +1,318 @@
+# class and methods for linear decomposition X = TP' + E #
+ldecomp = function(scores, loadings, residuals, fullvar, ...) UseMethod("ldecomp")
+
+ldecomp.default = function(scores, loadings, residuals, fullvar, tnorm = NULL, ncomp.selected = NULL)
+{
+   # Creates an object of ldecomp class.
+   #
+   # Arguments:
+   #   scores: matrix with score values (nobj x ncomp).
+   #   loadings: matrix with loading values (nvar x ncomp).
+   #   residuals: matrix with data residuals
+   #   fullvar: full variance of original data, preprocessed and centered
+   #   tnorm: singular values for score normalization
+   #   ncomp.selected: number of selected components
+   #
+   # Returns:
+   # object (list) of class ldecomp with the following fields:
+   #   obj$scores: matrix with score values (nobj x ncomp).
+   #   obj$residuals: matrix with residuals (nobj x nvar).
+   #   obj$fullvar: full variance of original data
+   #   obj$Q2: matrix with Q2 residuals (nobj x ncomp).
+   #   obj$T2: matrix with T2 distances (nobj x ncomp)
+   #   obj$ncomp.selected: selected number of components
+   #   obj$expvar: explained variance for each component
+   #   obj$cumexpvar: cumulative explained variance
+
+   scores = as.matrix(scores)
+   loadings = as.matrix(loadings)
+   residuals = as.matrix(residuals)
+
+   # check dimensions
+   if (ncol(scores) != ncol(loadings) ||
+       nrow(scores) != nrow(residuals) || nrow(loadings) != ncol(residuals))
+      stop('Dimensions of scores, loadings and data do not correspond to each other!')
+
+   # set names for scores and loadings
+   rownames(scores) = rownames(residuals)
+   colnames(scores) = paste('Comp', 1:ncol(scores))
+   rownames(loadings) = colnames(residuals)
+   colnames(loadings) = paste('Comp', 1:ncol(loadings))
+
+   if (is.null(ncomp.selected))
+      ncomp.selected = ncol(scores)
+
+   # calculate residual distances and explained variance
+   obj = ldecomp.getDistances(scores, loadings, residuals, tnorm)
+   var = ldecomp.getVariances(obj$Q2, fullvar)
+
+   obj$expvar = var$expvar
+   obj$cumexpvar = var$cumexpvar
+   obj$scores = scores
+   obj$residuals = residuals
+   obj$fullvar = fullvar
+   obj$ncomp.selected = ncomp.selected
+   obj$call = match.call()
+
+   class(obj) = "ldecomp"
+
+   return (obj)
+}
+
+ldecomp.getDistances = function(scores, loadings, residuals, tnorm = NULL)
+{
+   # Computes residual distances (Q2 and T2) for a decomposition.
+   # The distances are calculated for every 1:n components, where n
+   # goes from 1 to ncomp (number of columns in scores and loadings)
+   #
+   # Arguments:
+   #   scores: matrix with scores (nobj x ncomp).
+   #   loadings: matrix with loadings (nvar x ncomp)
+   #   residuals: matrix with data residuals
+   #   tnorm: singular values used for score normalization
+   #
+   # Returns:
+   #   res$Q2: matrix with Q2 residuals (nobj x ncomp).
+   #   res$T2: matrix with T2 distances (nobj x ncomp)
+
+   ncomp = ncol(scores)
+   nobj = nrow(scores)
+   T2 = matrix(0, nrow = nobj, ncol = ncomp)
+   Q2 = matrix(0, nrow = nobj, ncol = ncomp)
+
+   # calculate normalized scores
+   if (is.null(tnorm))
+      tnorm = sqrt(colSums(scores ^ 2)/(nrow(scores) - 1));
+   scoresn = sweep(scores, 2L, tnorm, '/', check.margin = F);
+
+   # calculate distances for each set of components
+   for (i in 1:ncomp)
+   {
+      if (i < ncomp)
+         res = residuals + scores[, (i + 1):ncomp, drop = F] %*% t(loadings[, (i + 1):ncomp, drop = F])
+      else
+         res = residuals
+
+      Q2[, i] = rowSums(res^2)
+      T2[, i] = rowSums(scoresn[, 1:i, drop = F]^2)
+   }
+
+   # set dimnames and return results
+   colnames(Q2) = colnames(T2) = colnames(scores)
+   rownames(Q2) = rownames(T2) = rownames(scores)
+
+   res = list(
+      Q2 = Q2,
+      T2 = T2,
+      tnorm = tnorm
+   )
+
+   return (res)
+}
+
+ldecomp.getVariances = function(Q2, fullvar)
+{
+   # Computes explained variance and cumulative explained variance
+   # for every component of a decomposition.
+   #
+   # Arguments:
+   #   Q2: matrix with Q2 residuals (nobj x ncomp)
+   #   fullvar: full variance of the original data
+   #
+   # Returns:
+   #   res$expvar: vector with explained variance for every component
+   #   res$cumexpvar: vector with cumulative explained variance
+
+   cumresvar = colSums(Q2) / fullvar * 100
+   cumexpvar = 100 - cumresvar
+   expvar = c(cumexpvar[1], diff(cumexpvar))
+
+   res = list(
+      expvar = expvar,
+      cumexpvar = cumexpvar
+   )
+
+   return (res)
+}
+
+ldecomp.getResLimits = function(eigenvals, nobj, ncomp, alpha = 0.05)
+{
+   # Computes statistical limits for Q2 residuals and T2 distances.
+   #
+   # Arguments:
+   #   eigenvals: vector with eigenvalues for a model.
+ # nobj: number of objects in calibration data + # ncomp: number of selected components + # alpha: significance level for Q2 limits + # + # Returns: + # res$Q2lim: limit for Q2 residuals + # res$T2lim: limit for T2 distances + + # calculate T2 limit using Hotelling statistics + if (nobj == ncomp) + T2lim = 0 + else + T2lim = (ncomp * (nobj - 1) / (nobj - ncomp)) * qf(1 - alpha, ncomp, nobj - ncomp); + + # calculate Q2 limit using F statistics + conflim = 100 - alpha * 100; + + nvar = length(eigenvals) + + if (ncomp < nvar) + { + eigenvals = eigenvals[(ncomp + 1):nvar] + + cl = 2 * conflim - 100 + t1 = sum(eigenvals) + t2 = sum(eigenvals^2) + t3 = sum(eigenvals^3) + h0 = 1 - 2 * t1 * t3/3/(t2^2); + + if (h0 < 0.001) + h0 = 0.001 + + ca = sqrt(2) * erfinv(cl/100) + h1 = ca * sqrt(2 * t2 * h0^2)/t1 + h2 = t2 * h0 * (h0 - 1)/(t1^2) + Q2lim = t1 * (1 + h1 + h2)^(1/h0) + } + else + Q2lim = 0 + + res = list( + T2lim = T2lim, + Q2lim = Q2lim + ) +} + +plotCumVariance.ldecomp = function(obj, show.labels = F) +{ + # Shows cumulative explained variance plot. + # + # Arguments: + # obj: object of ldecomp class. + # show.labels: show or not labels for plot points + + data = cbind(1:length(obj$cumexpvar), obj$cumexpvar) + data = rbind(c(0, 0), data) + colnames(data) = c('Components', 'Explained variance, %') + rownames(data) = round(c(0, obj$cumexpvar), 1) + mdaplots.linescatter(data, main = 'Cumulative variance', show.labels = show.labels) +} + +plotVariance.ldecomp = function(obj, show.labels = F) +{ + # Shows explained variance plot. + # + # Arguments: + # obj: object of ldecomp class. + # show.labels: show or not labels for plot points + # + + data = cbind(1:length(obj$expvar), obj$expvar) + colnames(data) = c('Components', 'Explained variance, %') + rownames(data) = round(obj$expvar, 1) + mdaplots.linescatter(data, main = 'Variance', show.labels = show.labels) +} + +plotScores.ldecomp = function(obj, comp = c(1, 2), cgroup = NULL, + show.labels = F, show.colorbar = T, + show.axes = T) +{ + # Shows scores plot. + # + # Arguments: + # obj: object of ldecomp class. + # comp: which components to show on x and y axis. + # cgroup: variable for color grouping of plot points. + # show.labels: show or not labels for plot points. + # show.colorbar: show or not a colorbar legend if cgroup is provided. + # show.axes: show or not axes crossing (0, 0) point. + + if (length(comp) == 1) + { + # scores vs objects + data = cbind(1:nrow(obj$scores), obj$scores[, comp]) + colnames(data) = c('Objects', colnames(obj$scores)[comp]) + rownames(data) = rownames(obj$scores) + mdaplots.scatter(data, main = 'Scores', cgroup = cgroup, + show.labels = show.labels, + show.colorbar = show.colorbar + ) + } + else if (length(comp) == 2) + { + # scores vs scores + data = obj$scores[, c(comp[1], comp[2])] + + if (show.axes == T) + show.lines = c(0, 0) + else + show.lines = F + + mdaplots.scatter(data, main = 'Scores', cgroup = cgroup, + show.labels = show.labels, + show.colorbar = show.colorbar, + show.lines = show.lines) + } + else + { + stop('Wrong number of components!') + } +} + +plotResiduals.ldecomp = function(obj, ncomp = NULL, cgroup = NULL, + show.labels = F, show.colorbar = T) +{ + # Shows T2 vs Q2 residuals plot. + # + # Arguments: + # obj: object of ldecomp class. + # ncomp: number of components for the residuals + # cgroup: variable for color grouping of plot points. + # show.labels: show or not labels for plot points. + # show.colorbar: show or not a colorbar legend if cgroup is provided. 
+
+   if (is.null(ncomp))
+      ncomp = obj$ncomp.selected
+
+   data = cbind(obj$T2[, ncomp], obj$Q2[, ncomp])
+   colnames(data) = c('T2', 'Q2')
+   mdaplots.scatter(data, main = sprintf('Residuals (ncomp = %d)', ncomp),
+                    cgroup = cgroup, show.labels = show.labels,
+                    show.colorbar = show.colorbar)
+}
+
+print.ldecomp = function(obj, str = NULL)
+{
+   if (is.null(str))
+      str = 'Results of data decomposition (class ldecomp)'
+
+   cat('\n')
+   cat(str, '\n')
+   cat('\nMajor fields:\n')
+   cat('$scores - matrix with score values (nobj x ncomp)\n')
+   cat('$T2 - matrix with T2 distances (nobj x ncomp)\n')
+   cat('$Q2 - matrix with Q2 residuals (nobj x ncomp)\n')
+   cat('$ncomp.selected - selected number of components\n')
+   cat('$expvar - explained variance for each component\n')
+   cat('$cumexpvar - cumulative explained variance\n\n')
+}
+
+summary.ldecomp = function(obj, str = NULL)
+{
+   if (is.null(str))
+      str = 'Summary for data decomposition (class ldecomp)'
+
+   cat('\n')
+   cat(str, '\n')
+   cat(sprintf('\nSelected components: %d\n\n', obj$ncomp.selected))
+
+   data = cbind(round(obj$expvar, 2),
+                round(obj$cumexpvar, 2))
+
+   colnames(data) = c('Exp. var', 'Cum. exp. var')
+   show(data)
+}
+
+# inverse error function, computed via the normal quantile function
+erfinv = function (x) qnorm((1 + x)/2)/sqrt(2)
+
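Editor's note: the two limits computed by ldecomp.getResLimits above can also be used on their own; a sketch with made-up eigenvalues:

    ev = c(4.2, 2.1, 0.9, 0.4, 0.2, 0.1, 0.06, 0.04)               # hypothetical eigenvalue spectrum
    lim = ldecomp.getResLimits(ev, nobj = 50, ncomp = 3, alpha = 0.05)
    lim$T2lim    # Hotelling T2 limit, based on the F-distribution
    lim$Q2lim    # Q2 limit, a Jackson-Mudholkar type approximation using erfinv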
diff --git a/R/mdaplots.R b/R/mdaplots.R
new file mode 100755
index 0000000..1ce5b5b
--- /dev/null
+++ b/R/mdaplots.R
@@ -0,0 +1,354 @@
+# class and methods for plotting #
+mdaplots = function(scores, loadings, data, ...) UseMethod("mdaplots")
+
+mdaplots.getAxesLim = function(data, show.colorbar = F, multi.y = F, show.legend = F, show.lines = F)
+{
+   scale = 0.05 # 5%
+   if (multi.y == F)
+   {
+      if (is.list(data))
+      {
+         xmax = max(data[[1]][, 1])
+         xmin = min(data[[1]][, 1])
+         ymax = max(data[[1]][, 2])
+         ymin = min(data[[1]][, 2])
+
+         for (i in 1:length(data))
+         {
+            xmax = max(xmax, data[[i]][, 1])
+            xmin = min(xmin, data[[i]][, 1])
+            ymax = max(ymax, data[[i]][, 2])
+            ymin = min(ymin, data[[i]][, 2])
+         }
+      }
+      else
+      {
+         xmax = max(data[, seq(1, ncol(data), 2)])
+         xmin = min(data[, seq(1, ncol(data), 2)])
+         ymax = max(data[, seq(2, ncol(data), 2)])
+         ymin = min(data[, seq(2, ncol(data), 2)])
+      }
+   }
+   else
+   {
+      if (is.list(data))
+      {
+         # multiple y columns for a list of matrices are not supported yet
+      }
+      else
+      {
+         xmax = max(data[, 1])
+         xmin = min(data[, 1])
+         ymax = max(data[, 2:ncol(data)])
+         ymin = min(data[, 2:ncol(data)])
+      }
+   }
+
+   if (is.numeric(show.lines))
+   {
+      if (!is.na(show.lines[1]))
+      {
+         xmax = max(xmax, show.lines[1])
+         xmin = min(xmin, show.lines[1])
+      }
+
+      if (!is.na(show.lines[2]))
+      {
+         ymax = max(ymax, show.lines[2])
+         ymin = min(ymin, show.lines[2])
+      }
+   }
+
+   dx = (xmax - xmin) * scale
+   dy = (ymax - ymin) * scale
+
+   xlim = c(xmin - dx, xmax + dx)
+   ylim = c(ymin - dy, ymax + dy)
+
+   if (show.colorbar == T)
+      ylim[2] = ylim[2] + dy * 2
+
+   if (show.legend == T)
+   {
+      xlim[2] = xlim[2] + dx
+      ylim[2] = ylim[2] + dy
+   }
+
+   lim = list(
+      xlim = xlim,
+      ylim = ylim
+   )
+
+   return (lim)
+}
+
+mdaplots.showColorbar = function(col, cgroup)
+{
+   # TODO(svk): format data for colorbar legend
+
+   col = mdaplots.getColors(length(unique(col)))
+   ncol = length(col)
+   cgroup = levels(cut(as.vector(cgroup), ncol))
+   lvals = as.numeric( sub("\\((.+),.*", "\\1", cgroup) )
+   rvals = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", cgroup) )
+
+   lim = par('usr')
+   dx = lim[2] - lim[1]
+   dy = lim[4] - lim[3]
+
+   w = (dx * 0.8)/ncol
+   h = dy * 0.015
+
+   x = lim[1] + dx * 0.1
+   y = lim[4] - (h + 0.1 * h);
+
+   for (i in 1:length(col))
+   {
+      rect(x + w * (i - 1), y, x + w * i, y - h, col = col[i], border = NA)
+      text(x + w * (i - 1), y - h, format(lvals[i], digits = 3),
+           cex = 0.6, pos = 1, col = 'gray')
+   }
+   text(x + w * i, y - h, rvals[i], cex = 0.6, pos = 1, col = 'gray')
+}
+
+mdaplots.getColors = function(n = 1, cgroup = NULL)
+{
+   rgbcol = c(213,  62,  79,
+              244, 109,  67,
+              253, 174,  97,
+              254, 224, 139,
+              230, 245, 152,
+              171, 221, 164,
+              102, 194, 165,
+               50, 136, 189)
+   rgbcol = matrix(rgbcol, ncol = 3, byrow = T)
+   ngroups = nrow(rgbcol)
+   col8 = rgb(rgbcol[, 1], rgbcol[, 2], rgbcol[, 3], maxColorValue = 255)
+   col8 = col8[seq(length(col8), 1, -1)]
+
+   if (!is.null(cgroup))
+   {
+      cgroup = cut(as.vector(cgroup), ngroups, labels = 1:ngroups)
+      col = col8[cgroup]
+   }
+   else
+   {
+      if (length(n) == 1)
+      {
+         if (n == 1)
+            col = col8[1]
+         else if (n == 2)
+            col = col8[c(1, 8)]
+         else if (n == 3)
+            col = col8[c(1, 5, 8)]
+         else
+            col = col8
+      }
+   }
+
+   return (col)
+}
+
+mdaplots.showLegend = function(legend, col, pch = NULL, lty = NULL, pos = 'topright')
+{
+   legend(pos, legend, col = col, pch = pch, lty = lty,
+          cex = 0.8, inset = 0.01, bg = 'white')
+}
+
+mdaplots.showLabels = function(data)
+{
+   if (is.null(rownames(data)))
+      rownames(data) = 1:nrow(data)
+
+   text(data[, 1], data[, 2], rownames(data), cex = 0.6, pos = 3, col = 'gray')
+}
+
+mdaplots.showLines = function(point)
+{
+   if (!is.na(point[2]))
+      abline(h = point[2], lty = 2, col = 'darkgray')
+   if (!is.na(point[1]))
+      abline(v = point[1], lty = 2, col = 'darkgray')
+}
+
+mdaplots.scatterg = function(data, pch = 16, legend = NULL, xlab = NULL,
+                             ylab = NULL, show.labels = F, show.lines = F, ...)
+{
+   lim = mdaplots.getAxesLim(data, show.lines = show.lines)
+   col = mdaplots.getColors(length(data))
+
+   if (is.null(xlab))
+      xlab = colnames(data[[1]])[1]
+   if (is.null(ylab))
+      ylab = colnames(data[[1]])[2]
+
+   for (i in 1:length(data))
+   {
+      if (i == 1)
+      {
+         plot(data[[i]][, 1], data[[i]][, 2], type = 'p',
+              col = col[i], pch = pch,
+              xlim = lim$xlim,
+              ylim = lim$ylim,
+              xlab = xlab,
+              ylab = ylab,
+              ...)
+      }
+      else
+      {
+         points(data[[i]][, 1], data[[i]][, 2], type = 'p',
+                col = col[i], pch = pch)
+      }
+
+      if (show.labels == T)
+         mdaplots.showLabels(data[[i]])
+   }
+
+   # show legend once, after all series have been drawn
+   if (!is.null(legend) && length(legend) > 1)
+      mdaplots.showLegend(legend, col, pch = pch)
+
+   grid()
+
+   if (is.numeric(show.lines) && length(show.lines) == 2 )
+      mdaplots.showLines(show.lines)
+}
+
+mdaplots.scatter = function(data, pch = 16, cgroup = NULL, show.labels = F,
+                            show.colorbar = T, show.lines = F, ...)
+{
+   data = as.matrix(data)
+
+   if (!is.null(cgroup))
+   {
+      lim = mdaplots.getAxesLim(data, show.colorbar = show.colorbar, show.lines = show.lines)
+      col = mdaplots.getColors(cgroup = cgroup)
+   }
+   else
+   {
+      lim = mdaplots.getAxesLim(data, show.lines = show.lines)
+      col = mdaplots.getColors(1)
+   }
+
+   plot(data[, 1], data[, 2], type = 'p',
+        col = col, pch = pch,
+        xlab = colnames(data)[1],
+        ylab = colnames(data)[2],
+        xlim = lim$xlim,
+        ylim = lim$ylim,
+        ...)
+
+   if (show.labels == T)
+      mdaplots.showLabels(data)
+
+   grid()
+
+   if (is.numeric(show.lines) && length(show.lines) == 2 )
+      mdaplots.showLines(show.lines)
+
+   if (!is.null(cgroup) && show.colorbar == T)
+      mdaplots.showColorbar(col, cgroup)
+}
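+# Editor's sketch (not part of the original file): color grouping expects one
+# numeric value per row of 'data', e.g.
+#
+#    d = cbind(rnorm(20), rnorm(20))
+#    colnames(d) = c('x', 'y')
+#    mdaplots.scatter(d, cgroup = d[, 1], show.labels = T)   # colorbar drawn on top
+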
+mdaplots.line = function(data, type = 'l', show.legend = F, pch = 16,
+                         xlab = NULL, ylab = NULL, show.labels = F, ...)
+{
+   data = as.matrix(data)
+   ny = ncol(data) - 1
+   col = mdaplots.getColors(ny)
+   lim = mdaplots.getAxesLim(data, multi.y = T)
+
+   if (!(type == 'l' | type == 'b'))
+      type = 'l'
+
+   if (is.null(xlab))
+      xlab = colnames(data)[1]
+
+   if (is.null(ylab))
+      ylab = colnames(data)[2]
+
+   for (i in 1:ny)
+   {
+      if (i == 1)
+      {
+         if (nrow(data) <= 10)
+            plot(data[, 1], data[, i + 1], col = col[i], type = type,
+                 xlab = xlab,
+                 ylab = ylab,
+                 ylim = lim$ylim,
+                 xaxt = 'n',
+                 pch = pch,
+                 ...)
+         else
+            plot(data[, 1], data[, i + 1], col = col[i], type = type,
+                 xlab = xlab,
+                 ylab = ylab,
+                 ylim = lim$ylim,
+                 pch = pch,
+                 ...)
+      }
+      else
+      {
+         lines(data[, 1], data[, i + 1], col = col[i], type = type, pch = pch)
+      }
+   }
+
+   # threshold must match the one used for xaxt = 'n' above
+   if (nrow(data) <= 10)
+      axis(side = 1, at = seq(1, nrow(data)), rownames(data))
+
+   if (show.legend == T && ny > 1)
+      mdaplots.showLegend(colnames(data[, 2:ncol(data)]), col, lty = 1)
+   grid()
+}
+
+mdaplots.lineg = function(data, pch = 16, type = 'l', show.labels = F,
+                          legend = NULL, xlab = NULL, ylab = NULL, ...)
+{
+   lim = mdaplots.getAxesLim(data)
+   col = mdaplots.getColors(length(data))
+
+   if (is.null(xlab))
+      xlab = colnames(data[[1]])[1]
+   if (is.null(ylab))
+      ylab = colnames(data[[1]])[2]
+
+   for (i in 1:length(data))
+   {
+      if (i == 1)
+      {
+         plot(data[[i]][, 1], data[[i]][, 2], col = col[i], type = type,
+              xlab = xlab,
+              ylab = ylab,
+              ylim = lim$ylim,
+              pch = pch,
+              ...)
+      }
+      else
+      {
+         lines(data[[i]][, 1], data[[i]][, 2], col = col[i], type = type, pch = pch)
+      }
+
+      if (show.labels == T)
+         mdaplots.showLabels(data[[i]])
+   }
+
+   if (!is.null(legend) && length(legend) > 1)
+      mdaplots.showLegend(legend, col, pch = pch)
+   grid()
+}
+
+mdaplots.linescatter = function(data, pch = 16, show.labels = F, ...)
+{
+   data = as.matrix(data)
+   col = mdaplots.getColors(1)
+   lim = mdaplots.getAxesLim(data)
+
+   plot(data[, 1], data[, 2], type = 'b',
+        col = col, pch = pch,
+        xlab = colnames(data)[1],
+        ylab = colnames(data)[2],
+        xlim = lim$xlim,
+        ylim = lim$ylim,
+        ...)
+   if (show.labels == T)
+      mdaplots.showLabels(data)
+   grid()
+}
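Editor's note: a minimal call of the line-scatter helper defined above, with made-up variance values:

    d = cbind(1:5, c(45.1, 71.3, 84.0, 90.2, 93.5))
    colnames(d) = c('Components', 'Explained variance, %')
    mdaplots.linescatter(d, show.labels = T)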
diff --git a/R/pca.R b/R/pca.R
new file mode 100755
index 0000000..6d748c8
--- /dev/null
+++ b/R/pca.R
@@ -0,0 +1,571 @@
+# class and methods for Principal component analysis #
+pca = function(data, ...) UseMethod("pca")
+
+pca.default = function(data, ncomp = 20, center = T, scale = F, cv = NULL,
+                       test.data = NULL, alpha = 0.05, info = '', ...)
+{
+   data = as.matrix(data)
+
+   # check if data has missing values
+   if (sum(is.na(data)) > 0)
+   {
+      warning('Data has missing values, will try to fix using pca.mvreplace.')
+      data = pca.mvreplace(data, center = center, scale = scale)
+   }
+
+   # calibrate model and select number of components
+   ncomp = min(ncomp, ncol(data), nrow(data) - 1)
+   res = pca.cal(data, ncomp, center = center, scale = scale)
+   model = list(
+      loadings = res$loadings,
+      eigenvals = res$eigenvals,
+      tnorm = res$tnorm,
+      center = res$center,
+      scale = res$scale,
+      ncomp = ncol(res$loadings)
+   )
+
+   model$ncomp.selected = model$ncomp
+   model$info = info
+   model$alpha = alpha
+
+   # apply model to calibration set
+   model$calres = predict.pca(model, data)
+
+   # do cross-validation if needed
+   if (!is.null(cv))
+      model$cvres = pca.crossval(model, data, cv)
+
+   # apply model to test set if any
+   if (!is.null(test.data))
+      model$testres = predict.pca(model, test.data)
+
+   # calculate and assign limit values for T2 and Q2 residuals
+   lim = ldecomp.getResLimits(model$eigenvals, nrow(data), model$ncomp.selected, model$alpha)
+   model$T2lim = lim$T2lim
+   model$Q2lim = lim$Q2lim
+
+   model$call = match.call()
+   class(model) = "pca"
+
+   return (model)
+}
+
+selectCompNum.pca = function(model, ncomp)
+{
+   if (is.null(model))
+      stop('Object with model is not specified!')
+
+   if (ncomp < 1 || ncomp > model$ncomp)
+      stop('Wrong number of selected components!')
+
+   model$ncomp.selected = ncomp
+   lim = ldecomp.getResLimits(model$eigenvals, nrow(model$calres$scores),
+                              model$ncomp.selected, model$alpha)
+   model$T2lim = lim$T2lim
+   model$Q2lim = lim$Q2lim
+
+   model$calres$ncomp.selected = ncomp
+
+   if (!is.null(model$testres))
+      model$testres$ncomp.selected = ncomp
+
+   if (!is.null(model$cvres))
+      model$cvres$ncomp.selected = ncomp
+
+   return (model)
+}
+
+pca.mvreplace = function(data, center = T, scale = F, maxncomp = 7,
+                         expvarlim = 0.95, covlim = 10^-6, maxiter = 100)
+{
+   # initial estimates with mean values
+   cdata = data
+   mvidx = is.na(cdata)
+
+   for (i in 1:ncol(data))
+   {
+      mv = is.na(data[, i])
+
+      if (sum(mv)/length(data[, i]) > 0.2)
+         stop(sprintf('Too many missing values in column #%d', i))
+
+      cdata[mv, i] = mean(data[, i], na.rm = T)
+   }
+
+   # autoscale
+   cdata = scale(cdata, center = center, scale = scale)
+
+   if (scale == T)
+      gsd = attr(cdata, 'scaled:scale')
+
+   if (center == T)
+      gmean = attr(cdata, 'scaled:center');
+
+   data = cdata
+
+   # iterations
+   n = 1
+   scoresp = 0
+   scores = 1
+   cond = 1
+   while (cond > covlim && n < maxiter)
+   {
+      n = n + 1
+
+      cdata = scale(cdata, center = T, scale = F)
+      lmean = attr(cdata, 'scaled:center')
+
+      res = pca.svd(cdata, maxncomp)
+
+      expvar = cumsum(res$eigenvals/sum(res$eigenvals))
+      ncomp = min(which(expvar >= expvarlim), maxncomp)
+
+      if (ncomp == 0)
+         ncomp = 1
+      if (ncomp == length(expvar))
+         ncomp = ncomp - 1
+
+      scoresp = scores
+      loadings = res$loadings[, 1:ncomp]
+      scores = cdata %*% loadings
+      newdata = scores %*% t(loadings)
+
+      newdata = sweep(newdata, 2L, lmean, '+', check.margin = F)
+
+      cdata = data
+      cdata[mvidx] = newdata[mvidx]
+
+      if (n > 2)
+      {
+         # calculate difference between scores
+         ncompcond = min(ncol(scores), ncol(scoresp))
+         cond = sum((scores[, 1:ncompcond] - scoresp[, 1:ncompcond])^2)
+      }
+   }
+
+   # rescale the data back and return
+   if (scale == T)
+      cdata = sweep(cdata, 2L, gsd, '*', check.margin = F)
+
+   if (center == T)
+      cdata = sweep(cdata, 2L, gmean, '+', check.margin = F)
+
+   return (cdata)
+}
+
+pca.cal = function(data, ncomp, center = T, scale
= F) +{ + data = prep.autoscale(data, center = center, scale = scale) + res = pca.svd(data, ncomp) + + res$tnorm = sqrt(colSums(res$scores ^ 2)/(nrow(res$scores) - 1)); + + rownames(res$loadings) = colnames(data) + colnames(res$loadings) = paste('Comp', 1:ncol(res$loadings)) + res$center = attr(data, 'prep:center') + res$scale = attr(data, 'prep:scale') + + return (res) +} + +pca.svd = function(data, ncomp) +{ + nobj = nrow(data) + nvar = ncol(data) + ncomp = min(ncomp, nobj - 1, nvar) + + s = svd(data) + loadings = s$v[, 1:ncomp] + + res = list( + loadings = loadings, + scores = data %*% loadings, + eigenvals = (s$d^2)/(nrow(data) - 1) + ) +} + +pca.nipals = function(data, ncomp) +{ + nobj = nrow(data) + nvar = ncol(data) + ncomp = min(ncomp, nobj - 1, nvar) + + scores = matrix(0, nrow = nobj, ncol = ncomp) + loadings = matrix(0, nrow = nvar, ncol = ncomp) + eigenvals = rep(0, ncomp); + + E = data + for (i in 1:ncomp) + { + ind = which.max(apply(E, 2, sd)) + t = E[, ind, drop = F] + tau = 99999 + th = 9999 + + while (th > 0.000001) + { + p = (t(E) %*% t) / as.vector((t(t) %*% t)) + p = p / as.vector(t(p) %*% p) ^ 0.5 + t = (E %*% p)/as.vector(t(p) %*% p) + th = abs(tau - as.vector(t(t) %*% t)) + tau = as.vector(t(t) %*% t) + } + + E = E - t %*% t(p) + scores[, i] = t + loadings[, i] = p + eigenvals[i] = tau / (nobj - 1) + } + + res = list( + loadings = loadings, + scores = scores, + eigenvals = eigenvals + ) +} + +pca.crossval = function(model, data, cv) +{ + scale = model$scale + center = model$center + ncomp = model$ncomp + + nobj = nrow(data) + nvar = ncol(data) + + # get matrix with indices for cv segments + idx = crossval(nobj, cv) + + seglen = ncol(idx); + + Q2 = matrix(0, ncol = ncomp, nrow = nobj) + T2 = matrix(0, ncol = ncomp, nrow = nobj) + + # loop over segments + for (i in 1:nrow(idx)) + { + ind = na.exclude(idx[i,]) + + if (length(ind) > 0) + { + datac = data[-ind, , drop = F] + datat = data[ind, , drop = F] + + m = pca.cal(datac, ncomp, model$center, model$scale) + res = predict.pca(m, datat, stripped = T) + Q2[ind, ] = res$Q2 + T2[ind, ] = res$T2 + } + } + + rownames(Q2) = rownames(T2) = rownames(data) + colnames(Q2) = colnames(T2) = colnames(model$scores) + + res = pcacvres(T2, Q2, model$calres$fullvar) +} + +predict.pca = function(model, data, stripped = F) +{ + data = prep.autoscale(data, model$center, model$scale) + scores = data %*% model$loadings + residuals = data - scores %*% t(model$loadings) + + if (stripped == F) + { + fullvar = sum(data^2) + res = pcares(scores, model$loadings, residuals, fullvar, model$tnorm, model$ncomp.selected) + } + else + { + res = ldecomp.getDistances(scores, model$loadings, residuals, model$tnorm) + } + + return (res) +} + + +plotCumVariance.pca = function(obj, show.labels = F, show.legend = T) +{ + legend = NULL + + cdata = cbind(0:length(obj$calres$cumexpvar), c(0, obj$calres$cumexpvar)) + colnames(cdata) = c('Components', 'Explained variance, %') + rownames(cdata) = c(0, round(obj$calres$cumexpvar, 1)) + data = list(cdata = cdata) + + if (show.legend == T) + legend = 'cal' + + if (!is.null(obj$cvres)) + { + cvdata = cbind(0:length(obj$cvres$cumexpvar), c(0, obj$cvres$cumexpvar)) + colnames(cvdata) = c('Components', 'Explained variance, %') + rownames(cvdata) = c(0, round(obj$cvres$cumexpvar, 1)) + + data$cvdata = cvdata + if (show.legend == T) + legend = c(legend, 'cv') + } + + if (!is.null(obj$testres)) + { + tdata = cbind(0:length(obj$testres$cumexpvar), c(0, obj$testres$cumexpvar)) + colnames(tdata) = c('Components', 'Explained 
variance, %') + rownames(tdata) = c(0, round(obj$testres$cumexpvar, 1)) + + data$tdata = tdata + if (show.legend == T) + legend = c(legend, 'test') + } + + mdaplots.lineg(data, main = 'Cumulative variance', ylab = 'Explained variance, %', + show.labels = show.labels, legend = legend, pch = 16, type = 'b') +} + +plotVariance.pca = function(obj, show.labels = F, show.legend = T) +{ + legend = NULL + + cdata = cbind(1:length(obj$calres$expvar), obj$calres$expvar) + colnames(cdata) = c('Components', 'Explained variance, %') + rownames(cdata) = round(obj$calres$expvar, 1) + data = list(cdata = cdata) + + if (show.legend == T) + legend = 'cal' + + if (!is.null(obj$cvres)) + { + cvdata = cbind(1:length(obj$cvres$expvar), obj$cvres$expvar) + colnames(cvdata) = c('Components', 'Explained variance, %') + rownames(cvdata) = round(obj$cvres$expvar, 1) + data$cvdata = cvdata + + if (show.legend == T) + legend = c(legend, 'cv') + } + + if (!is.null(obj$testres)) + { + tdata = cbind(1:length(obj$testres$expvar), obj$testres$expvar) + colnames(tdata) = c('Components', 'Explained variance, %') + rownames(tdata) = round(obj$testres$expvar, 1) + data$tdata = tdata + + if (show.legend == T) + legend = c(legend, 'test') + } + + mdaplots.lineg(data, main = 'Variance', ylab = 'Explained variance, %', + show.labels = show.labels, legend = legend, pch = 16, type = 'b') +} + +plotScores.pca = function(obj, comp = c(1, 2), + show.labels = F, show.legend = T, + show.axes = T) +{ + legend = NULL; + if (length(comp) == 1) + { + nobj.cal = nrow(obj$calres$scores) + + # scores vs objects + cdata = cbind(1:nobj.cal, obj$calres$scores[, comp]) + colnames(cdata) = c('Objects', colnames(obj$calres$scores)[comp]) + rownames(cdata) = rownames(obj$calres$scores) + + data = list(cdata = cdata) + + if (!is.null(obj$testres)) + { + nobj.test = nrow(obj$testres$scores) + tdata = cbind((nobj.cal + 1):(nobj.cal + nobj.test), obj$testres$scores[, comp]) + colnames(tdata) = c('Objects', colnames(obj$testres$scores)[comp]) + rownames(tdata) = rownames(obj$testres$scores) + data$tdata = tdata + if (show.legend == T) + legend = c('cal', 'test') + } + + mdaplots.scatterg(data, main = 'Scores', + show.labels = show.labels, + legend = legend + ) + } + else if (length(comp) == 2) + { + # scores vs scores + cdata = cbind(obj$calres$scores[, comp[1]], obj$calres$scores[, comp[2]]) + colnames(cdata) = colnames(obj$calres$scores)[comp] + rownames(cdata) = rownames(obj$calres$scores) + + data = list(cdata = cdata) + + if (!is.null(obj$testres)) + { + tdata = cbind(obj$testres$scores[, comp[1]], obj$testres$scores[, comp[2]]) + colnames(tdata) = colnames(obj$testres$scores)[comp] + rownames(tdata) = rownames(obj$testres$scores) + data$tdata = tdata + if (show.legend == T) + legend = c('cal', 'test') + } + + if (show.axes == T) + show.lines = c(0, 0) + else + show.lines = F + + mdaplots.scatterg(data, main = 'Scores', + show.labels = show.labels, + legend = legend, + show.lines = show.lines) + + } + else + { + stop('Wrong number of components!') + } +} + +plotResiduals.pca = function(obj, ncomp = NULL, show.labels = F, show.legend = T, show.limits = T) +{ + if (show.limits == T && (is.null(ncomp) || ncomp == obj$ncomp.selected)) + show.lines = c(obj$T2lim, obj$Q2lim) + else + show.lines = F + + if (is.null(ncomp)) + ncomp = obj$ncomp.selected + + cdata = cbind(obj$calres$T2[, ncomp], obj$calres$Q2[, ncomp]) + colnames(cdata) = c('T2', 'Q2') + rownames(cdata) = rownames(obj$calres$scores) + + data = list(cdata = cdata) + legend = NULL + + if 
(show.legend == T)
+      legend = 'cal'
+
+   if (!is.null(obj$cvres))
+   {
+      cvdata = cbind(obj$cvres$T2[, ncomp], obj$cvres$Q2[, ncomp])
+      colnames(cvdata) = c('T2', 'Q2')
+      rownames(cvdata) = rownames(obj$cvres$T2)
+
+      data$cvdata = cvdata
+      if (show.legend == T)
+         legend = c(legend, 'cv')
+   }
+
+   if (!is.null(obj$testres))
+   {
+      tdata = cbind(obj$testres$T2[, ncomp], obj$testres$Q2[, ncomp])
+      colnames(tdata) = c('T2', 'Q2')
+      rownames(tdata) = rownames(obj$testres$scores)
+
+      data$tdata = tdata
+      if (show.legend == T)
+         legend = c(legend, 'test')
+   }
+
+   mdaplots.scatterg(data, main = sprintf('Residuals (ncomp = %d)', ncomp),
+                     show.labels = show.labels,
+                     legend = legend,
+                     show.lines = show.lines)
+}
+
+plotLoadings.pca = function(obj, comp = c(1, 2), show.labels = T,
+                            show.legend = T, type = 'p')
+{
+   ncomp = length(comp)
+
+   if (ncomp == 2 && type == 'p')
+   {
+      # scatter plot
+      data = obj$loadings[, c(comp[1], comp[2])]
+      mdaplots.scatter(data, show.labels = show.labels, main = 'Loadings', show.lines = c(0, 0))
+   }
+   else if (ncomp < 1 || ncomp > 8)
+   {
+      stop('Number of components must be between 1 and 8!')
+   }
+   else
+   {
+      if (type == 'p')
+         type = 'l'
+
+      # line plot
+      data = cbind(1:nrow(obj$loadings), obj$loadings[, comp, drop = F])
+      mdaplots.line(data, show.legend = show.legend, type = type, main = 'Loadings',
+                    ylab = 'Loadings', xlab = 'Variables')
+   }
+}
+
+plot.pca = function(obj, comp = c(1, 2), show.labels = F, show.legend = T)
+{
+   par(mfrow = c(2, 2))
+   plotScores(obj, comp = comp, show.labels = show.labels,
+              show.legend = show.legend)
+   plotLoadings(obj, comp = comp, show.labels = show.labels,
+                show.legend = show.legend)
+   plotResiduals(obj, ncomp = obj$ncomp.selected,
+                 show.labels = show.labels, show.legend = show.legend, show.limits = T)
+   plotCumVariance(obj, show.legend = show.legend)
+   par(mfrow = c(1, 1))
+}
+
+print.pca = function(model, ...)
+{
+   cat('\nPCA model (class pca)\n')
+
+   if (length(model$info) > 0)
+   {
+      cat('\nInfo:\n')
+      cat(model$info)
+   }
+
+   cat('\nCall:\n')
+   print(model$call)
+
+   cat('\nMajor fields and methods:\n')
+   cat('$loadings - matrix with loadings\n')
+   cat('$eigenvals - eigenvalues for components\n')
+   cat('$ncomp - number of calculated components\n')
+   cat('$ncomp.selected - selected number of components\n')
+   cat('$center - values for centering data\n')
+   cat('$scale - values for scaling data\n')
+   cat('$cv - number of segments for cross-validation\n')
+   cat('$alpha - significance level for Q2 residuals\n\n')
+   cat('$calres - results (scores, etc) for calibration set\n')
+
+   if (!is.null(model$cvres))
+   {
+      cat('$cvres - results for cross-validation\n')
+   }
+   if (!is.null(model$testres))
+   {
+      cat('$testres - results for test set\n')
+   }
+   cat('\nTry also: show(model$calres), summary(model) and plot(model)\n')
+}
+
+summary.pca = function(model)
+{
+   ncomp = model$ncomp.selected
+   cat('\nPCA model (class pca) summary\n')
+
+   if (length(model$info) > 0)
+      cat(sprintf('\nInfo:\n%s\n\n', model$info))
+
+   data = cbind(round(model$eigenvals[1:model$ncomp], 3),
+                round(model$calres$expvar, 2),
+                round(model$calres$cumexpvar, 2))
+
+   colnames(data) = c('Eigvals', 'Exp. var', 'Cum. exp. var')
+   show(data)
+}
+
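Editor's note: an end-to-end sketch of the PCA workflow defined above (random data; the option values are illustrative only):

    X = matrix(rnorm(50 * 8), ncol = 8)             # 50 objects, 8 variables
    m = pca(X, ncomp = 5, center = T, scale = T, cv = 4)
    m = selectCompNum(m, 3)                         # T2/Q2 limits are recalculated
    summary(m)
    plot(m)                                         # scores, loadings, residuals, variance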
diff --git a/R/pcacvres.R b/R/pcacvres.R
new file mode 100755
index 0000000..5cbace3
--- /dev/null
+++ b/R/pcacvres.R
@@ -0,0 +1,72 @@
+# class and methods for PCA cross-validated results #
+pcacvres = function(T2, Q2, fullvar, ...) UseMethod("pcacvres")
+
+pcacvres.default = function(T2, Q2, fullvar, ncomp.selected = NULL, ...)
+{
+   # Creates an object of pcacvres class. The returned object also inherits class
+   # ldecomp and some of its methods.
+   #
+   # Arguments:
+   #   T2: matrix with T2 distances (nobj x ncomp) for cross-validation.
+   #   Q2: matrix with Q2 residuals (nobj x ncomp) for cross-validation.
+   #   fullvar: full variance of data
+   #   ncomp.selected: number of selected components
+   #
+   # Returns:
+   # list with cross-validation results (object of class pcacvres)
+   #   obj$Q2: matrix with Q2 residuals (nobj x ncomp).
+   #   obj$T2: matrix with T2 distances (nobj x ncomp)
+   #   obj$ncomp.selected: selected number of components
+   #   obj$expvar: explained variance for each component
+   #   obj$cumexpvar: cumulative explained variance
+
+   if (is.null(ncomp.selected))
+      ncomp.selected = ncol(T2)
+
+   obj = list(
+      T2 = T2,
+      Q2 = Q2
+   )
+
+   var = ldecomp.getVariances(obj$Q2, fullvar)
+
+   obj$expvar = var$expvar
+   obj$cumexpvar = var$cumexpvar
+   obj$ncomp.selected = ncomp.selected
+
+   obj$call = match.call()
+   class(obj) = c('pcacvres', 'ldecomp')
+
+   return (obj)
+}
+
+plotScores.pcacvres = function(obj, ...)
+{
+   # stub function to show that scores plot is not available for cv results
+
+   stop('Scores plot is not available for cross-validated results.')
+}
+
+print.pcacvres = function(obj)
+{
+   cat('\nCross-validation results for PCA model (class pcacvres) \n')
+
+   cat('\nMajor fields:\n')
+   cat('$T2 - matrix with T2 distances (nobj x ncomp)\n')
+   cat('$Q2 - matrix with Q2 residuals (nobj x ncomp)\n')
+   cat('$ncomp.selected - selected number of components\n')
+   cat('$expvar - explained variance for each component\n')
+   cat('$cumexpvar - cumulative explained variance\n\n')
+}
+
+summary.pcacvres = function(obj)
+{
+   cat('\nSummary for cross-validation of PCA model\n')
+   cat(sprintf('Selected components: %d\n', obj$ncomp.selected))
+
+   data = cbind(round(obj$expvar, 2),
+                round(obj$cumexpvar, 2))
+
+   colnames(data) = c('Exp. var', 'Cum. exp. var')
+   show(data)
+}
\ No newline at end of file
diff --git a/R/pcares.R b/R/pcares.R
new file mode 100755
index 0000000..42424a1
--- /dev/null
+++ b/R/pcares.R
@@ -0,0 +1,24 @@
+# class and methods for PCA results #
+pcares = function(scores, loadings, residuals, fullvar, ...) UseMethod("pcares")
+
+pcares.default = function(scores, loadings, residuals, fullvar, tnorm = NULL, ncomp.selected = NULL, ...)
+{
+   # Creates an object of pcares class. In fact the class is a wrapper for ldecomp and
+   # uses its methods and attributes.
+
+   pcares = ldecomp(scores, loadings, residuals, fullvar, tnorm = tnorm, ncomp.selected = ncomp.selected, ...)
+   class(pcares) = c('pcares', 'ldecomp')
+
+   return (pcares)
+}
+
+
+print.pcares = function(obj)
+{
+   print.ldecomp(obj, 'Results for PCA decomposition (class pcares)')
+}
+
+summary.pcares = function(obj)
+{
+   summary.ldecomp(obj, 'Summary for PCA results')
+}
\ No newline at end of file
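Editor's note: the single-function files that follow only register S3 generics, so that for example plotScores(m) dispatches to plotScores.pca for a model and to the plotScores.pcacvres stub for cross-validated results:

    m = pca(matrix(rnorm(200), 20, 10), cv = 4)    # made-up data
    plotScores(m)                                  # -> plotScores.pca
    try(plotScores(m$cvres))                       # -> informative error from the stub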
diff --git a/R/plotCumVariance.R b/R/plotCumVariance.R
new file mode 100755
index 0000000..e6896c2
--- /dev/null
+++ b/R/plotCumVariance.R
@@ -0,0 +1,4 @@
+plotCumVariance = function(object, ...)
+{
+   UseMethod("plotCumVariance")
+}
\ No newline at end of file
diff --git a/R/plotLoadings.R b/R/plotLoadings.R
new file mode 100755
index 0000000..a8396bc
--- /dev/null
+++ b/R/plotLoadings.R
@@ -0,0 +1,4 @@
+plotLoadings = function(object, ...)
+{
+   UseMethod("plotLoadings")
+}
\ No newline at end of file
diff --git a/R/plotPredictions.R b/R/plotPredictions.R
new file mode 100755
index 0000000..11c58cb
--- /dev/null
+++ b/R/plotPredictions.R
@@ -0,0 +1,4 @@
+plotPredictions = function(object, ...)
+{
+   UseMethod("plotPredictions")
+}
\ No newline at end of file
diff --git a/R/plotRMSE.R b/R/plotRMSE.R
new file mode 100755
index 0000000..ddee12b
--- /dev/null
+++ b/R/plotRMSE.R
@@ -0,0 +1,4 @@
+plotRMSE = function(object, ...)
+{
+   UseMethod("plotRMSE")
+}
\ No newline at end of file
diff --git a/R/plotRegcoeffs.R b/R/plotRegcoeffs.R
new file mode 100755
index 0000000..1a14214
--- /dev/null
+++ b/R/plotRegcoeffs.R
@@ -0,0 +1,4 @@
+plotRegcoeffs = function(object, ...)
+{
+   UseMethod("plotRegcoeffs")
+}
\ No newline at end of file
diff --git a/R/plotResiduals.R b/R/plotResiduals.R
new file mode 100755
index 0000000..b18062e
--- /dev/null
+++ b/R/plotResiduals.R
@@ -0,0 +1,4 @@
+plotResiduals = function(object, ...)
+{
+   UseMethod("plotResiduals")
+}
\ No newline at end of file
diff --git a/R/plotScores.R b/R/plotScores.R
new file mode 100755
index 0000000..243a387
--- /dev/null
+++ b/R/plotScores.R
@@ -0,0 +1,4 @@
+plotScores = function(object, ...)
+{
+   UseMethod("plotScores")
+}
\ No newline at end of file
diff --git a/R/plotVariance.R b/R/plotVariance.R
new file mode 100755
index 0000000..1bc018d
--- /dev/null
+++ b/R/plotVariance.R
@@ -0,0 +1,4 @@
+plotVariance = function(object, ...)
+{
+   UseMethod("plotVariance")
+}
\ No newline at end of file
diff --git a/R/plotXResiduals.R b/R/plotXResiduals.R
new file mode 100755
index 0000000..7209744
--- /dev/null
+++ b/R/plotXResiduals.R
@@ -0,0 +1,4 @@
+plotXResiduals = function(object, ...)
+{
+   UseMethod("plotXResiduals")
+}
\ No newline at end of file
diff --git a/R/plotXYLoadings.R b/R/plotXYLoadings.R
new file mode 100755
index 0000000..1bc018d
--- /dev/null
+++ b/R/plotXYLoadings.R
@@ -0,0 +1,4 @@
+plotXYLoadings = function(object, ...)
+{
+   UseMethod("plotXYLoadings")
+}
\ No newline at end of file
diff --git a/R/plotXYScores.R b/R/plotXYScores.R
new file mode 100755
index 0000000..1472b26
--- /dev/null
+++ b/R/plotXYScores.R
@@ -0,0 +1,4 @@
+plotXYScores = function(object, ...)
+{
+   UseMethod("plotXYScores")
+}
\ No newline at end of file
diff --git a/R/plotYResiduals.R b/R/plotYResiduals.R
new file mode 100755
index 0000000..f56ce91
--- /dev/null
+++ b/R/plotYResiduals.R
@@ -0,0 +1,4 @@
+plotYResiduals = function(object, ...)
+{
+   UseMethod("plotYResiduals")
+}
\ No newline at end of file
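Editor's note: a compact sketch of the PLS interface implemented below (simulated data; option values are illustrative):

    X = matrix(rnorm(30 * 6), ncol = 6)
    y = X %*% runif(6) + rnorm(30, sd = 0.1)
    m = pls(X, y, ncomp = 4, cv = 4, autoscale = 2)   # autoscale = 2: center and standardize
    summary(m)
    plotRMSE(m)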
diff --git a/R/pls.R b/R/pls.R
new file mode 100644
index 0000000..6d237de
--- /dev/null
+++ b/R/pls.R
@@ -0,0 +1,513 @@
+# class and methods for Partial Least Squares regression #
+pls = function(X, y, ...) UseMethod("pls")
+
+pls.default = function(X, y, ncomp = 12, cv = 0, autoscale = 1, Xt = NULL, yt = NULL, ...)
+{
+   X = as.matrix(X)
+   y = as.matrix(y)
+
+   ncomp = min(ncol(X), nrow(X) - 1, ncomp)
+
+   # build a model and apply to calibration set
+   model = pls.cal(X, y, ncomp, autoscale)
+   model$calres = predict.pls(model, X, y)
+
+   # do cross-validation if needed
+   if (cv > 0)
+      model$cvres = pls.crossval(model, X, y, cv)
+
+   # do test set validation if provided
+   if (!is.null(Xt) && !is.null(yt))
+   {
+      Xt = as.matrix(Xt)
+      yt = as.matrix(yt)
+      model$testres = predict.pls(model, Xt, yt)
+   }
+
+   model$ncomp.selected = ncomp
+   model$call = match.call()
+
+   class(model) = "pls"
+
+   model
+}
+
+pls.cal = function(X, y, ncomp, autoscale)
+{
+   X = as.matrix(X)
+   y = as.matrix(y)
+
+   if (autoscale > 0)
+   {
+      # find mean values for X and y
+      mX = apply(X, 2, mean)
+      my = apply(y, 2, mean)
+
+      if (autoscale == 2)
+      {
+         # calculate standard deviations for X variables
+         sdX = apply(X, 2, sd)
+         sdy = apply(y, 2, sd)
+      }
+      else
+      {
+         # use vector with ones if no standardization is needed
+         sdX = rep(1, ncol(X))
+         sdy = rep(1, ncol(y))
+      }
+
+      # autoscale X and y
+      X = scale(X, center = mX, scale = sdX)
+      #y = scale(y, center = my, scale = sdy)
+      y = y - my
+   }
+
+   # do SIMPLS
+   model = pls.simpls(X, y, ncomp)
+
+   model$autoscale = autoscale
+
+   if (autoscale > 0)
+   {
+      model$mX = mX
+      model$sdX = sdX
+      model$my = my
+      model$sdy = sdy
+   }
+
+   model$ncomp = ncomp
+
+   return (model)
+}
+
+## SIMPLS algorithm ##
+pls.simpls = function(X, y, ncomp, stripped = FALSE)
+{
+   X = as.matrix(X)
+   y = as.matrix(y)
+
+   objnames = rownames(X);
+   prednames = colnames(X);
+   respnames = colnames(y);
+
+   nobj = dim(X)[1]
+   npred = dim(X)[2]
+   nresp = 1   # single-response implementation
+
+   V = R = matrix(0, nrow = npred, ncol = ncomp)
+   tQ = matrix(0, nrow = ncomp, ncol = nresp)
+   B = array(0, dim = c(npred, nresp, ncomp))
+
+   if (!stripped) {
+      P = R
+      U = TT = matrix(0, nrow = nobj, ncol = ncomp)
+   }
+
+   S = crossprod(X, y)
+   for (a in 1:ncomp) {
+      q.a = 1
+      r.a = S %*% q.a
+      t.a = X %*% r.a
+      t.a = t.a - mean(t.a)
+      tnorm = sqrt(c(crossprod(t.a)))
+
+      t.a = t.a/tnorm
+      r.a = r.a/tnorm
+      p.a = crossprod(X, t.a)
+      q.a = crossprod(y, t.a)
+      v.a = p.a
+      if (a > 1) {
+         v.a = v.a - V %*% crossprod(V, p.a)
+      }
+      v.a = v.a/sqrt(c(crossprod(v.a)))
+      S = S - v.a %*% crossprod(v.a, S)
+      R[, a] = r.a
+      tQ[a, ] = q.a
+      V[, a] = v.a
+      B[, , a] = R[, 1:a, drop = FALSE] %*% tQ[1:a, , drop = FALSE]
+      if (!stripped) {
+         u.a = y %*% q.a
+         if (a > 1)
+            u.a = u.a - TT %*% crossprod(TT, u.a)
+         P[, a] = p.a
+         TT[, a] = t.a
+         U[, a] = u.a
+      }
+   }
+
+   B = B[, 1, ]
+
+   if (stripped) {
+      list(coeffs = B)
+   }
+   else {
+      lvnames = paste("Comp", 1:ncomp)
+      ncompnames = paste(1:ncomp, " components")
+      rownames(B) = prednames
+      colnames(B) = ncompnames
+      rownames(TT) = rownames(U) = objnames
+      colnames(TT) = colnames(U) = lvnames
+      list(
+         coeffs = B,
+         xloadings = P,
+         yloadings = t(tQ),
+         weights = R
+      )
+   }
+}
+
+pls.crossval = function(model, X, y, cv)
+{
+   X = as.matrix(X)
+   y = as.matrix(y)
+
+   autoscale = model$autoscale
+   ncomp = model$ncomp
+   nobj = nrow(X)
+   nvar = ncol(X)
+
+   # get matrix with indices for cv segments
+   idx = crossval(nobj, cv)
+
+   seglen = ncol(idx);
+
+   yp = matrix(0, nrow = nobj, ncol = ncomp)
+
+   # loop over segments
+   for (i in 1:nrow(idx))
+   {
+      ind = na.exclude(idx[i,])
+
+      if (length(ind) > 0)
+      {
+         Xc = X[-ind, , drop = F]
+         yc = y[-ind, , drop = F]
+         Xt = X[ind, , drop = F]
+         yt = y[ind, , drop = F]
+
+         m = pls.cal(Xc, yc, ncomp, autoscale)
+         res = predict.pls(m, Xt, stripped = T)
+         yp[ind, ] = res$yp
+      }
+   }
+
+   res = plsresult(yp, y)
+
+   return (res)
+}
+
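+# Editor's sketch (not part of the original file): cross-validated performance
+# per number of latent variables can be pulled out as a matrix, e.g.
+#
+#    m = pls(X, y, ncomp = 5, cv = 4)
+#    as.matrix(m$cvres)        # columns: RMSE, R^2, Slope, Bias
+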
+## select optimal ncomp for the model ##
+pls.selectncomp = function(model, ncomp)
+{
+   if (ncomp <= model$ncomp && ncomp > 0)
+   {
+      model$ncomp.selected = ncomp;
+      model$calres$ncomp.selected = ncomp
+
+      if (!is.null(model$cvres))
+         model$cvres$ncomp.selected = ncomp
+
+      if (!is.null(model$testres))
+         model$testres$ncomp.selected = ncomp
+   }
+
+   return (model)
+}
+
+predict.pls = function(model, X, y = NULL, stripped = FALSE)
+{
+   if (model$autoscale > 0)
+      X = scale(X, center = model$mX, scale = model$sdX)
+
+   yp = X %*% as.matrix(model$coeffs)
+
+   if (model$autoscale > 0)
+      yp = yp + model$my
+
+   if (stripped == FALSE)
+   {
+      xscores = X %*% (model$weights %*% solve(t(model$xloadings) %*% model$weights))
+      rownames(xscores) = rownames(X)
+      colnames(xscores) = paste("LV", 1:model$ncomp)
+
+      # y scores can only be computed if reference values are provided
+      yscores = NULL
+      if (!is.null(y))
+      {
+         # center y the same way as it was done for calibration
+         if (model$autoscale > 0)
+            yy = y - model$my
+         else
+            yy = y
+
+         yscores = as.matrix(yy) %*% model$yloadings
+         rownames(yscores) = rownames(X)
+         colnames(yscores) = paste("LV", 1:model$ncomp)
+      }
+
+      res = plsresult(yp, y, X = X,
+                      xscores = xscores,
+                      yscores = yscores,
+                      xloadings = model$xloadings,
+                      yloadings = model$yloadings
+      )
+   }
+   else
+   {
+      res = list(yp = yp)
+   }
+
+   return (res)
+}
+
+plotRMSE.pls = function(model,
+                        main = 'RMSE', xlab = 'ncomps', ylab = 'RMSE',
+                        type = 'b',
+                        show.legend = T)
+{
+   ncomp = model$ncomp
+
+   legend = c('cal')
+   cdata = cbind(1:ncomp, model$calres$rmse)
+   data = list(cdata = cdata)
+
+   if (!is.null(model$cvres))
+   {
+      cvdata = cbind(1:ncomp, model$cvres$rmse)
+      data$cvdata = cvdata
+      legend = c(legend, 'cv')
+   }
+
+   if (!is.null(model$testres))
+   {
+      testdata = cbind(1:ncomp, model$testres$rmse)
+      data$testdata = testdata
+      legend = c(legend, 'test')
+   }
+
+   if (show.legend == F)
+      legend = NULL
+
+   mdaplots.lineg(data, legend = legend, type = type,
+                  main = main, xlab = xlab, ylab = ylab)
+}
+
+plotXYScores.pls = function(model, ncomp = 1, main = 'XY Scores',
+                            show.labels = F, show.legend = T)
+{
+   main = sprintf('XY scores (ncomp = %d)', ncomp)
+
+   cdata = cbind(model$calres$xscores[, ncomp], model$calres$yscores[, ncomp])
+   colnames(cdata) = c('X scores', 'Y scores')
+   data = list(cdata = cdata)
+   legend = c('cal')
+
+   if (!is.null(model$testres))
+   {
+      tdata = cbind(model$testres$xscores[, ncomp], model$testres$yscores[, ncomp])
+      data$tdata = tdata
+      legend = c(legend, 'test')
+   }
+
+   if (show.legend == F)
+      legend = NULL
+
+   mdaplots.scatterg(data, legend = legend, show.labels = show.labels, main = main)
+}
+
+## plot with measured vs predicted y values ##
+plotPredictions.pls = function(model, ncomp = 0, main = 'Predictions',
+                               xlab = 'y, measured',
+                               ylab = 'y, predicted',
+                               show.labels = F,
+                               show.legend = T)
+{
+   if (ncomp == 0)
+      ncomp = model$ncomp.selected
+
+   if (ncomp > model$ncomp)
+   {
+      warning(sprintf('\nChosen ncomp is larger than model has. 
Use %d instead.', model$ncomp)) + ncomp = model$ncomp + } + + cdata = cbind(model$calres$y, model$calres$yp[, ncomp]) + colnames(cdata) = c('y, measured', 'y, predicted') + rownames(cdata) = rownames(model$calres$yp) + legend = c('cal') + data = list(cdata = cdata) + + if (!is.null(model$cvres)) + { + cvdata = cbind(model$cvres$y, model$cvres$yp[, ncomp]) + colnames(cvdata) = c('y, measured', 'y, predicted') + rownames(cvdata) = rownames(model$cvres$yp) + legend = c(legend, 'cv') + data$cvdata = cvdata + } + + if (!is.null(model$testres)) + { + testdata = cbind(model$testres$y, model$testres$yp[, ncomp]) + colnames(testdata) = c('y, measured', 'y, predicted') + rownames(testdata) = rownames(model$testres$yp) + legend = c(legend, 'test') + data$testdata = testdata + } + + if (show.legend == F) + legend = NULL + + mdaplots.scatterg(data, legend = legend, show.labels = show.labels, + main = main) +} + +plotYResiduals.pls = function(model, ncomp = 0, main = 'Y residuals', + xlab = 'y residuals', + ylab = 'y values', + show.labels = F, + show.legend = T) +{ + if (ncomp == 0) + ncomp = model$ncomp.selected + + if (ncomp > model$ncomp) + { + warning(sprintf('\nChosen ncomp is larger than model has. Use %d instead.', model$ncomp)) + ncomp = model$ncomp + } + + cdata = cbind(model$calres$y, model$calres$yp[, ncomp] - model$calres$y) + colnames(cdata) = c('y values', 'y residuals') + rownames(cdata) = rownames(model$calres$yp) + legend = c('cal') + data = list(cdata = cdata) + + if (!is.null(model$cvres)) + { + cvdata = cbind(model$cvres$y, model$cvres$yp[, ncomp] - model$cvres$y) + colnames(cvdata) = c('y values', 'y residuals') + rownames(cvdata) = rownames(model$cvres$yp) + legend = c(legend, 'cv') + data$cvdata = cvdata + } + + if (!is.null(model$testres)) + { + testdata = cbind(model$testres$y, model$testres$yp[, ncomp] - model$testres$y) + colnames(testdata) = c('y values', 'y residuals') + rownames(testdata) = rownames(model$testres$yp) + legend = c(legend, 'test') + data$testdata = testdata + } + + if (show.legend == F) + legend = NULL + + mdaplots.scatterg(data, legend = legend, show.labels = show.labels, + main = sprintf('Y residuals (ncomp = %d)', ncomp)) +} + +plotRegcoeffs.pls = function(model, main = 'Regression coefficients', + show.labels = F, ncomp = 0) +{ + if (ncomp == 0) + ncomp = model$ncomp.selected + + if (ncomp > model$ncomp) + { + warning(sprintf('\nChosen ncomp is larger than model has. 
Use %d instead.', model$ncomp)) + ncomp = model$ncomp + } + + data = cbind(1:nrow(model$coeffs), model$coeffs[, ncomp]) + colnames(data) = c('Variables', 'Coefficients') + rownames(data) = rownames(model$xloadings) + mdaplots.line(data, type = 'l', main = main, show.labels = show.labels) +} + + +plotXResiduals.pls = function(model, ncomp = NULL, show.labels = F, show.legend = T) +{ + if (is.null(ncomp)) + ncomp = model$ncomp.selected + cdata = cbind(model$calres$T2[, ncomp], model$calres$Q2[, ncomp]) + + colnames(cdata) = c('T2', 'Q2') + rownames(cdata) = rownames(model$calres$scores) + + data = list(cdata = cdata) + legend = NULL + + if (!is.null(model$testres)) + { + tdata = cbind(model$testres$T2[, ncomp], model$testres$Q2[, ncomp]) + colnames(tdata) = c('T2', 'Q2') + rownames(tdata) = rownames(model$testres$scores) + + data$tdata = tdata + if (show.legend == T) + legend = c('cal', 'test') + } + + mdaplots.scatterg(data, main = sprintf('X residuals (ncomp = %d)', ncomp), + show.labels = show.labels, + legend = legend) +} + +## makes a plot with regression results ## +plot.pls = function(model, show.legend = T, show.labels = F) +{ + par(mfrow = c(2, 2)) + plotXYScores(model, show.labels = show.labels, show.legend = show.legend) + plotRegcoeffs(model, show.labels = show.labels) + plotRMSE(model, show.legend = show.legend) + plotPredictions(model, show.labels = show.labels, show.legend = show.legend) + par(mfrow = c(1, 1)) +} + + +## show summary for a model ## +summary.pls = function(model) +{ + ncomp = model$ncomp.selected + cat('\nPLS model (class pls) summary\n') + cat('\nPerformance and validation:\n') + cat(sprintf('Selected LVs: %d\n\n', ncomp)) + cat(' ') + cat(sprintf('%6s\t', colnames(as.matrix(model$calres)))) + cat('\n') + cat('Cal: ') + cat(sprintf('%.4f\t', as.matrix(model$calres)[ncomp, , drop = F])) + cat('\n') + + if (!is.null(model$cvres)) + { + cat('CV: ') + cat(sprintf('%.4f\t', as.matrix(model$cvres)[ncomp, , drop = F])) + cat('\n') + } + + if (!is.null(model$testres)) + { + cat('Test: ') + cat(sprintf('%.4f\t', as.matrix(model$testres)[ncomp, , drop = F])) + cat('\n') + } + +} + +## print information about a model ## +print.pls = function(model, ...) +{ + cat('\nPLS model (class pls)\n') + cat('\nCall:\n') + print(model$call) + cat('\nMajor fields:\n') + cat('$coeffs - vector with regression coefficients\n') + cat('$xloadings - vector with X loadings\n') + cat('$yloadings - vector with Y loadings\n') + cat('$calres - results for calibration set\n') + if (!is.null(model$cvres)) + { + cat('$cvres - results for cross-validation\n') + } + if (!is.null(model$testres)) + { + cat('$testres - results for test set\n') + } + cat('\nTry summary(model) and plot(model) to see the results of validation\n') + +} diff --git a/R/plsregcoeffs.R b/R/plsregcoeffs.R new file mode 100644 index 0000000..4850675 --- /dev/null +++ b/R/plsregcoeffs.R @@ -0,0 +1,81 @@ +# class and methods for PLS regression coefficients # +plsregcoeffs = function(coeffs, ...) UseMethod("plsregcoeffs") + +## default method ## +plsregcoeffs.default = function(coeffs) +{ + plsregcoeffs = list(values = as.matrix(coeffs)) + plsregcoeffs$call = match.call() + + class(plsregcoeffs) = "plsregcoeffs" + + plsregcoeffs +} + +as.matrix.plsregcoeffs = function(plsregcoeffs, nlv = 0, ...) +{ + if (nlv == 0) + { + return (plsregcoeffs$values) + } + else + { + return (plsregcoeffs$values[, nlv]) + } +} + +print.plsregcoeffs = function(coeffs, nlv = 0, digits = 3, ...) 
+{
+   cat('\nRegression coefficients (class plsregcoeffs)\n')
+   if (nlv == 0) { nlv = ncol(coeffs$values) }
+   print(round(coeffs$values[, nlv], digits))
+}
+
+plot.plsregcoeffs = function(plsregcoeffs, nlv = 0, main = 'Regression coefficients',
+                             xlab = 'Variables', ylab = 'Coefficients',
+                             pch = 16, col = 'blue', ...)
+{
+   if (nlv == 0) { nlv = ncol(plsregcoeffs$values) }
+
+   coeffs = plsregcoeffs$values[, nlv]
+   ncoeff = length(coeffs)
+
+   # select limits for y axis
+   ylim = max(abs(coeffs))
+
+   # choose plot type depending on number of coefficients
+   if (ncoeff < 30) { type = 'b' } else { type = 'l' }
+
+   # show plot
+   plot(coeffs, type = type, col = col, pch = pch,
+        main = main,
+        xlab = xlab,
+        ylab = ylab,
+        ylim = c(-ylim, ylim),
+        axes = F
+   )
+   abline(h = 0)
+
+   if (is.null(dim(coeffs))) { names = names(coeffs) }
+   else { names = rownames(coeffs) }
+
+   # show axes and labels if needed
+   if (ncoeff > 20)
+   {
+      atx = seq(1, ncoeff, ncoeff/10)
+   }
+   else
+   {
+      atx = 1:ncoeff
+   }
+   axis(1, at = atx, labels = names[atx], cex.axis = 0.85)
+   axis(2, cex.axis = 0.85)
+   if (ncoeff < 30)
+   {
+      text(1:ncoeff, coeffs, names, cex = 0.6, pos = 3, col = 'gray')
+   }
+   grid()
+   box()
+}
\ No newline at end of file
diff --git a/R/plsresult.R b/R/plsresult.R
new file mode 100644
index 0000000..343f74c
--- /dev/null
+++ b/R/plsresult.R
@@ -0,0 +1,286 @@
+plsresult = function(yp, y, ...) UseMethod("plsresult")
+
+plsresult.default = function(yp, y = NULL, X = NULL, nlv = 0, xscores = NULL, yscores = NULL,
+                             xloadings = NULL, yloadings = NULL)
+{
+   yp = as.matrix(yp)
+
+   if (!is.null(xloadings))
+   {
+      # X residuals after all components; ldecomp.getDistances adds the
+      # deflated part back for every subset of components itself
+      xresiduals = X - xscores %*% t(xloadings)
+      xdist = ldecomp.getDistances(xscores, xloadings, xresiduals)
+      xvar = ldecomp.getVariances(xdist$Q2, sum(X^2))
+      Q2 = xdist$Q2
+      T2 = xdist$T2
+      xexpvar = xvar$expvar
+   }
+   else
+   {
+      xexpvar = NULL
+      Q2 = NULL
+      T2 = NULL
+   }
+
+   if (!is.null(yloadings) && !is.null(y))
+   {
+      yresiduals = y - yscores %*% t(yloadings)
+      ydist = ldecomp.getDistances(yscores, yloadings, yresiduals)
+      yvar = ldecomp.getVariances(ydist$Q2, sum(y^2))
+      yexpvar = yvar$expvar
+   }
+   else
+   {
+      yexpvar = NULL
+   }
+
+   if (!is.null(y))
+   {
+      y = as.numeric(y)
+
+      plsresult = list(
+         yp = yp,
+         y = y,
+         xscores = xscores,
+         yscores = yscores,
+         T2 = T2,
+         Q2 = Q2,
+         xexpvar = xexpvar,
+         yexpvar = yexpvar,
+         rmse = plsresult.rmse(y, yp),
+         slope = plsresult.slope(y, yp),
+         r2 = cor(y, yp)^2,
+         bias = apply(y - yp, 2, mean)
+      )
+   }
+   else
+   {
+      plsresult = list(
+         yp = yp,
+         y = y,
+         xscores = xscores,
+         yscores = yscores,
+         T2 = T2,
+         Q2 = Q2,
+         xexpvar = xexpvar,
+         yexpvar = yexpvar,
+         rmse = NULL,
+         slope = NULL,
+         r2 = NULL,
+         bias = NULL
+      )
+   }
+
+   if (nlv == 0) { nlv = ncol(yp) }
+
+   plsresult$nlvselected = nlv
+   plsresult$call = match.call()
+   class(plsresult) = "plsresult"
+
+   plsresult
+}
+
+# calculates root mean squared error for prediction
+plsresult.rmse = function(y, yp)
+{
+   return (sqrt(colSums((y - yp)^2)/length(y)))
+}
+
+# calculates slope of a line fit for (y, yp) values
+plsresult.slope = function(y, yp)
+{
+   nlv = dim(yp)[2]
+   slope = 1:nlv
+   for (a in 1:nlv)
+   {
+      m = lm(yp[, a] ~ y)
+      slope[a] = m$coefficients[2]
+   }
+
+   return (slope)
+}
+
+plsresult.plot_rmse = function(result, col = 'blue', type = 'b', main = 'RMSE', xlab = 'LVs',
+                               ylab = 'RMSE', pch = 16, xlim = NULL, ylim = NULL)
+{
+   if (!is.null(result$rmse))
+   {
+      nlv = length(result$rmse)
+
+      if (is.null(xlim) || is.null(ylim))
+      {
+         plot(1:nlv, result$rmse, type = type, col = col, main = main, pch = pch,
+              xlab = xlab, ylab = ylab)
+      }
+      else
+      {
+         plot(1:nlv, result$rmse, type = type, 
col = col, main = main, pch = pch, + xlab = xlab, ylab = ylab, xlim = xlim, ylim = ylim) + } + points(result$nlvselected, result$rmse[result$nlvselected], col = 'red') + grid() + } +} + +plsresult.points_rmse = function(result, col = 'blue', type = 'b', pch = 16) +{ + if (!is.null(result$rmse)) + { + nlv = length(result$rmse) + lines(1:nlv, result$rmse, type = type, col = col, pch = pch) + points(result$nlvselected, result$rmse[result$nlvselected], col = 'red') + grid() + } +} + +plsresult.plot_predictions = function(result, nlv = 0, col = 'blue', + labels = F, + main = 'Predictions', + xlab = NULL, ylab = 'y, predicted', + pch = 16, ...) +{ + if (nlv == 0) { nlv = result$nlvselected } + + if (!is.null(result$y)) + { + if (is.null(xlab)) { xlab = 'y, measured' } + + # show predicted versus measured values plot + plot(result$y, result$yp[, nlv], col = col, pch = pch, + main = main, xlab = xlab, ylab = ylab, cex.axis = 0.85, ...) + abline(lm(result$yp[, nlv] ~ result$y), col = col) + if (labels == T) + { + text(result$y, result$yp[,nlv], rownames(result$yp), cex = 0.6, pos = 3, col = 'gray') + } + } + else + { + if (is.null(xlab)) { xlab = 'Objects' } + + # show predicted y values for every object + plot(1:length(result$yp[, nlv]), result$yp[, nlv], + col = col, pch = pch, + main = main, + xlab = xlab, + ylab = ylab, + cex.axis = 0.85, + ... + ) + if (labels == T) + { + text(1:length(result$yp[, nlv]), result$yp[,nlv], rownames(result$yp), cex = 0.6, pos = 3, col = 'gray') + } + } + grid() +} + +plsresult.plot_xscores = function(result, nlv = c(1, 2), col = 'blue', main = 'X scores', + pch = 16, labels = F, ...) +{ + plot(result$xscores[, nlv[1]], result$xscores[, nlv[2]], col = col, pch = pch, + xlab = colnames(result$xscores)[nlv[1]], + ylab = colnames(result$xscores)[nlv[2]], + main = main, ...) + + if (labels == T) + { + text(result$xscores[, nlv[1]], result$xscores[, nlv[2]], rownames(result$xscores), + cex = 0.6, pos = 3, col = 'gray') + } +} + +plsresult.points_xscores = function(result, nlv = c(1, 2), col = 'blue', labels = F, pch = 16, ...) +{ + points(result$xscores[, nlv[1]], result$xscores[, nlv[2]], col = col, pch = pch, ...) + if (labels == T) + { + text(result$xscores[, nlv[1]], result$xscores[, nlv[2]], rownames(result$xscores), + cex = 0.6, pos = 3, col = 'gray') + } +} + +plsresult.plot_xyscores = function(result, nlv = 1) +{ + plot(result$xscores[, nlv], result$yscores[, nlv]) +} + +plsresult.plot_xresiduals = function() +{ + +} + +plsresult.plot_yresiduals = function() +{ + +} + +plsresult.points_predictions = function(result, nlv = 0, labels = F, col = 'blue', pch = 16, ...) +{ + if (nlv == 0) { nlv = result$nlvselected } + + if (!is.null(result$y)) + { + # show predicted versus measured values plot + points(result$y, result$yp[, nlv], col = col, pch = pch, ... ) + abline(lm(result$yp[, nlv] ~ result$y), col = col) + if (labels == T) + { + text(result$y, result$yp[,nlv], rownames(result$yp), cex = 0.6, pos = 3, col = 'gray') + } + } + else + { + # show predicted y values for every object + points(1:length(result$yp[, nlv]), result$yp[, nlv], col = col, pch = pch, ... 
+
+plot.plsresult = function(result, ...)
+{
+   plsresult.plot_predictions(result, ...)
+}
+
+as.matrix.plsresult = function(result, ...)
+{
+   nlv = result$nlvselected
+
+   if (!is.null(result$y))
+   {
+      res = matrix(c(result$rmse, result$r2, result$slope, result$bias), ncol = 4)
+      colnames(res) = c('RMSE', 'R^2', 'Slope', 'Bias')
+   }
+   else
+   {
+      res = NULL
+   }
+   return (res)
+}
+
+print.plsresult = function(result, ...)
+{
+   cat('\nPLS results (class plsresult)\n')
+   cat('\nPredictions:\n')
+   print(result$yp)
+}
+
+summary.plsresult = function(result, ...)
+{
+   cat('\nPLS results (class plsresult) summary\n')
+   if (!is.null(result$y))
+   {
+      nlv = result$nlvselected
+      res = as.matrix(result)[nlv, , drop = FALSE]
+
+      cat('\nPerformance:')
+      cat('\n')
+      cat(sprintf('\n\t%s: %d', 'Number of LVs', nlv))
+      cat(sprintf('\n\t%s: %.4f', colnames(res), res))
+   }
+   else
+   {
+      cat('No reference data provided to calculate prediction performance.')
+   }
+}
+
+
diff --git a/R/prep.R b/R/prep.R
new file mode 100644
index 0000000..0bea761
--- /dev/null
+++ b/R/prep.R
@@ -0,0 +1,68 @@
+prep =
+   setRefClass('prep',
+               fields = list(methods = 'list'),
+               methods = list(
+                  add = function(name, ...)
+                  {
+                     p = as.list(match.call(expand.dots = TRUE)[-1])
+                     methods <<- c(methods, c(name, p))
+                  }
+               )
+   )
+
+
+prep.autoscale = function(data, center = T, scale = F)
+{
+   # if logical flags are used, compute column means and standard deviations;
+   # numeric vectors are passed to scale() as they are
+   if (is.logical(center) && center == T)
+      center = apply(data, 2, mean)
+
+   if (is.logical(scale) && scale == T)
+      scale = apply(data, 2, sd)
+
+   data = scale(data, center = center, scale = scale)
+   attr(data, 'scaled:center') = NULL
+   attr(data, 'scaled:scale') = NULL
+   attr(data, 'prep:center') = center
+   attr(data, 'prep:scale') = scale
+
+   return (data)
+}
+
+prep.snv = function(X)
+{
+   # standard normal variate: center and scale every row (object) individually
+   X = t(scale(t(X), center = T, scale = T))
+   return (X)
+}
+
+prep.savgol <- function(TT, fl, forder = 1, idorder = 0)
+{
+   # Savitzky-Golay filtering: fl is filter length (odd number), forder is
+   # polynomial order and idorder is derivative order (0 for smoothing only)
+   nobj = nrow(TT)
+   TT2 = matrix(0, ncol = ncol(TT), nrow = nrow(TT))
+
+   for (i in 1:nobj)
+   {
+      x = TT[i, ]
+      dorder = idorder + 1
+
+      fc <- (fl - 1)/2
+      X <- outer(-fc:fc, 0:forder, FUN = "^")
+
+      Y <- pinv(X)
+      x2 <- convolve(x, rev(Y[dorder, ]), type = "o")
+      TT2[i, ] = x2[(fc + 1):(length(x2) - fc)]
+   }
+   TT2
+}
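+
+# Usage sketch (illustration only, assuming the Simdata set from this package):
+#   data(Simdata)
+#   spectra.d1 = prep.savgol(spectra, fl = 7, forder = 2, idorder = 1)
+# gives the first derivative of every spectrum, smoothed with a 7-point
+# quadratic filter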
+
+# Moore-Penrose pseudoinverse via singular value decomposition
+pinv <- function(A)
+{
+   s <- svd(A)
+   s$v %*% diag(1/s$d) %*% t(s$u)
+}
\ No newline at end of file
diff --git a/R/regcoeffs.R b/R/regcoeffs.R
new file mode 100644
index 0000000..89cde75
--- /dev/null
+++ b/R/regcoeffs.R
@@ -0,0 +1,73 @@
+# class and methods for regression coefficients #
+regcoeffs = function(coeffs, ...) UseMethod("regcoeffs")
+
+## default method ##
+regcoeffs.default = function(coeffs)
+{
+   regcoeffs = list(values = coeffs)
+   regcoeffs$call = match.call()
+
+   class(regcoeffs) = "regcoeffs"
+
+   regcoeffs
+}
+
+as.matrix.regcoeffs = function(regcoeffs, ...)
+{
+   return (regcoeffs$values)
+}
+
+print.regcoeffs = function(regcoeffs, digits = 3, ...)
+{
+   cat('\nRegression coefficients (class regcoeffs)\n')
+   print(round(regcoeffs$values, digits))
+}
+
+plot.regcoeffs = function(regcoeffs, main = 'Regression coefficients',
+                          xlab = 'Variables', ylab = 'Coefficients',
+                          pch = 16, col = 'blue', ...)
+{
+   # remove intercept
+   ncoeff = length(regcoeffs$values)
+   coeffs = regcoeffs$values[2:ncoeff]
+   ncoeff = ncoeff - 1
+
+   # select limits for y axis
+   ylim = max(abs(coeffs))
+
+   # choose plot type depending on number of coefficients
+   if (ncoeff < 30) { type = 'b' } else { type = 'l' }
+
+   # show plot
+   plot(coeffs, type = type, col = col, pch = pch,
+        main = main,
+        xlab = xlab,
+        ylab = ylab,
+        ylim = c(-ylim, ylim),
+        axes = F
+        )
+
+   # show horizontal line for zero level of the coefficients
+   abline(h = 0)
+
+   if (is.null(dim(coeffs))) { names = names(coeffs) }
+   else { names = rownames(coeffs) }
+
+   # show axes and labels if needed
+   if (ncoeff > 20)
+   {
+      atx = round(seq(1, ncoeff, length.out = 10))
+   }
+   else
+   {
+      atx = 1:ncoeff
+   }
+   axis(1, at = atx, labels = names[atx], cex.axis = 0.85)
+   axis(2, cex.axis = 0.85)
+   if (ncoeff < 30)
+   {
+      text(1:ncoeff, coeffs, names, cex = 0.6, pos = 3, col = 'gray')
+   }
+   grid()
+   box()
+}
\ No newline at end of file
diff --git a/R/selectCompNum.R b/R/selectCompNum.R
new file mode 100755
index 0000000..70d0dc9
--- /dev/null
+++ b/R/selectCompNum.R
@@ -0,0 +1,4 @@
+selectCompNum = function(object, ...)
+{
+   UseMethod("selectCompNum")
+}
\ No newline at end of file
diff --git a/data/.DS_Store b/data/.DS_Store
new file mode 100755
index 0000000..fd87c4b
Binary files /dev/null and b/data/.DS_Store differ
diff --git a/data/Jam.RData b/data/Jam.RData
new file mode 100755
index 0000000..c98c61e
Binary files /dev/null and b/data/Jam.RData differ
diff --git a/data/People.RData b/data/People.RData
new file mode 100755
index 0000000..df44f92
Binary files /dev/null and b/data/People.RData differ
diff --git a/data/Simdata.RData b/data/Simdata.RData
new file mode 100755
index 0000000..4bad916
Binary files /dev/null and b/data/Simdata.RData differ
diff --git a/man/.DS_Store b/man/.DS_Store
new file mode 100755
index 0000000..78002ef
Binary files /dev/null and b/man/.DS_Store differ
diff --git a/man/ldecomp.Rd b/man/ldecomp.Rd
new file mode 100644
index 0000000..d0fc883
--- /dev/null
+++ b/man/ldecomp.Rd
@@ -0,0 +1,65 @@
+\name{ldecomp}
+\alias{ldecomp}
+
+\title{
+Linear decomposition results.
+}
+
+\description{
+\code{ldecomp} is used to store and visualize results of a linear decomposition of a data matrix X in the form X = TP' + E (e.g. for PCA, MCR, PLS, etc).
+}
+
+\usage{
+ldecomp(scores, loadings, residuals, fullvar, tnorm = NULL, ncomp.selected = NULL)
+}
+
+\arguments{
+  \item{scores }{a matrix with score values (nobj x ncomp).}
+  \item{loadings }{a matrix with loading values (nvar x ncomp).}
+  \item{residuals }{a matrix with data residuals.}
+  \item{fullvar }{full variance for the data matrix.}
+  \item{tnorm }{a vector with singular values for scores normalization.}
+  \item{ncomp.selected }{selected number of components for the decomposition.}
+}
+
+\details{
+\code{ldecomp} is a general class for the decomposition X = TP' + E. It is used, for example, for PCA results (\code{\link{pcares}}), in PLS and other methods. The class also includes methods for calculation of residuals, variances, and so on.
+
+There is no need to use \code{ldecomp} manually. For example, when a PCA model is built with \code{\link{pca}}, the results automatically inherit all methods of \code{ldecomp}.
+}
+
+\value{
+Returns an object (list) of \code{ldecomp} class with the following fields:
+\item{scores }{matrix with score values (nobj x ncomp).}
+\item{residuals }{matrix with data residuals (nobj x nvar).}
+\item{T2 }{matrix with T2 distances (nobj x ncomp).}
+\item{Q2 }{matrix with Q2 distances (nobj x ncomp).}
+\item{ncomp.selected }{selected number of components.}
+\item{expvar }{explained variance for each component.}
+\item{cumexpvar }{cumulative explained variance.}
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\seealso{
+Methods for \code{ldecomp} objects:
+  \tabular{ll}{
+    \code{print.ldecomp} \tab shows information about the object.\cr
+    \code{summary.ldecomp} \tab shows statistics for the decomposition.\cr
+    \code{\link{plotScores.ldecomp}} \tab makes scores plot.\cr
+    \code{\link{plotVariance.ldecomp}} \tab makes explained variance plot.\cr
+    \code{\link{plotCumVariance.ldecomp}} \tab makes cumulative explained variance plot.\cr
+    \code{\link{plotResiduals.ldecomp}} \tab makes Q2 vs. T2 residuals plot.\cr
+  }
+
+Other methods, implemented in the class:
+  \tabular{ll}{
+    \code{\link{ldecomp.getDistances}} \tab calculates T2 and Q2 distances.\cr
+    \code{\link{ldecomp.getVariances}} \tab calculates explained variance.\cr
+    \code{\link{ldecomp.getResLimits}} \tab computes statistical limits for T2 and Q2 distances.\cr
+  }
+}
diff --git a/man/ldecomp.getDistances.Rd b/man/ldecomp.getDistances.Rd
new file mode 100644
index 0000000..7013c99
--- /dev/null
+++ b/man/ldecomp.getDistances.Rd
@@ -0,0 +1,47 @@
+\name{ldecomp.getDistances}
+\alias{ldecomp.getDistances}
+
+\title{
+Calculate T2 and Q2 distances
+}
+
+\description{
+\code{ldecomp.getDistances} is used to calculate T2 and Q2 distances for every object in an
+\code{ldecomp} decomposition.
+}
+
+\usage{
+ldecomp.getDistances(scores, loadings, residuals, tnorm = NULL)
+}
+
+\arguments{
+  \item{scores}{a matrix with score values (nobj x ncomp).}
+  \item{loadings}{a matrix with loading values (nvar x ncomp).}
+  \item{residuals}{a matrix with data residuals.}
+  \item{tnorm}{a vector with singular values for scores, used for normalization.}
+}
+
+\details{
+The T2 values (Hotelling T2 values) are calculated for normalized scores, so in fact each value is a squared Mahalanobis distance between the projection of an object to the latent variable space and the origin of this space.
+
+The Q2 values are squared distances from an original object to its projection. They show individual object variance, which is not explained by the latent variable space.
+
+The distances are calculated automatically when an \code{ldecomp} based model (e.g. PCA) is calibrated and/or applied.
+}
+
+\value{
+Returns a list with the following fields:
+\item{Q2 }{a matrix with calculated Q2 values (nobj x ncomp)}
+\item{T2 }{a matrix with calculated T2 values (nobj x ncomp)}
+\item{tnorm}{a vector with singular values for scores normalization}
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
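+
+\examples{
+## a small illustration (assumed workflow, not taken from the package itself):
+## distances for a two-component SVD decomposition of autoscaled People data
+library(mdatools)
+data(People)
+
+x = scale(people, center = TRUE, scale = TRUE)
+s = svd(x)
+scores = s$u[, 1:2] %*% diag(s$d[1:2])
+loadings = s$v[, 1:2]
+residuals = x - scores %*% t(loadings)
+
+res = ldecomp.getDistances(scores, loadings, residuals)
+plot(res$T2[, 2], res$Q2[, 2], xlab = 'T2', ylab = 'Q2')
+}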
diff --git a/man/pca.Rd b/man/pca.Rd
new file mode 100644
index 0000000..5926304
--- /dev/null
+++ b/man/pca.Rd
@@ -0,0 +1,113 @@
+\name{pca}
+\alias{pca}
+
+\title{
+Principal Component Analysis
+}
+
+\description{
+\code{pca} is used to build and explore a principal component analysis (PCA) model.
+}
+
+\usage{
+pca(data, ncomp = 20, center = T, scale = F, cv = NULL, info = '',
+    test.data = NULL, alpha = 0.05)
+}
+
+\arguments{
+  \item{data}{a numerical matrix with calibration data.}
+  \item{ncomp}{maximum number of components to calculate.}
+  \item{center}{logical, do mean centering of data or not.}
+  \item{scale}{logical, do standardization of data or not.}
+  \item{cv}{number of segments for random cross-validation (1 for full cross-validation).}
+  \item{info}{a short text line with model description.}
+  \item{test.data}{a numerical matrix with test data.}
+  \item{alpha}{significance level for calculating limit for Q2 residuals.}
+}
+
+\details{
+Principal components are calculated using the SVD (Singular Value Decomposition) method. By default \code{pca} uses the number of components (\code{ncomp}) equal to the smallest of: the number of objects minus one, the number of variables, and the default value (20). Besides this, there is also a parameter for selecting an optimal number of components (\code{ncomp.selected}). The optimal number of components is used to build a residuals plot (with Q2 residuals vs. T2 values), to calculate confidence limits for Q2 residuals, as well as for SIMCA classification.
+
+If the data contains missing values (NA), \code{pca} will use an iterative algorithm to replace them with the most probable values. The algorithm is implemented in the function \code{\link{pca.mvreplace}}. The same center and scale options will be used. You can also do this step manually before calling \code{pca} and play with extra options.
+
+}
+
+\value{
+Returns an object of \code{pca} class with the following fields:
+\item{ncomp }{number of components included in the model.}
+\item{loadings }{matrix with loading values (nvar x ncomp).}
+\item{eigenvals }{vector with eigenvalues for all existent components.}
+\item{expvar }{vector with explained variance for each component (in percent).}
+\item{cumexpvar }{vector with cumulative explained variance for each component (in percent).}
+\item{ncomp.selected }{selected (optimal) number of components.}
+\item{T2lim }{statistical limit for T2 distance.}
+\item{Q2lim }{statistical limit for Q2 distance.}
+\item{info }{information about the model, provided by the user when building the model.}
+\item{calres }{an object of class \code{\link{pcares}} with PCA results for the calibration data.}
+\item{testres }{an object of class \code{\link{pcares}} with PCA results for the test data, if it was provided.}
+\item{cvres }{an object of class \code{\link{pcacvres}} with PCA results for cross-validation, if this option was chosen.}
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\seealso{
+Methods for \code{pca} objects:
+  \tabular{ll}{
+    \code{plot.pca} \tab makes an overview of the PCA model with four main plots.\cr
+    \code{summary.pca} \tab shows some statistics for the model.\cr
+    \code{\link{selectCompNum.pca}} \tab sets the optimal number of components in the model.\cr
+    \code{\link{predict.pca}} \tab applies a PCA model to new data.\cr
+    \code{\link{plotScores.pca}} \tab shows scores plot.\cr
+    \code{\link{plotLoadings.pca}} \tab shows loadings plot.\cr
+    \code{\link{plotVariance.pca}} \tab shows explained variance plot.\cr
+    \code{\link{plotCumVariance.pca}} \tab shows cumulative explained variance plot.\cr
+    \code{\link{plotResiduals.pca}} \tab shows Q2 vs. T2 residuals plot.\cr
+  }
+  Most of the methods for plotting data are also available for PCA results
+  (\code{\link{pcares}}) objects.
+
+Other methods implemented in \code{pca}:
+  \tabular{ll}{
+    \code{\link{pca.mvreplace}} \tab replaces missing values in a data matrix with values approximated using iterative PCA decomposition.\cr
+  }
+}
+\examples{
+## makes a PCA for People data with autoscaling and full cross-validation ##
+
+library(mdatools)
+
+data(People)
+model = pca(people, cv = 1, center = T, scale = T, info = 'Simple PCA model')
+model = selectCompNum(model, 5)
+summary(model)
+plot(model, show.labels = T)
+
+## add some missing values, make a new model and compare score plots
+peoplemv = people
+peoplemv[2, 7] = NA
+peoplemv[6, 2] = NA
+peoplemv[10, 4] = NA
+peoplemv[22, 12] = NA
+
+modelmv = pca(peoplemv, center = T, scale = T, info = 'Model with missing values')
+par(mfrow = c(1, 2))
+plotScores(model, show.labels = T)
+plotScores(modelmv, show.labels = T)
+par(mfrow = c(1, 1))
+
+## show scores plot for calibration set with coloring points
+plotScores(model$calres, comp = c(1, 3), cgroup = people[, 'Income'],
+           show.labels = T, show.axes = F)
+
+}
+
+\keyword{ ~pca }
diff --git a/man/pca.mvreplace.Rd b/man/pca.mvreplace.Rd
new file mode 100644
index 0000000..ca62a7a
--- /dev/null
+++ b/man/pca.mvreplace.Rd
@@ -0,0 +1,61 @@
+\name{pca.mvreplace}
+\alias{pca.mvreplace}
+
+\title{
+Replace missing values in data
+}
+
+\description{
+\code{pca.mvreplace} is used to replace missing values in a data matrix with values approximated by iterative PCA decomposition.
+}
+
+\usage{
+pca.mvreplace(mvdata, center = T, scale = F, maxncomp = 7, expvarlim = 0.95,
+              covlim = 10^-6, maxiter = 100)
+}
+\arguments{
+  \item{mvdata}{a matrix with data, containing missing values.}
+  \item{center}{logical, do centering of data values or not.}
+  \item{scale}{logical, do standardization of data values or not.}
+  \item{maxncomp}{maximum number of components in the PCA model.}
+  \item{expvarlim}{minimum amount of variance, explained by chosen components (used for selection of the optimal number of components in PCA models).}
+  \item{covlim}{convergence criterion.}
+  \item{maxiter}{maximum number of iterations if the convergence criterion is not met.}
+}
+
+\details{
+The function uses iterative PCA modeling of the data to approximate and impute missing values. The result is most reliable for data sets with a low or moderate level of noise and with less than 10\% missing values for small data sets (up to 20\% for large ones).
+}
+
+\value{
+Returns the same matrix \code{mvdata} where missing values are replaced with approximated values.
+}
+
+\references{
+Philip R.C. Nelson, Paul A. Taylor, John F. MacGregor. Missing data methods in PCA and PLS: Score calculations with incomplete observations. Chemometrics and Intelligent Laboratory Systems, 35 (1), 1996.
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\examples{
+## A very simple example of imputing missing values in data with no noise
+
+# generate a matrix with values
+s = 1:6
+odata = cbind(s, 2*s, 4*s)
+
+# make a matrix with missing values
+mdata = odata
+mdata[5, 2] = mdata[2, 3] = NA
+
+# replace missing values with approximated
+rdata = pca.mvreplace(mdata, scale = T)
+
+# show all matrices together
+show(cbind(odata, mdata, round(rdata, 2)))
+}
diff --git a/man/pcacvres.Rd b/man/pcacvres.Rd
new file mode 100644
index 0000000..b2c48ce
--- /dev/null
+++ b/man/pcacvres.Rd
@@ -0,0 +1,74 @@
+\name{pcacvres}
+\alias{pcacvres}
+\title{
+Cross-validation results for PCA model
+}
+
+\description{
+\code{pcacvres} is used to store results of cross-validation of a PCA model.
+}
+\usage{
+pcacvres(T2, Q2, fullvar, ncomp.selected = NULL)
+}
+
+\arguments{
+  \item{T2 }{matrix with T2 distances calculated using cross-validation.}
+  \item{Q2 }{matrix with Q2 residuals calculated using cross-validation.}
+  \item{fullvar }{full variance of the data matrix.}
+  \item{ncomp.selected }{selected number of components.}
+}
+
+\details{
+There is no need to create a \code{pcacvres} object manually, it is created automatically when building a PCA model (see \code{\link{pca}}). The object can be used to show summary and plots for the results.
+}
+
+\value{
+Returns an object (list) of \code{pcacvres} and \code{ldecomp} classes with the following fields:
+\item{T2 }{matrix with T2 distances (nobj x ncomp).}
+\item{Q2 }{matrix with Q2 distances (nobj x ncomp).}
+\item{ncomp.selected }{selected number of components.}
+\item{expvar }{explained variance for each component.}
+\item{cumexpvar }{cumulative explained variance.}
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\seealso{
+Methods for \code{pcacvres} objects:
+  \tabular{ll}{
+    \code{print.pcacvres} \tab shows information about the object.\cr
+    \code{summary.pcacvres} \tab shows some statistics for the object.\cr
+  }
+
+Methods, inherited from the \code{\link{ldecomp}} class:
+  \tabular{ll}{
+    \code{\link{plotVariance.ldecomp}} \tab makes explained variance plot.\cr
+    \code{\link{plotCumVariance.ldecomp}} \tab makes cumulative explained variance plot.\cr
+    \code{\link{plotResiduals.ldecomp}} \tab makes Q2 vs. T2 residuals plot.\cr
+  }
+  Check also \code{\link{pca}} and \code{\link{pcares}}.
+}
+\examples{
+## make PCA model with full cross-validation and show cv results
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T, cv = 1)
+model = selectCompNum(model, 5)
+
+print(model$cvres)
+summary(model$cvres)
+
+par(mfrow = c(1, 2))
+plotResiduals(model$cvres, show.labels = T)
+plotVariance(model$cvres)
+par(mfrow = c(1, 1))
+}
+
+\keyword{ ~pca }
diff --git a/man/pcares.Rd b/man/pcares.Rd
new file mode 100644
index 0000000..b8079b9
--- /dev/null
+++ b/man/pcares.Rd
@@ -0,0 +1,78 @@
+\name{pcares}
+\alias{pcares}
+
+\title{
+Results of PCA decomposition
+}
+
+\description{
+\code{pcares} is used to store results of a PCA decomposition of data.
+}
+
+\usage{
+pcares(scores, loadings, residuals, fullvar, ncomp.selected = NULL)
+}
+\arguments{
+  \item{scores}{matrix with score values (nobj x ncomp).}
+  \item{loadings}{matrix with loading values (nvar x ncomp).}
+  \item{residuals }{matrix with data residuals.}
+  \item{fullvar }{full variance for the data matrix.}
+  \item{ncomp.selected}{selected number of components.}
+}
+\details{
+In fact \code{pcares} is a wrapper for \code{\link{ldecomp}} - a general class for storing
+results of a linear decomposition X = TP' + E. So, most of the methods, arguments and
+returned values are inherited from \code{ldecomp}.
+
+There is no need to create a \code{pcares} object manually, it is created automatically when building a PCA model (see \code{\link{pca}}) or applying the model to new data (see \code{\link{predict.pca}}). The object can be used to show summary and plots for the results.
+}
+\value{
+Returns an object (list) of classes \code{pcares} and \code{ldecomp} with the following fields:
+\item{scores }{matrix with score values (nobj x ncomp)}
+\item{T2 }{matrix with T2 distances (nobj x ncomp)}
+\item{Q2 }{matrix with Q2 distances (nobj x ncomp)}
+\item{ncomp.selected }{selected number of components}
+\item{expvar }{explained variance for each component}
+\item{cumexpvar }{cumulative explained variance}
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+
+\seealso{
+Methods for \code{pcares} objects:
+  \tabular{ll}{
+    \code{print.pcares} \tab shows information about the object.\cr
+    \code{summary.pcares} \tab shows some statistics for the object.\cr
+  }
+
+Methods, inherited from the \code{\link{ldecomp}} class:
+  \tabular{ll}{
+    \code{\link{plotScores.ldecomp}} \tab makes scores plot.\cr
+    \code{\link{plotVariance.ldecomp}} \tab makes explained variance plot.\cr
+    \code{\link{plotCumVariance.ldecomp}} \tab makes cumulative explained variance plot.\cr
+    \code{\link{plotResiduals.ldecomp}} \tab makes Q2 vs. T2 residuals plot.\cr
+  }
+  Check also \code{\link{pca}} and \code{\link{ldecomp}}.
+}
+
+\examples{
+## make PCA model and show results for calibration set
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T)
+model = selectCompNum(model, 5)
+
+print(model$calres)
+summary(model$calres)
+
+par(mfrow = c(1, 2))
+plotScores(model$calres, show.labels = T)
+plotResiduals(model$calres, show.labels = T)
+par(mfrow = c(1, 1))
+}
+
+\keyword{ ~pca }
diff --git a/man/plot.pca.Rd b/man/plot.pca.Rd
new file mode 100644
index 0000000..4029aa9
--- /dev/null
+++ b/man/plot.pca.Rd
@@ -0,0 +1,47 @@
+\name{plot.pca}
+\alias{plot.pca}
+
+\title{
+Plot overview of PCA model
+}
+
+\description{
+Shows four plots for a \code{pca} object: scores, loadings, residuals (Q2 vs. T2) and cumulative explained variance.
+}
+
+\usage{
+plot(model, comp = c(1, 2), show.labels = F, show.legend = T)
+}
+
+\arguments{
+  \item{model}{\code{pca} object (PCA model)}
+  \item{comp}{x and y components for scores and loadings plots}
+  \item{show.labels}{logical, show labels for objects on scores and residuals plots or not}
+  \item{show.legend}{logical, show legend or not (legend is available if test set or cross-validation was used)}
+}
+
+\details{
+If the model was validated with a test set or cross-validation, the results for calibration and validation are shown on the scores, residuals and explained variance plots as several groups of points (or lines) separated by color.
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\seealso{
+Main plots for pca model and results: \code{\link{plotScores.pca}}, \code{\link{plotLoadings.pca}}, \code{\link{plotResiduals.pca}}, \code{\link{plotVariance.pca}}, \code{\link{plotCumVariance.pca}}, \code{\link{plotEigenvalues.pca}}.
+}
+
+\examples{
+## make a PCA model for people data and show main plots
+library(mdatools)
+
+data(People)
+model = pca(people, scale = T, cv = 1)
+model = selectCompNum(model, 5)
+plot(model, c(1, 3), show.labels = T)
+
+}
+
+\keyword{ ~pca }
diff --git a/man/plotCumVariance.ldecomp.Rd b/man/plotCumVariance.ldecomp.Rd
new file mode 100644
index 0000000..c179607
--- /dev/null
+++ b/man/plotCumVariance.ldecomp.Rd
@@ -0,0 +1,47 @@
+\name{plotCumVariance.ldecomp}
+\alias{plotCumVariance.ldecomp}
+
+\title{
+Cumulative explained variance plot
+}
+\description{
+\code{plotCumVariance.ldecomp} is used to make a plot with cumulative data variance, explained by components.
+}
+\usage{
+plotCumVariance(obj, show.labels = F)
+}
+\arguments{
+  \item{obj }{an object of \code{ldecomp} class (e.g. PCA results).}
+  \item{show.labels }{logical, show or not labels with variance values for plot points.}
+}
+
+\details{
+Cumulative explained variance for component N is a sum of explained variance values for components from 1 to N.
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\seealso{
+\code{\link{plotVariance.ldecomp}} makes explained variance plot.
+}
+
+
+\examples{
+## make a PCA model and show cumulative explained variance plot for calibration set
+
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T)
+
+par(mfrow = c(1, 2))
+plotCumVariance(model$calres)
+plotCumVariance(model$calres, show.labels = T)
+par(mfrow = c(1, 1))
+
+}
diff --git a/man/plotCumVariance.pca.Rd b/man/plotCumVariance.pca.Rd
new file mode 100644
index 0000000..66e546d
--- /dev/null
+++ b/man/plotCumVariance.pca.Rd
@@ -0,0 +1,31 @@
+\name{plotCumVariance.pca}
+\alias{plotCumVariance.pca}
+
+\title{
+Cumulative explained variance plot for PCA model
+}
+
+\description{
+\code{plotCumVariance.pca} is used to make a plot with cumulative variance, explained by components in a PCA model.
+}
+
+\usage{
+plotCumVariance(obj, show.labels = F, show.legend = T)
+}
+\arguments{
+  \item{obj}{an object of \code{pca} class (PCA model).}
+  \item{show.labels}{logical, show or not labels for the plot points.}
+  \item{show.legend}{logical, show or not a legend for the plot.}
+}
+\details{
+If test set or cross-validation was used, the plot shows cumulative explained variance for the validation results as well.
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\examples{
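+## a basic illustration (assumed usage, following the other plot methods)
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T, cv = 1)
+plotCumVariance(model)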
+}
+
+\keyword{ ~pca }
diff --git a/man/plotLoadings.pca.Rd b/man/plotLoadings.pca.Rd
new file mode 100644
index 0000000..7c9eff8
--- /dev/null
+++ b/man/plotLoadings.pca.Rd
@@ -0,0 +1,45 @@
+\name{plotLoadings.pca}
+\alias{plotLoadings.pca}
+
+\title{
+Loadings plot for PCA model
+}
+\description{
+\code{plotLoadings.pca} is used to make loadings plot (scatter or line) for a PCA model.
+}
+
+\usage{
+plotLoadings(obj, comp = c(1, 2), show.labels = T, show.legend = T, type = "p")
+}
+\arguments{
+  \item{obj}{an object of \code{pca} class (PCA model).}
+  \item{comp}{number of components to make the plot for. If two values are used, a scatter plot is shown (otherwise a line plot).}
+  \item{show.labels}{logical, show or not labels for the plot points.}
+  \item{show.legend}{logical, show or not a legend if a line plot for several components is shown.}
+  \item{type}{if \code{comp} has two values, \code{type} allows forcing a line plot ('l') instead of a scatter plot ('p'). A line-scatter plot (type = 'b') can also be used.}
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\examples{
+## make a PCA model for People data and show loading plots
+
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T)
+
+par(mfrow = c(2, 2))
+plotLoadings(model)
+plotLoadings(model, c(1, 3))
+plotLoadings(model, c(1, 3), type = 'l')
+plotLoadings(model, 1:3, type = 'b')
+par(mfrow = c(1, 1))
+}
+
+\keyword{ ~pca }
+
diff --git a/man/plotResiduals.ldecomp.Rd b/man/plotResiduals.ldecomp.Rd
new file mode 100644
index 0000000..94ccbb4
--- /dev/null
+++ b/man/plotResiduals.ldecomp.Rd
@@ -0,0 +1,50 @@
+\name{plotResiduals.ldecomp}
+\alias{plotResiduals.ldecomp}
+
+\title{
+Residuals plot
+}
+\description{
+\code{plotResiduals.ldecomp} is used to make a plot with Hotelling T2 values vs. Q2 residuals.
+}
+
+\usage{
+plotResiduals(obj, ncomp = NULL, cgroup = NULL, show.labels = F, show.colorbar = T)
+}
+
+\arguments{
+  \item{obj}{an object of \code{ldecomp} class (e.g. PCA results).}
+  \item{ncomp}{number of "selected" components, the distances are calculated for.}
+  \item{cgroup}{vector with values used for color grouping of plot points (number of values should
+  be the same as number of objects).}
+  \item{show.labels}{logical, show or not labels for plot points.}
+  \item{show.colorbar}{logical, show or not a colorbar legend if \code{cgroup} is provided.}
+}
+
+\details{
+By default the number of components selected in a particular model is used. But since the T2 and Q2 values in \code{ldecomp} are calculated for all components, it is possible to look at the plot for any number.
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+
+\examples{
+## make a PCA model and show residual plots for calibration set
+
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T)
+model = selectCompNum(model, 5)
+
+par(mfrow = c(2, 2))
+plotResiduals(model$calres)
+plotResiduals(model$calres, 2, show.labels = T)
+plotResiduals(model$calres, cgroup = people[, 'Height'])
+plotResiduals(model$calres, 2, cgroup = people[, 'Height'], show.colorbar = F)
+par(mfrow = c(1, 1))
+}
diff --git a/man/plotResiduals.pca.Rd b/man/plotResiduals.pca.Rd
new file mode 100644
index 0000000..0a1b008
--- /dev/null
+++ b/man/plotResiduals.pca.Rd
@@ -0,0 +1,61 @@
+\name{plotResiduals.pca}
+\alias{plotResiduals.pca}
+
+\title{
+Residuals plot for PCA model
+}
+\description{
+\code{plotResiduals.pca} is used to show a plot with Hotelling T2 values vs. Q2 residuals for a PCA model.
+}
+\usage{
+plotResiduals(obj, ncomp = NULL, show.labels = F, show.legend = T, show.limits = T)
+}
+\arguments{
+  \item{obj}{an object of \code{pca} class (PCA model).}
+  \item{ncomp}{a number of components to show the residuals for.}
+  \item{show.labels}{logical, show or not labels for the plot points.}
+  \item{show.legend}{logical, show or not a legend if test or cross-validated results are also presented in the plot.}
+  \item{show.limits}{logical, show or not significance limits for T2 and Q2 values.}
+}
+\details{
+If the number of components is not provided, the selected value (\code{ncomp.selected}) is used.
+If the number of components is not equal to the selected value, the limits are not shown even if \code{show.limits} is set to \code{TRUE}.
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\examples{
+## Examples of using residuals plot
+
+library(mdatools)
+data(People)
+
+# make a PCA model and show residuals plots
+model = pca(people, scale = T)
+model = selectCompNum(model, 5)
+
+par(mfrow = c(2, 2))
+plotResiduals(model)
+plotResiduals(model, show.labels = T)
+plotResiduals(model, 3)
+plotResiduals(model, 4, show.labels = T)
+par(mfrow = c(1, 1))
+
+# make a model with full cross-validation and show residuals plots
+model = pca(people, scale = T, cv = 1)
+model = selectCompNum(model, 4)
+
+par(mfrow = c(2, 2))
+plotResiduals(model, show.labels = T)
+plotResiduals(model, show.labels = T, show.limits = F)
+plotResiduals(model, 3, show.legend = F)
+plotResiduals(model, 4, show.labels = T)
+par(mfrow = c(1, 1))
+
+}
diff --git a/man/plotScores.ldecomp.Rd b/man/plotScores.ldecomp.Rd
new file mode 100644
index 0000000..368a6a0
--- /dev/null
+++ b/man/plotScores.ldecomp.Rd
@@ -0,0 +1,51 @@
+\name{plotScores.ldecomp}
+\alias{plotScores.ldecomp}
+
+\title{
+Scores plot
+}
+\description{
+\code{plotScores.ldecomp} is used to make a scores plot for \code{ldecomp} objects (PCA or PLS results, etc.)
+}
+
+\usage{
+plotScores(obj, comp = c(1, 2), cgroup = NULL, show.labels = F,
+show.colorbar = T, show.axes = T)
+}
+\arguments{
+  \item{obj}{an object of \code{ldecomp} class (e.g. PCA results).}
+  \item{comp}{either one value or a vector with two values - number of components
+  to make the plot for. If one value is provided, the plot will be made for scores vs. objects.}
+  \item{cgroup}{vector with values used for color grouping of plot points (number of values should
+  be the same as number of objects).}
+  \item{show.labels}{logical, show or not labels for plot points.}
+  \item{show.colorbar}{logical, show or not a colorbar legend if \code{cgroup} is provided.}
+  \item{show.axes}{logical, show or not coordinate axes that cross the origin (0, 0).}
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\examples{
+## make a PCA model and show various score plots for calibration set
+
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T)
+
+par(mfrow = c(2, 2))
+plotScores(model$calres, show.labels = T)
+plotScores(model$calres, c(1, 3), show.labels = T, cgroup = people[, 'Income'])
+plotScores(model$calres, 2, cgroup = people[, 'Beer'])
+plotScores(model$calres, 2, cgroup = people[, 'Beer'], show.colorbar = F)
+par(mfrow = c(1, 1))
+
+}
diff --git a/man/plotScores.pca.Rd b/man/plotScores.pca.Rd
new file mode 100644
index 0000000..2a53d9e
--- /dev/null
+++ b/man/plotScores.pca.Rd
@@ -0,0 +1,53 @@
+\name{plotScores.pca}
+\alias{plotScores.pca}
+
+\title{
+Scores plot for PCA model
+}
+
+\description{
+\code{plotScores.pca} is used to make a scores plot for a PCA model.
+}
+
+\usage{
+plotScores(obj, comp = c(1, 2), show.labels = F, show.legend = T, show.axes = T)
+}
+
+\arguments{
+  \item{obj}{an object of \code{pca} class (PCA model).}
+  \item{comp}{either one value or a vector with two values - number of components
+  to make the plot for.
+  If one value is provided, the plot will be made for scores vs. objects.}
+  \item{show.labels}{logical, show or not labels for the plot points.}
+  \item{show.axes}{logical, show or not coordinate axes that cross the origin (0, 0).}
+  \item{show.legend}{logical, show or not a legend if test results are also presented in the plot.}
+}
+
+\details{
+The plot shows scores for objects from the calibration set and the test set, if the latter was provided. Color grouping (as in scores plot for PCA results) is not available here, because color is used to separate calibration and test objects.
+}
+
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\examples{
+## make a PCA model with test set validation and show score plots
+
+library(mdatools)
+data(People)
+
+tdata = people[seq(1, nrow(people), 4), ]
+cdata = people[-seq(1, nrow(people), 4), ]
+
+model = pca(cdata, scale = T, test.data = tdata)
+
+par(mfrow = c(2, 2))
+plotScores(model)
+plotScores(model, c(1, 3), show.labels = T)
+plotScores(model, c(1, 3), show.labels = T, show.legend = F)
+plotScores(model, 2, show.labels = T)
+par(mfrow = c(1, 1))
+}
+
+\keyword{ ~pca }
diff --git a/man/plotVariance.ldecomp.Rd b/man/plotVariance.ldecomp.Rd
new file mode 100644
index 0000000..4b69624
--- /dev/null
+++ b/man/plotVariance.ldecomp.Rd
@@ -0,0 +1,45 @@
+\name{plotVariance.ldecomp}
+\alias{plotVariance.ldecomp}
+
+\title{
+Explained variance plot
+}
+
+\description{
+\code{plotVariance.ldecomp} is used to make a plot with data variance, explained by each component.
+}
+\usage{
+plotVariance(obj, show.labels = F)
+}
+\arguments{
+  \item{obj}{an object of \code{ldecomp} class (e.g. PCA results).}
+  \item{show.labels}{logical, show or not labels with variance values for plot points.}
+}
+
+\details{
+The variance values are in percent of total variance.
+}
+
+\seealso{
+\code{\link{plotCumVariance.ldecomp}} makes cumulative explained variance plot.
+}
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\examples{
+## make a PCA model and show explained variance plot for calibration set
+
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T)
+
+par(mfrow = c(1, 2))
+plotVariance(model$calres)
+plotVariance(model$calres, show.labels = T)
+par(mfrow = c(1, 1))
+
+}
diff --git a/man/plotVariance.pca.Rd b/man/plotVariance.pca.Rd
new file mode 100644
index 0000000..43cb193
--- /dev/null
+++ b/man/plotVariance.pca.Rd
@@ -0,0 +1,44 @@
+\name{plotVariance.pca}
+\alias{plotVariance.pca}
+
+\title{
+Explained variance plot for PCA model
+}
+
+\description{
+\code{plotVariance.pca} is used to make a plot with variance, explained by each component in a PCA model.
+}
+
+\usage{
+plotVariance(obj, show.labels = F, show.legend = T)
+}
+
+\arguments{
+  \item{obj}{an object of \code{pca} class (PCA model).}
+  \item{show.labels}{logical, show or not labels for the plot points.}
+  \item{show.legend}{logical, show or not a legend for the plot.}
+}
+
+\details{
+If test set or cross-validation was used, the plot shows explained variance for the validation results as well.
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\examples{
+## make a PCA model for People data with full cross-validation and show variance plots
+
+library(mdatools)
+data(People)
+
+model = pca(people, scale = T, cv = 1)
+
+par(mfrow = c(1, 2))
+plotVariance(model)
+plotVariance(model, show.labels = T, show.legend = F)
+par(mfrow = c(1, 1))
+}
+
+\keyword{ ~pca }
diff --git a/man/predict.pca.Rd b/man/predict.pca.Rd
new file mode 100644
index 0000000..cdaf11c
--- /dev/null
+++ b/man/predict.pca.Rd
@@ -0,0 +1,61 @@
+\name{predict.pca}
+\alias{predict.pca}
+
+\title{
+Predict method for Principal Component Analysis
+}
+
+\description{
+Projects data to the principal component space defined by a \code{pca} object.
+}
+
+\usage{
+predict.pca(obj, data)
+}
+
+\arguments{
+  \item{obj}{object of class \code{pca} with PCA model.}
+  \item{data}{a numerical matrix with data.}
+}
+
+\value{
+Returns an object of class \code{pcares} with results for the data decomposition.
+}
+
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+
+\seealso{
+\code{\link{pca}} to make a PCA model, \code{\link{pcares}} to store and explore PCA results.
+}
+
+\examples{
+  library(mdatools)
+  data(People)
+
+  # take every fourth row as a new data set
+  tdata = people[seq(1, nrow(people), 4), ]
+
+  # take the rest as a calibration set
+  cdata = people[-seq(1, nrow(people), 4), ]
+
+  # make a PCA model
+  model = pca(cdata, scale = T)
+  model = selectCompNum(model, 4)
+
+  # apply model to the new data and show scores and residuals
+  res = predict(model, tdata)
+  par(mfrow = c(1, 2))
+  plotScores(res, show.labels = T)
+  plotResiduals(res, show.labels = T)
+  par(mfrow = c(1, 1))
+}
+
+\keyword{ ~pca }
diff --git a/man/regresult.Rd b/man/regresult.Rd
new file mode 100755
index 0000000..1ed62b5
--- /dev/null
+++ b/man/regresult.Rd
@@ -0,0 +1,41 @@
+\name{regresult}
+\alias{regresult}
+\title{
+Regression Results
+}
+\description{
+\code{regresult} is used to represent and visualise regression results: predictions and model performance.
+}
+\usage{
+regresult(yp, y = NULL)
+}
+\arguments{
+  \item{yp}{ response values, predicted by a regression model. }
+  \item{y}{ reference response values. }
+}
+\details{
+The regression results are returned by a corresponding model (e.g. \code{mlr} or \code{pls}); there is no need to use the class manually. The class is used to show a summary for the results, if reference values are provided, and a plot with predictions.
+}
+\value{
+  The prediction statistics (RMSE, R2, slope and bias) are calculated only if reference values were provided.
+  \item{yp}{ response values, predicted by a regression model. }
+  \item{y}{ reference response values. }
+  \item{rmse}{ root mean squared error of predictions. }
+  \item{slope}{ slope of fitted line for predicted vs. reference values. }
+  \item{r2}{ determination coefficient for predicted vs. reference values. }
+  \item{bias}{ bias (average error of predictions). }
+}
+\author{
+Sergey Kucheryavskiy (svkucheryavski@gmail.com)
+}
+
+\seealso{
+\code{\link{plot.regresult}} shows plots with predicted y values (or predicted vs. reference y values if they were provided), \code{\link{summary.regresult}} shows summary statistics for predictions, if reference values were provided.
+} +\examples{ +} diff --git a/man/selectCompNum.pca.Rd b/man/selectCompNum.pca.Rd new file mode 100644 index 0000000..2e56166 --- /dev/null +++ b/man/selectCompNum.pca.Rd @@ -0,0 +1,52 @@ +\name{selectCompNum.pca} +\alias{selectCompNum.pca} + +\title{ +Set number of optimal components +} + +\description{ +\code{selectCompNum} is used to select number of optimal components in a model. +} + +\usage{ +selectCompNum(obj, ncomp) +} + +\arguments{ + \item{obj}{an object of \code{pca} class (PCA model).} + \item{ncomp}{number of components to select.} +} + +\value{ +Returns the same model as provided as an argument, but where the parameter \code{ncomp.selected} is set to \code{ncomp} and all distances and limits are recalculated. If number is not set, number of all computed components will be considered as optimal. +} + +\author{ +Sergey Kucheryavski (svkucheryavski@gmail.com) +} + + +\examples{ +## make PCA model and show residuals for different number of selected components + +library(mdatools) +data(People) + +model = pca(people, ncomp = 8, scale = T) + +par(mfrow = c(2, 2)) +plotResiduals(model, show.labels = T) + +model = selectCompNum(model, 5) +plotResiduals(model, show.labels = T) + +model = selectCompNum(model, 3) +plotResiduals(model, show.labels = T) + +model = selectCompNum(model, 2) +plotResiduals(model, show.labels = T) + +} + +\keyword{ ~kwd1 } diff --git a/mdatools.Rproj b/mdatools.Rproj new file mode 100755 index 0000000..a323e5b --- /dev/null +++ b/mdatools.Rproj @@ -0,0 +1,16 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: No + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 3 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageInstallArgs: --no-multiarch diff --git a/test/.DS_Store b/test/.DS_Store new file mode 100644 index 0000000..53cca31 Binary files /dev/null and b/test/.DS_Store differ diff --git a/test/matlab/People.csv b/test/matlab/People.csv new file mode 100644 index 0000000..32a96b9 --- /dev/null +++ b/test/matlab/People.csv @@ -0,0 +1 @@ +,Height,Weight,Hairleng,Shoesize,Age,Income,Beer,Wine,Sex,Swim,Region,IQ Lars,198,92,-1,48,48,4.50E+04,420,115,-1,98,-1,100 Peter,184,84,-1,44,33,3.30E+04,350,102,-1,92,-1,130 Rasmus,183,83,-1,44,37,3.40E+04,320,98,-1,91,-1,127 Lene,166,47,-1,36,32,2.80E+04,270,78,1,75,-1,112 Mette,170,60,1,38,23,2.00E+04,312,99,1,81,-1,110 Gitte,172,64,1,39,24,2.20E+04,308,91,1,82,-1,102 Jens,182,80,-1,42,35,3.00E+04,398,65,-1,85,-1,140 Erik,180,80,-1,43,36,3.00E+04,388,63,-1,84,-1,129 Lotte,169,51,1,36,24,2.30E+04,250,89,1,78,-1,98 Heidi,168,52,1,37,27,2.35E+04,260,86,1,78,-1,100 Kaj,183,81,-1,42,37,3.50E+04,345,45,-1,90,-1,105 Gerda,157,47,1,36,32,3.20E+04,235,92,1,70,-1,127 Anne,164,50,1,38,41,3.40E+04,255,134,1,76,-1,101 Britta,162,49,1,37,40,3.40E+04,265,124,1,75,-1,108 Magnus,180,82,-1,44,43,3.70E+04,355,82,-1,88,-1,109 Casper,180,81,-1,44,46,4.20E+04,362,90,-1,86,-1,113 Luka,185,82,-1,45,26,1.60E+04,295,180,-1,92,1,109 Federico,187,84,-1,46,27,1.65E+04,299,178,-1,95,1,119 Dona,168,50,1,37,49,3.40E+04,170,162,1,76,1,135 Fabrizia,166,49,1,36,21,1.40E+04,150,245,1,75,1,123 Lisa,158,46,1,34,30,1.80E+04,120,120,1,70,1,119 Benito,177,65,-1,41,26,1.80E+04,209,160,-1,86,1,120 Franko,180,72,-1,43,33,1.90E+04,236,175,-1,85,1,115 Alessandro,181,75,-1,43,42,3.10E+04,198,161,-1,83,1,105 Leonora,163,50,1,36,18,1.10E+04,143,136,1,75,1,102 Giuliana,162,50,1,36,20,1.15E+04,133,146,1,74,1,132 Giovanni,176,68,-1,42,50,3.60E+04,195,177,-1,82,1,96 
Leonardo,175,67,1,42,55,3.80E+04,185,187,-1,80,1,105 Marta,165,51,1,36,36,2.60E+04,121,129,1,76,1,126 Rosetta,161,48,1,35,41,3.15E+04,116,196,1,75,1,120 Romeo,178,75,-1,42,30,2.40E+04,203,208,-1,81,1,118 Romina,160,48,1,35,40,3.10E+04,118,198,1,74,1,129 \ No newline at end of file diff --git a/test/matlab/People.mat b/test/matlab/People.mat new file mode 100644 index 0000000..096fec6 Binary files /dev/null and b/test/matlab/People.mat differ diff --git a/test/matlab/mand.m b/test/matlab/mand.m new file mode 100644 index 0000000..ac978a7 --- /dev/null +++ b/test/matlab/mand.m @@ -0,0 +1,23 @@ +rmin = 1; +rmax = 4; +rstep = 0.01; + +figure +hold on +for r = rmin:rstep:rmax + + xinit = 0.01; + + for j = 1:50 + xnext = r * xinit * (1 - xinit); + xinit = xnext; + end + + for j = 1:16 + xnext = r * xinit * (1 - xinit); + xinit = xnext; + scatter(r, xnext, '.b', 'SizeData', 2); + end + +end +hold off \ No newline at end of file diff --git a/test/matlab/pca_mvreplace.m b/test/matlab/pca_mvreplace.m new file mode 100644 index 0000000..82c73b7 --- /dev/null +++ b/test/matlab/pca_mvreplace.m @@ -0,0 +1,18 @@ +clear + +%% simple example +x = 1:10 +data = [x' 2*x' 3*x']; +data(5, 1) = NaN; +data(8, 3) = NaN; + +options = mdcheck('options'); +options.display = 'on'; + +[a, b, newdata] = mdcheck(data, options) + +newdata + +%% People example + +%load('People.mat') \ No newline at end of file diff --git a/test/matlab/test_pca.m b/test/matlab/test_pca.m new file mode 100644 index 0000000..2ed71e4 --- /dev/null +++ b/test/matlab/test_pca.m @@ -0,0 +1,30 @@ +clear +load 'People.mat'; + +options = pca('options'); +options.plots = 'none'; +options.preprocessing = {'autoscale'}; + +cvoptions = crossval('options'); +cvoptions.preprocessing = 2; +options.plots = 'none'; + +ncomp = 4; +model = pca(data, ncomp, options); +modelcv = crossval(data, [], 'pca', {'loo'}, ncomp, cvoptions); + +x1 = model.tsqs{1, 1}; +y1 = model.ssqresiduals{1, 1}; +x2 = modelcv.tsqs{1, 1}; +y2 = modelcv.ssqresiduals{1, 1}; +figure +hold on +scatter(x1, y1, '.b'); +scatter(x2, y2, '.r'); +hold off +text(x1, y1, obj_names); +text(x2, y2, obj_names); +line([model.detail.tsqlim{1} model.detail.tsqlim{1}], [min(y1) max(y1)]) +line([min(x1) max(x1)], [model.detail.reslim{1} model.detail.reslim{1}]) + +disp(model.detail.reslim) \ No newline at end of file diff --git a/test/test_mvreplace.R b/test/test_mvreplace.R new file mode 100755 index 0000000..5ad10b2 --- /dev/null +++ b/test/test_mvreplace.R @@ -0,0 +1,40 @@ +library(mdatools) +data(People) + +data = people +mvdata = data + +# add missing values +mvdata[c(1, 11, 17), 1] = NA +mvdata[3, 4] = NA +mvdata[c(5, 7, 9, 23), 8] = NA +mvdata[c(25, 27, 31, 32), 2] = NA +mvdata[c(4, 2), 12] = NA + +# make PCA models +mvdata2 = pca.mvreplace(mvdata, center = T, scale = T) +#res = prep.autoscale(data, scale = T) +#data = res$x + +show(cbind(data[, 1], mvdata[, 1], mvdata2[, 1])) + +model1 = pca(data, scale = T) +model2 = pca(mvdata2, scale = T) +par(mfrow = c(2, 2)) +plotScores(model1, show.labels = T) +plotScores(model2, show.labels = T) +plotLoadings(model1, show.labels = T) +plotLoadings(model2, show.labels = T) +par(mfrow = c(1, 1)) + +a = matrix(c(1:10, 2*(1:10), 3 * (1:10)), ncol = 3) +a[5, 1] = NA +a[8, 3] = NA + + +ap = pca.mvreplace(a, center = T, scale = F) + +show(cbind(a, round(ap, 2))) + + + diff --git a/test/test_pca.R b/test/test_pca.R new file mode 100755 index 0000000..fc21e40 --- /dev/null +++ b/test/test_pca.R @@ -0,0 +1,124 @@ +library(mdatools) + +do_people = T + +if (do_people == 
T) {
+   data(People)
+   data = people
+   data[4, 4] = NA
+   pcamodel = pca(data, ncomp = 8, scale = T, cv = 1, info = 'My first model')
+   pcamodel = selectCompNum(pcamodel, 5)
+} else
+{
+   ncobj = 200
+   ntobj = 100
+   nvar = 2000
+   ncomp = 10
+   values = rnorm((ncobj + ntobj) * nvar, 2, 2)
+   data = matrix(values[1:(ncobj * nvar)], nrow = ncobj, ncol = nvar)
+   tdata = matrix(values[(ncobj * nvar + 1):length(values)], nrow = ntobj, ncol = nvar)
+
+   t1 = system.time({
+      pcamodel = pca(data, ncomp = ncomp, scale = T, cv = 1, test.data = tdata)
+   })
+   t2 = system.time({
+      pcamodel = selectCompNum(pcamodel, 8)
+   })
+   show(t1)
+   show(t2)
+}
+
+show(pcamodel)
+summary(pcamodel)
+
+cat('\n1. Check variance plot for cv results\n')
+par(mfrow = c(2, 2))
+plotVariance(pcamodel$cvres)
+plotVariance(pcamodel$cvres, show.labels = T)
+plotCumVariance(pcamodel$cvres)
+plotCumVariance(pcamodel$cvres, show.labels = T)
+readline('Press Enter to continue...')
+
+cat('\n2. Check residual plot for cv results\n')
+par(mfrow = c(2, 2))
+plotResiduals(pcamodel$cvres)
+plotResiduals(pcamodel$cvres, show.labels = T)
+plotResiduals(pcamodel$cvres, ncomp = 2)
+plotResiduals(pcamodel$cvres, ncomp = 2, show.labels = T)
+readline('Press Enter to continue...')
+
+cat('\n3. print and summary for cv results\n')
+print(pcamodel$cvres)
+summary(pcamodel$cvres)
+readline('Press Enter to continue...')
+
+cat('\n4. Check scores plots for cal results\n')
+par(mfrow = c(2, 2))
+plotScores(pcamodel$calres)
+plotScores(pcamodel$calres, comp = 1, cgroup = data[, 1])
+plotScores(pcamodel$calres, c(1, 3), show.labels = T)
+plotScores(pcamodel$calres, c(1, 2), cgroup = data[, 1],
+           show.labels = T, show.colorbar = F, show.axes = F)
+readline('Press Enter to continue...')
+
+cat('\n5. Check residuals plots for cal results\n')
+par(mfrow = c(2, 2))
+plotResiduals(pcamodel$calres)
+plotResiduals(pcamodel$calres, ncomp = 3, show.labels = T)
+plotResiduals(pcamodel$calres, ncomp = 3, cgroup = data[, 1], show.colorbar = T)
+plotResiduals(pcamodel$calres, ncomp = 3, cgroup = data[, 2], show.labels = T)
+readline('Press Enter to continue...')
+
+cat('\n6. Check variance plot for cal results\n')
+par(mfrow = c(2, 2))
+plotVariance(pcamodel$calres)
+plotVariance(pcamodel$calres, show.labels = T)
+plotCumVariance(pcamodel$calres)
+plotCumVariance(pcamodel$calres, show.labels = T)
+readline('Press Enter to continue...')
+
+cat('\n7. print and summary for cal results\n')
+print(pcamodel$calres)
+summary(pcamodel$calres)
+readline('Press Enter to continue...')
+
+cat('\n8. Check scores plots for pca model\n')
+par(mfrow = c(2, 2))
+plotScores(pcamodel, 1, show.labels = T)
+plotScores(pcamodel)
+plotScores(pcamodel, c(1, 3), show.labels = T, show.legend = F)
+plotScores(pcamodel, c(1, 2), show.axes = F)
+readline('Press Enter to continue...')
+
+cat('\n9. Check loadings plots for pca model\n')
+par(mfrow = c(2, 2))
+plotLoadings(pcamodel)
+plotLoadings(pcamodel, c(1, 3), show.labels = F)
+plotLoadings(pcamodel, c(1, 3), type = 'b')
+plotLoadings(pcamodel, 1:4, show.legend = T)
+readline('Press Enter to continue...')
+
+cat('\n10. Check residuals plots for pca model\n')
+par(mfrow = c(2, 2))
+plotResiduals(pcamodel)
+plotResiduals(pcamodel, ncomp = 2)
+plotResiduals(pcamodel, show.labels = T)
+plotResiduals(pcamodel, ncomp = 1, show.legend = F)
+readline('Press Enter to continue...')
+
+cat('\n11. 
Check variance plots for pca model\n') +par(mfrow = c(2, 2)) +plotVariance(pcamodel) +plotVariance(pcamodel, show.legend = F, show.labels = T) +plotCumVariance(pcamodel) +plotCumVariance(pcamodel, show.legend = F, show.labels = T) +readline('Press Enter to continue...') + +cat('\n12. Check summary plot for pca model\n') +par(mfrow = c(1, 1)) +plot(pcamodel, show.labels = F) +readline('Press Enter to continue...') + + + + diff --git a/test/test_pls.R b/test/test_pls.R new file mode 100644 index 0000000..a3f9596 --- /dev/null +++ b/test/test_pls.R @@ -0,0 +1,35 @@ +library('mdatools') + +data(Simdata) + +nobj = nrow(spectra) +nvar = ncol(spectra) + +varstep = 1 +yvar = 1 + +X = spectra[1:70, seq(1, nvar, varstep)] +y = conc[1:70, yvar] +# y[5] = y[5] * 2 +Xt = spectra[71:nobj, seq(1, nvar, varstep)] +yt = conc[71:nobj, yvar] + +plsmodel = pls(X, y, Xt = Xt, yt = yt, autoscale = 1, cv = 0) +plsmodel = pls.selectncomp(plsmodel, 2) + +summary(plsmodel) +plot(plsmodel) +#par(mfrow = c(1, 2)) +#plotYResiduals(plsmodel, show.labels = T) +#plotXResiduals(plsmodel, show.labels = T) +#par(mfrow = c(2, 2)) +#plotRMSE(plsmodel) +plotXYScores(plsmodel, 1, show.labels = T) +#plotRegcoeffs(plsmodel) +plotPredictions(plsmodel, show.labels = T) + +#par(mfrow = c(1, 1)) +##plotResiduals(plsmodel, ncomp = 1) +#plotXLoadings(plsmodel, 2) +#plotWeights(plsmodel) +#plotYVariance(plsmodel) diff --git a/test/test_prep.R b/test/test_prep.R new file mode 100644 index 0000000..37d81b5 --- /dev/null +++ b/test/test_prep.R @@ -0,0 +1,5 @@ +x = prep() +x$add('savgol', 1, 2, 3) +x$add('autoscale') + +x \ No newline at end of file
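+
+# quick check of the standalone preprocessing functions (illustration only,
+# assuming the package is installed and the People data is available)
+library(mdatools)
+data(People)
+
+x = prep.autoscale(people, center = T, scale = T)
+show(round(colMeans(x), 6))       # all values should be 0 after centering
+show(round(apply(x, 2, sd), 6))   # all values should be 1 after standardization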