Skip to content

Commit

Permalink
v. 0.2.0
Browse files Browse the repository at this point in the history
- various improvements and documentation for PCA and related methods
  • Loading branch information
svkucheryavski committed Apr 29, 2013
0 parents commit 342e2ff
Show file tree
Hide file tree
Showing 63 changed files with 3,835 additions and 0 deletions.
10 changes: 10 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Package: mdatools
Title: Multivariate Data Analysis
Version: 0.2.0
Date: 2013-04-29
Author: Sergey Kucheryavskiy
Maintainer: Sergey Kucheryavskiy <[email protected]>
Description: The package contains useful functions for preprocessing, exploring and analysis of multivariate data.
Suggests:
Depends:
License: GPL-2
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
exportPattern("^[^\\.]")
Binary file added R/.DS_Store
Binary file not shown.
32 changes: 32 additions & 0 deletions R/classres.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# methods for classification results #
classres = function(res, ...) UseMethod("classres")

classres.getPredictionPerformance = function(res)
{
fn = NULL
fp = NULL
tp = NULL
sensitivity = NULL
specificity = NULL
f1 = NULL

if (!is.null(res.creference))
{
fn = sum((res.creference == 1) & (res.cpredictions == 0))
fp = sum((res.creference == 0) & (res.cpredictions == 1))
tp = sum((res.creference == 1) & (res.cpredictions == 1))

sensitivity = tp / (tp + fn)
specificity = tp / (tp + fp)
f1 = 2 * sensitivity * specificity / (sensitivity + specificity)
}

res$fn = fn
res$fp = fp
res$tp = tp
res$sensitivity = sensitivity
res$specificity = specificity
res$f1 = f1

return (res)
}
22 changes: 22 additions & 0 deletions R/crossval.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# function returns indices for cross-validation loop
crossval = function(nobj, nseg = NULL)
{
if (is.null(nseg))
{
if (nobj < 24) { nseg = nobj}
else if (nobj >= 24 && nobj < 40) { nseg = 8}
else if (nobj > 40) { nseg = 4 }
}
else if (nseg == 1)
{
nseg = nobj
}

seglen = ceiling(nobj / nseg)
fulllen = seglen * nseg

idx = c(sample(1:nobj), rep(NA, fulllen - nobj))
idx = matrix(idx, nrow = nseg, byrow = T)

return (idx)
}
318 changes: 318 additions & 0 deletions R/ldecomp.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
# class and methods for linear decomposition X = TP' + E #
ldecomp = function(scores, loadings, residuals, fullvar, ...) UseMethod("ldecomp")

ldecomp.default = function(scores, loadings, residuals, fullvar, tnorm = NULL, ncomp.selected = NULL)
{
# Creates an object of ldecomp class.
#
# Arguments:
# scores: matrix with score values (nobj x ncomp).
# loadings: matrix with loading values (nvar x ncomp).
# residuals: matrix with data residuals
# fullvar: full variance of original data, preprocessed and centered
# tnorm: singular values for score normalization
# ncomp.selected: number of selected components
#
# Returns:
# object (list) of class ldecomp with following fields:
# obj$scores: matrix with score values (nobj x ncomp).
# obj$residuals: matrix with residuals (nobj x nvar).
# obj$fullvar: full variance of original data
# obj$Q2: matrix with Q2 residuals (nobj x ncomp).
# obj$T2: matrix with T2 distances (nobj x ncomp)
# obj$ncomp.selected: selected number of components
# obj$expvar: explained variance for each component
# obj$cumexpvar: cumulative explained variance

scores = as.matrix(scores)
loadings = as.matrix(loadings)
residuals = as.matrix(residuals)

# check dimension
if (ncol(scores) != ncol(loadings) ||
nrow(scores) != nrow(residuals) || nrow(loadings) != ncol(residuals))
stop('Dimensions of scores, loadings and data do not correspond to each other!')

# set names for scores and loadings
rownames(scores) = rownames(residuals)
colnames(scores) = paste('Comp', 1:ncol(scores))
rownames(loadings) = colnames(residuals)
colnames(loadings) = paste('Comp', 1:ncol(loadings))

if (is.null(ncomp.selected))
ncomp.selected = ncol(scores)

# calculate residual distances and explained variance
obj = ldecomp.getDistances(scores, loadings, residuals, tnorm)
var = ldecomp.getVariances(obj$Q2, fullvar)

obj$expvar = var$expvar
obj$cumexpvar = var$cumexpvar
obj$scores = scores
obj$residuals = residuals
obj$fullvar = fullvar
obj$ncomp.selected = ncomp.selected
obj$call = match.call()

class(obj) = "ldecomp"

return (obj)
}

ldecomp.getDistances = function(scores, loadings, residuals, tnorm = NULL)
{
# Computes residual distances (Q2 and T2) for a decomposition.
# The distances are calculated for every 1:n components, where n
# goes from 1 to ncomp (number of columns in scores and loadings)
#
# Arguments:
# scores: matrix with scores (nobj x ncomp).
# loadings: matrix with loadings (nvar x ncomp)
# residuals: matrix with data residuals
#
# Returns:
# res$Q2: matrix with Q2 residuals (nobj x ncomp).
# res$T2: matrix with T2 distances (nobj x ncomp)

ncomp = ncol(scores)
nobj = nrow(scores)
T2 = matrix(0, nrow = nobj, ncol = ncomp)
Q2 = matrix(0, nrow = nobj, ncol = ncomp)

# calculate normalized scores
if (is.null(tnorm))
tnorm = sqrt(colSums(scores ^ 2)/(nrow(scores) - 1));
scoresn = sweep(scores, 2L, tnorm, '/', check.margin = F);

# calculate distances for each set of components
for (i in 1:ncomp)
{
if (i < ncomp)
res = residuals + scores[, (i + 1):ncomp, drop = F] %*% t(loadings[, (i + 1):ncomp, drop = F])
else
res = residuals

Q2[, i] = rowSums(res^2)
T2[, i] = rowSums(scoresn[, 1:i, drop = F]^2)
}

# set dimnames and return results
colnames(Q2) = colnames(T2) = colnames(scores)
rownames(Q2) = rownames(T2) = rownames(scores)

res = list(
Q2 = Q2,
T2 = T2,
tnorm = tnorm
)
}

ldecomp.getVariances = function(Q2, fullvar)
{
# Computes explained variance and cumulative explained variance
# for every component of a decomposition.
#
# Arguments:
# scores: matrix with scores.
# loadings: matrix with loadings.
# residuals: matrix with residuals.
# Q2: matrix with Q2 values
# Returns:
# res$expvar: vector with explained variance for every component
# res$cumexpvar: vector with cumulative explained variance

cumresvar = colSums(Q2) / fullvar * 100
cumexpvar = 100 - cumresvar
expvar = c(cumexpvar[1], diff(cumexpvar))

res = list(
expvar = expvar,
cumexpvar = cumexpvar
)
}

ldecomp.getResLimits = function(eigenvals, nobj, ncomp, alpha = 0.05)
{
# Computes statistical limits for Q2 residuals and T2 distances.
#
# Arguments:
# eigenvals: vector with eigenvalues for a model.
# nobj: number of objects in calibration data
# ncomp: number of selected components
# alpha: significance level for Q2 limits
#
# Returns:
# res$Q2lim: limit for Q2 residuals
# res$T2lim: limit for T2 distances

# calculate T2 limit using Hotelling statistics
if (nobj == ncomp)
T2lim = 0
else
T2lim = (ncomp * (nobj - 1) / (nobj - ncomp)) * qf(1 - alpha, ncomp, nobj - ncomp);

# calculate Q2 limit using F statistics
conflim = 100 - alpha * 100;

nvar = length(eigenvals)

if (ncomp < nvar)
{
eigenvals = eigenvals[(ncomp + 1):nvar]

cl = 2 * conflim - 100
t1 = sum(eigenvals)
t2 = sum(eigenvals^2)
t3 = sum(eigenvals^3)
h0 = 1 - 2 * t1 * t3/3/(t2^2);

if (h0 < 0.001)
h0 = 0.001

ca = sqrt(2) * erfinv(cl/100)
h1 = ca * sqrt(2 * t2 * h0^2)/t1
h2 = t2 * h0 * (h0 - 1)/(t1^2)
Q2lim = t1 * (1 + h1 + h2)^(1/h0)
}
else
Q2lim = 0

res = list(
T2lim = T2lim,
Q2lim = Q2lim
)
}

plotCumVariance.ldecomp = function(obj, show.labels = F)
{
# Shows cumulative explained variance plot.
#
# Arguments:
# obj: object of ldecomp class.
# show.labels: show or not labels for plot points

data = cbind(1:length(obj$cumexpvar), obj$cumexpvar)
data = rbind(c(0, 0), data)
colnames(data) = c('Components', 'Explained variance, %')
rownames(data) = round(c(0, obj$cumexpvar), 1)
mdaplots.linescatter(data, main = 'Cumulative variance', show.labels = show.labels)
}

plotVariance.ldecomp = function(obj, show.labels = F)
{
# Shows explained variance plot.
#
# Arguments:
# obj: object of ldecomp class.
# show.labels: show or not labels for plot points
#

data = cbind(1:length(obj$expvar), obj$expvar)
colnames(data) = c('Components', 'Explained variance, %')
rownames(data) = round(obj$expvar, 1)
mdaplots.linescatter(data, main = 'Variance', show.labels = show.labels)
}

plotScores.ldecomp = function(obj, comp = c(1, 2), cgroup = NULL,
show.labels = F, show.colorbar = T,
show.axes = T)
{
# Shows scores plot.
#
# Arguments:
# obj: object of ldecomp class.
# comp: which components to show on x and y axis.
# cgroup: variable for color grouping of plot points.
# show.labels: show or not labels for plot points.
# show.colorbar: show or not a colorbar legend if cgroup is provided.
# show.axes: show or not axes crossing (0, 0) point.

if (length(comp) == 1)
{
# scores vs objects
data = cbind(1:nrow(obj$scores), obj$scores[, comp])
colnames(data) = c('Objects', colnames(obj$scores)[comp])
rownames(data) = rownames(obj$scores)
mdaplots.scatter(data, main = 'Scores', cgroup = cgroup,
show.labels = show.labels,
show.colorbar = show.colorbar
)
}
else if (length(comp) == 2)
{
# scores vs scores
data = obj$scores[, c(comp[1], comp[2])]

if (show.axes == T)
show.lines = c(0, 0)
else
show.lines = F

mdaplots.scatter(data, main = 'Scores', cgroup = cgroup,
show.labels = show.labels,
show.colorbar = show.colorbar,
show.lines = show.lines)
}
else
{
stop('Wrong number of components!')
}
}

plotResiduals.ldecomp = function(obj, ncomp = NULL, cgroup = NULL,
show.labels = F, show.colorbar = T)
{
# Shows T2 vs Q2 residuals plot.
#
# Arguments:
# obj: object of ldecomp class.
# ncomp: number of components for the residuals
# cgroup: variable for color grouping of plot points.
# show.labels: show or not labels for plot points.
# show.colorbar: show or not a colorbar legend if cgroup is provided.

if (is.null(ncomp))
ncomp = obj$ncomp.selected

data = cbind(obj$T2[, ncomp], obj$Q2[, ncomp])
colnames(data) = c('T2', 'Q2')
mdaplots.scatter(data, main = sprintf('Residuals (ncomp = %d)', ncomp),
cgroup = cgroup, show.labels = show.labels,
show.colorbar = show.colorbar)
}

print.ldecomp = function(obj, str = NULL)
{
if (is.null(str))
str ='Results of data decomposition (class ldecomp)'

cat('\n')
cat(str, '\n')
cat('\nMajor fields:\n')
cat('$scores - matrix with score values (nobj x ncomp)\n')
cat('$T2 - matrix with T2 distances (nobj x ncomp)\n')
cat('$Q2 - matrix with Q2 residuals (nobj x ncomp)\n')
cat('$ncomp.selected - selected number of components\n')
cat('$expvar - explained variance for each component\n')
cat('$cumexpvar - cumulative explained variance\n\n')
}

summary.ldecomp = function(obj, str = NULL)
{
if (is.null(str))
str ='Summary for data decomposition (class ldecomp)'

cat('\n')
cat(str, '\n')
cat(sprintf('\nSelected components: %d\n\n', obj$ncomp.selected))

data = cbind(round(obj$expvar, 2),
round(obj$cumexpvar, 2))

colnames(data) = c('Exp. var', 'Cum. exp. var')
show(data)
}

erfinv = function (x) qnorm((1 + x)/2)/sqrt(2)


Loading

0 comments on commit 342e2ff

Please sign in to comment.