-
Notifications
You must be signed in to change notification settings - Fork 0
/
generateData.R
69 lines (64 loc) · 2.21 KB
/
generateData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
require(mlbench)
#generates itemsets then changes them by jumping items from one cluster into another
#Parameters:
#n: number of items
#cl: number of clusters
#r: radius of cluster
#sd: standard deviation of items' distribution
#periods: number of time points
#nm: Number of items which must jump from one time point to another.
# if NULL then it will be random between 1 and 0
generateData <- function(n=500, cl=4, r=6, sd=1.5, periods=20, nm=NULL){
set.seed(123)
if (is.null(nm)){
nm = as.integer(runif(periods - 1) * 20)
}
oldData <- initialData(n, cl, r, sd)
allData <- oldData
periods <- periods - 1
for( i in 1:periods){
# par(mar = c(3,3,1,1)) ## default is c(5,4,4,2) + 0.1
# plot(oldData$x, oldData$y, col=oldData$idtyp, pch=oldData$idtyp, xlab = 'X',
# ylab = 'Y', mgp = c(2, 1, 0))
newData <- mutateData(oldData, cl, nm[i])
allData <- merge(allData, newData, all=TRUE, sort=FALSE)
oldData <- newData
}
# par(mar = c(3,3,1,1)) ## default is c(5,4,4,2) + 0.1
# plot(oldData$x, oldData$y, col=oldData$idtyp, pch=oldData$idtyp, xlab = 'X',
# ylab = 'Y', mgp = c(2, 1, 0))
return (allData)
}
# creates next time point of data based on the previous data set and the number of items should be jumped
# data: previous time point dataset. it also introduce small randomness by changing the
# possition of items in the same cluster.
#cl: number of clusters
#nm: number of jump
mutateData <- function(data,cl, nm){
data$period <- data$period[1] + 1
lenDat <- length(data[,1])
sm <- sample(lenDat, nm)
len <- length(sm)
s1 <- sm[1:(len/2)]
s2 <- sm[((len/2)+1):len]
data$x[s1] <- -data$x[s1]
data$y[s2] <- -data$y[s2]
data$x <- data$x + data$x * runif(lenDat, 0.01, 0.02)
data$y <- data$y + data$y * runif(lenDat, 0.01, 0.02)
return (data)
}
# Creates the first dataset in a Cartesian coordinate.
#n: number of items
#cl: number of clusters
#r: radius of cluster
#sd: standard deviation of items' distribution
initialData <- function(n, cl, r, sd){
set.seed(138)
p <- mlbench.2dnormals(n, cl, r, sd)
idtyp <- as.integer(p$classes)
idsubj <- 1:n
period <- rep(1, n)
x <- p$x[,1]
y <- p$x[,2]
return (data.frame(idtyp, idsubj, period, x, y))
}