# simulated data
n = 200
mu1 = c(3,8); mu2 = c(8,3); mu3 = c(7,7); mu4 = c(2,2);
# each group's spread is set by the multiplier on rnorm() below (2, 2, 0.5, 5)
x1 = t(matrix(mu1,2,n/4)) + matrix(rnorm(n/2)*2, n/4,2)
x2 = t(matrix(mu2,2,n/4)) + matrix(rnorm(n/2)*2, n/4,2)
x3 = t(matrix(mu3,2,n/4)) + matrix(rnorm(n/2)*.5,n/4,2)
x4 = t(matrix(mu4,2,n/4)) + matrix(rnorm(n/2)*5, n/4,2)
x = rbind(x1,x2,x3,x4)
Y = c(rep(1,n/4),rep(2,n/4),rep(3,n/4),rep(4,n/4))
y = factor(Y)
plot(x[,1],x[,2],col=as.numeric(y),pch=16)
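As a quick sanity check on the simulation (this check is not part of the original code), the empirical group means should sit close to mu1 through mu4, with the fourth group much noisier than the third:

# illustrative check, not in the original post:
rowsum(x, Y)/(n/4)   # empirical group means, one row per group
table(Y)             # each group contains n/4 = 50 points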
# a step-by-step version of kmeans:
# plots the clusters and prints the improvement at each iteration
mv.kmeans = function(x, k, seed=101, maxit=100, cens=NULL){
  set.seed(seed)
  n = nrow(x)
  # if no starting centers are supplied, pick k data points at random
  if(is.null(cens)){
    cens = x[sample(1:n, k), ]
  }
  # plot the data together with the initial centers
  plot(x[,1], x[,2], pch=16)
  points(cens[,1], cens[,2], col=1:k, pch=16, cex=2)
  thr = 1e-6; ind = 1; iter = 1
  while(ind > thr && iter < maxit)
  {
    oldcen = cens
    # one Lloyd step: assign points to the current centers, then recompute the centers
    km = kmeans(x, centers=cens, iter.max=1, nstart=1)
    plot(x[,1], x[,2], col=km$cluster, pch=16)
    cens = km$centers
    points(cens[,1], cens[,2], col=1:k, pch=16, cex=2)
    iter = iter + 1
    # ind is the improvement in the estimates of the means:
    # the squared change in the center matrix, sum(diag((oldcen-cens)%*%t(oldcen-cens))),
    # which is the same as sum((oldcen-cens)^2)
    ind = sum(diag((oldcen-cens) %*% t(oldcen-cens)))
    print(ind)
  }
  return(list(centers=cens, cluster=km$cluster))
}
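Both expressions for the improvement are the same quantity: for A = oldcen - cens, the trace of A %*% t(A) is the sum of the squared entries of A. A small numerical check (illustrative only, not from the original post):

# illustrative: the two forms of the stopping criterion agree
A = matrix(rnorm(8), 4, 2)   # stands in for oldcen - cens with k = 4 centers
sum(diag(A %*% t(A)))        # trace form used inside mv.kmeans
sum(A^2)                     # elementwise form; same value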
mv.kmeans(x,4,maxit=10)
[Plots: the initial choice of centers, then the clusters after the 1st, 2nd, and 3rd iterations.]
Improvement for each iteration:
[1] 68.20739
[1] 1.615522
[1] 0
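For comparison (this run is not part of the original post), letting stats::kmeans iterate to convergence with several restarts should land on essentially the same four centers, although the cluster labels may come out in a different order:

# illustrative comparison with a full kmeans run, not in the original post
set.seed(101)
km.full = kmeans(x, centers=4, nstart=10)
km.full$centers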
# test 1: start all four centers from points in the lower-left corner of the data
inds = which(x[,1]<1 & x[,2]<1)
startc = x[sample(inds,4),]
mv.kmeans(x,4,maxit=10,cens=startc)
[1] 191.4699
[1] 47.31325
[1] 0.1695448
[1] 0
# test 2: five starting centers drawn from points near the left edge of the data
inds = which(x[,1]<1 & x[,2]<5)
startc = x[sample(inds,5),]
mv.kmeans(x,5,maxit=10,cens=startc)
# test 3: five starting centers drawn from points in the lower-right region
inds = which(x[,1]>5 & x[,2]<5)
startc = x[sample(inds,5),]
mv.kmeans(x,5,maxit=10,cens=startc)
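Tests 2 and 3 ask for five clusters even though the data come from four groups, so one of the true groups typically gets split. A quick way to see that k = 4 is the more natural choice (again illustrative, not part of the original post) is to compare the total within-cluster sum of squares over a small range of k:

# illustrative elbow check over k = 2..6, not in the original post
set.seed(101)
wss = sapply(2:6, function(k) kmeans(x, centers=k, nstart=10)$tot.withinss)
plot(2:6, wss, type="b", xlab="number of clusters k", ylab="total within-cluster SS")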