This file contains examples of the use of the weightedEM, unpack, and cwLocScat functions. It reproduces all the figures of the report “Analyzing cellwise weighted data” by P.J. Rousseeuw.
library("cellWise")
n = 10
d = 3
A = matrix(0.7, d, d); diag(A) = 1
A
## [,1] [,2] [,3]
## [1,] 1.0 0.7 0.7
## [2,] 0.7 1.0 0.7
## [3,] 0.7 0.7 1.0
set.seed(12345)
library("MASS")
X = mvrnorm(n, rep(0,d), A)
colnames(X) = c("X1","X2","X3")
X[1,3] = X[2,2] = X[3,1] = X[4,1] = X[6,2] = NA
X # rows 1, 2, 3, 4, 6 have NAs
## X1 X2 X3
## [1,] 0.81494704 0.5501005 NA
## [2,] 1.57453449 NA 0.5526973
## [3,] NA -0.2420284 0.2337319
## [4,] NA -0.5869239 0.2996664
## [5,] -0.24745384 0.9288529 0.9443676
## [6,] -0.74023654 NA -2.0885170
## [7,] 0.19707169 0.9746399 0.5190202
## [8,] -0.06183274 -0.1193971 -0.5598499
## [9,] 0.21050943 -0.7740109 -0.1989791
## [10,] -0.82944245 -0.9502024 -0.6871550
w = c(1,2,1,1,2,2,1,1,1,1) # rowwise weights
out = weightedEM(X,w,crit=1e-12,computeloglik=T)
out$niter # number of iteration steps taken
## [1] 131
out$mu
## X1 X2 X3
## 0.21182926 -0.28077406 -0.06154232
round(out$Sigma,6)
## X1 X2 X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466
plot(1:out$niter, out$loglikhd[1:out$niter], type='l',
lty=1, col=4, xlab='step', ylab='log(likelihood)',
main='log(likelihood) of weighted EM iterations')
tail(out$loglikhd) # would have NAs for computeloglik=F
## [1] -34.07189 -34.07189 -34.07189 -34.07189 -34.07189 -34.07189
out$impX # imputed data, has no NA's
## X1 X2 X3
## [1,] 0.81494704 0.55010045 0.7764194
## [2,] 1.57453449 0.01172782 0.5526973
## [3,] 0.49065646 -0.24202836 0.2337319
## [4,] 0.75818277 -0.58692386 0.2996664
## [5,] -0.24745384 0.92885285 0.9443676
## [6,] -0.74023654 -2.19170090 -2.0885170
## [7,] 0.19707169 0.97463990 0.5190202
## [8,] -0.06183274 -0.11939713 -0.5598499
## [9,] 0.21050943 -0.77401094 -0.1989791
## [10,] -0.82944245 -0.95020238 -0.6871550
# The weights may be multiplied by a constant:
#
(w = c(1,2,1,1,2,2,1,1,1,1)/3) # divide weights by 3
## [1] 0.3333333 0.6666667 0.3333333 0.3333333 0.6666667 0.6666667 0.3333333
## [8] 0.3333333 0.3333333 0.3333333
out = weightedEM(X,w,crit=1e-12,computeloglik=T)
out$niter # OK, same results:
## [1] 131
out$mu # same
## X1 X2 X3
## 0.21182926 -0.28077406 -0.06154232
round(out$Sigma,6) # same
## X1 X2 X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466
tail(out$loglikhd)
## [1] -11.3573 -11.3573 -11.3573 -11.3573 -11.3573 -11.3573
# converges to -11.3573 = -34.07189 / 3
# Create an equivalent matrix y without weights, by repeating
# some rows according to their integer weights:
#
Y = X[c(1,2,2,3,4,5,5,6,6,7,8,9,10),]
dim(Y)
## [1] 13 3
Y # This gives the same results:
## X1 X2 X3
## [1,] 0.81494704 0.5501005 NA
## [2,] 1.57453449 NA 0.5526973
## [3,] 1.57453449 NA 0.5526973
## [4,] NA -0.2420284 0.2337319
## [5,] NA -0.5869239 0.2996664
## [6,] -0.24745384 0.9288529 0.9443676
## [7,] -0.24745384 0.9288529 0.9443676
## [8,] -0.74023654 NA -2.0885170
## [9,] -0.74023654 NA -2.0885170
## [10,] 0.19707169 0.9746399 0.5190202
## [11,] -0.06183274 -0.1193971 -0.5598499
## [12,] 0.21050943 -0.7740109 -0.1989791
## [13,] -0.82944245 -0.9502024 -0.6871550
out = weightedEM(Y,crit=1e-12,computeloglik=T) # OK, same
out$niter
## [1] 131
out$mu
## X1 X2 X3
## 0.21182926 -0.28077406 -0.06154232
round(out$Sigma,6)
## X1 X2 X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466
tail(out$loglikhd)
## [1] -34.07189 -34.07189 -34.07189 -34.07189 -34.07189 -34.07189
# converges to -34.07189 like before.
X = matrix(c(2.8,5.3,4.9,7.4,
2.3,5.7,4.3,7.2,
2.5,5.1,4.4,7.6),nrow=3,byrow=T)
W = matrix(c(0.8,1.0,0.3,0.4,
0.3,0.5,0.9,0.5,
1.0,0.6,0,0.7),nrow=3,byrow=T)
rownames(X) = rownames(W) = c("A","B","C")
colnames(X) = colnames(W) = c("V1","V2","V3","V4")
n = nrow(X); d = ncol(X)
X
## V1 V2 V3 V4
## A 2.8 5.3 4.9 7.4
## B 2.3 5.7 4.3 7.2
## C 2.5 5.1 4.4 7.6
W
## V1 V2 V3 V4
## A 0.8 1.0 0.3 0.4
## B 0.3 0.5 0.9 0.5
## C 1.0 0.6 0.0 0.7
out = unpack(X,W)
cbind(out$U,out$v) # OK
## V1 V2 V3 V4
## A NA 5.3 NA NA 0.2
## A 2.8 5.3 NA NA 0.4
## A 2.8 5.3 NA 7.4 0.1
## A 2.8 5.3 4.9 7.4 0.3
## B NA NA 4.3 NA 0.4
## B NA 5.7 4.3 7.2 0.2
## B 2.3 5.7 4.3 7.2 0.3
## C 2.5 NA NA NA 0.3
## C 2.5 NA NA 7.6 0.1
## C 2.5 5.1 NA 7.6 0.6
dim(out$U)
## [1] 10 4
set.seed(12345)
n = 1000; d = 2
A = matrix(0.7, d, d); diag(A) = 1
A
## [,1] [,2]
## [1,] 1.0 0.7
## [2,] 0.7 1.0
X = mvrnorm(n, rep(0,d), A)
head(X)
## [,1] [,2]
## [1,] -0.1098666 1.1895284
## [2,] 0.6233152 0.6848755
## [3,] 0.2309203 -0.4324656
## [4,] -0.1164846 -0.7197229
## [5,] 0.7061365 0.4110647
## [6,] -0.9412289 -2.4109163
W = abs(mvrnorm(n, rep(0,d), diag(rep(1,2))))
W = W/max(as.vector(W))
W[2,1] = 0
W[5,2] = 0
head(W)
## [,1] [,2]
## [1,] 0.151856434 0.18102767
## [2,] 0.000000000 0.32052154
## [3,] 0.120772713 0.17167154
## [4,] 0.003729069 0.32719369
## [5,] 0.327865177 0.00000000
## [6,] 0.285126341 0.01091696
fit = cwLocScat(X,W)
fit$cwMLEiter # number of iteration steps
## [1] 47
fit$cwMLEmu
## 1 2
## 0.05972004 0.04231900
fit$cwMean
## 1 2
## 0.06657723 0.03736100
fit$cwMLEsigma
## 1 2
## 1 0.9717504 0.6735339
## 2 0.6735339 1.0075136
fit$cwCov # similar to cwMLEsigma:
## 1 2
## 1 0.9759485 0.7398607
## 2 0.7398607 1.0299615
fit$sqrtCov # same diagonal:
## 1 2
## 1 0.9759485 0.718826
## 2 0.7188260 1.029961
data("data_personality_traits")
X <- data_personality_traits$X
W <- data_personality_traits$W
cbind(X,W) # as in table in the paper
## t1 t2 t3 t4 t5 t6 t1 t2 t3 t4 t5 t6
## [1,] 7.0 5 7.0 5.0 5.0 5.0 0.50 0.29 0.50 0.29 0.29 0.29
## [2,] 10.0 10 10.0 7.0 8.5 7.0 1.00 1.00 1.00 0.50 0.58 0.50
## [3,] 5.0 5 10.0 5.0 5.0 5.0 0.29 0.29 1.00 0.29 0.29 0.29
## [4,] 10.0 10 10.0 5.0 5.0 5.0 1.00 1.00 1.00 0.29 0.29 0.29
## [5,] 7.0 7 8.5 5.0 5.0 5.0 0.50 0.50 0.58 0.29 0.29 0.29
## [6,] 10.0 5 5.0 8.5 8.5 5.0 1.00 0.29 0.29 0.58 0.58 0.29
## [7,] 5.0 7 7.0 5.0 5.0 8.5 0.29 0.50 0.50 0.29 0.29 0.58
## [8,] 10.0 10 10.0 10.0 10.0 10.0 1.00 1.00 1.00 1.00 1.00 1.00
## [9,] 8.5 7 8.5 5.0 5.0 5.0 0.58 0.50 0.58 0.29 0.29 0.29
## [10,] 5.0 10 5.0 7.0 5.0 7.0 0.29 1.00 0.29 0.50 0.29 0.50
out = unpack(X,W)
cbind(out$U,out$v)
## t1 t2 t3 t4 t5 t6
## 1 7.0 NA 7.0 NA NA NA 0.21
## 1 7.0 5 7.0 5.0 5.0 5.0 0.29
## 2 10.0 10 10.0 NA NA NA 0.42
## 2 10.0 10 10.0 NA 8.5 NA 0.08
## 2 10.0 10 10.0 7.0 8.5 7.0 0.50
## 3 NA NA 10.0 NA NA NA 0.71
## 3 5.0 5 10.0 5.0 5.0 5.0 0.29
## 4 10.0 10 10.0 NA NA NA 0.71
## 4 10.0 10 10.0 5.0 5.0 5.0 0.29
## 5 NA NA 8.5 NA NA NA 0.08
## 5 7.0 7 8.5 NA NA NA 0.21
## 5 7.0 7 8.5 5.0 5.0 5.0 0.29
## 6 10.0 NA NA NA NA NA 0.42
## 6 10.0 NA NA 8.5 8.5 NA 0.29
## 6 10.0 5 5.0 8.5 8.5 5.0 0.29
## 7 NA NA NA NA NA 8.5 0.08
## 7 NA 7 7.0 NA NA 8.5 0.21
## 7 5.0 7 7.0 5.0 5.0 8.5 0.29
## 8 10.0 10 10.0 10.0 10.0 10.0 1.00
## 9 8.5 NA 8.5 NA NA NA 0.08
## 9 8.5 7 8.5 NA NA NA 0.21
## 9 8.5 7 8.5 5.0 5.0 5.0 0.29
## 10 NA 10 NA NA NA NA 0.50
## 10 NA 10 NA 7.0 NA 7.0 0.21
## 10 5.0 10 5.0 7.0 5.0 7.0 0.29
fit = cwLocScat(X,W)
fit$cwMLEiter
## [1] 49
round(fit$cwMLEmu,2)
## t1 t2 t3 t4 t5 t6
## 8.82 8.72 8.98 7.40 7.53 7.37
round(fit$cwMean,2)
## t1 t2 t3 t4 t5 t6
## 8.73 8.61 8.87 7.09 7.16 7.09
round(fit$cwMLEsigma, 2)
## t1 t2 t3 t4 t5 t6
## t1 3.22 1.83 1.49 1.91 2.52 1.02
## t2 1.83 3.49 1.55 1.69 1.82 2.00
## t3 1.49 1.55 2.50 0.63 1.22 0.92
## t4 1.91 1.69 0.63 3.54 3.48 2.84
## t5 2.52 1.82 1.22 3.48 3.90 2.82
## t6 1.02 2.00 0.92 2.84 2.82 3.58
round(eigen(fit$cwMLEsigma)$values, 2)
## [1] 13.13 3.24 2.18 1.25 0.31 0.11
round(fit$cwCov, 2)
## t1 t2 t3 t4 t5 t6
## t1 3.36 2.10 1.72 2.63 3.33 1.37
## t2 2.10 3.61 1.78 2.20 2.44 2.46
## t3 1.72 1.78 2.59 0.81 1.64 1.14
## t4 2.63 2.20 0.81 3.99 4.17 3.26
## t5 3.33 2.44 1.64 4.17 4.76 3.40
## t6 1.37 2.46 1.14 3.26 3.40 4.00
round(eigen(fit$cwCov)$values,5)
## [1] 15.91356 2.99031 2.26943 1.08606 0.05570 0.00416
round(cov(X), 2) # unweighted
## t1 t2 t3 t4 t5 t6
## t1 4.96 1.61 1.44 2.01 3.00 0.07
## t2 1.61 4.93 1.38 1.39 1.26 2.17
## t3 1.44 1.38 4.04 -0.42 0.59 0.36
## t4 2.01 1.39 -0.42 3.29 3.25 1.93
## t5 3.00 1.26 0.59 3.25 3.90 1.89
## t6 0.07 2.17 0.36 1.93 1.89 3.29
round(eigen(cov(X))$values, 2)
## [1] 11.97 4.95 4.64 2.28 0.47 0.10
Now we reproduce the figure in the paper
ellips = function(covmat, mu, quant=0.95, npoints = 120)
{ # computes points of the ellipse t(X-mu)%*%covmat%*%(X-mu) = c
# with c = qchisq(quant,df=2)
if (!all(dim(covmat) == c(2, 2))) stop("covmat is not 2 by 2")
eig = eigen(covmat)
U = eig$vectors
R = U %*% diag(sqrt(eig$values)) %*% t(U) # square root of covmat
angles = seq(0, 2*pi, length = npoints+1)
xy = cbind(cos(angles),sin(angles)) # points on the unit circle
fac = sqrt(qchisq(quant, df=2))
scale(fac*xy%*%R, center = -mu, scale=FALSE)
}
n = nrow(X)
j = 3; k = 6 # to plot variables t3 and t6
xy = X[,c(j,k)]
cov2cor(cov(X)[c(j,k),c(j,k)]) # unweighted correlation is 0.10
## t3 t6
## t3 1.00000000 0.09896998
## t6 0.09896998 1.00000000
cov2cor(fit$cwMLEsigma[c(j,k),c(j,k)]) # now correlation is 0.31
## t3 t6
## t3 1.0000000 0.3077506
## t6 0.3077506 1.0000000
(wxy = W[,c(j,k)])
## t3 t6
## [1,] 0.50 0.29
## [2,] 1.00 0.50
## [3,] 1.00 0.29
## [4,] 1.00 0.29
## [5,] 0.58 0.29
## [6,] 0.29 0.29
## [7,] 0.50 0.58
## [8,] 1.00 1.00
## [9,] 0.58 0.29
## [10,] 0.29 0.50
duplicated(xy) # ties: row 4 equals row 3, and row 9 equals row 5
## [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
wxy[3,] = wxy[3,] + wxy[4,] # add cell weights of rows 3 and 4
wxy[5,] = wxy[5,] + wxy[9,] # add cell weights of rows 5 and 9
(wxy = wxy[-c(4,9),]) # remove duplicate rows
## t3 t6
## [1,] 0.50 0.29
## [2,] 1.00 0.50
## [3,] 2.00 0.58
## [4,] 1.16 0.58
## [5,] 0.29 0.29
## [6,] 0.50 0.58
## [7,] 1.00 1.00
## [8,] 0.29 0.50
(xy = xy[-c(4,9),]) # remove duplicate rows
## t3 t6
## [1,] 7.0 5.0
## [2,] 10.0 7.0
## [3,] 10.0 5.0
## [4,] 8.5 5.0
## [5,] 5.0 5.0
## [6,] 7.0 8.5
## [7,] 10.0 10.0
## [8,] 5.0 7.0
# pdf("personality_cwMLE_cwCov.pdf",width=5.5,height=5.5)
myxlim = c(2,14); myylim = c(1,13)
plot(xy, pch=16, col="white", xlim=myxlim, ylim=myylim,
xlab="",ylab="")
fac = 0.3 # for the size of the lines representing the cell weights
for(i in seq_len(nrow(xy))){
WY = c(xy[i,1] - fac*wxy[i,1],xy[i,2]) # (WestX, Y)
EY = c(xy[i,1] + fac*wxy[i,1],xy[i,2]) # (EastX, Y)
XS = c(xy[i,1],xy[i,2] - fac*wxy[i,2]) # (X, SouthY)
XN = c(xy[i,1],xy[i,2] + fac*wxy[i,2]) # (X, NorthY)
lines(rbind(WY,EY),lwd=3)
lines(rbind(XS,XN),lwd=3)
}
title(main="tolerance ellipses with and without cell weights",
line=0.8,cex.main=1) # 1.2)
title(xlab="trait 3",line=2.1,cex.lab=1.0)
title(ylab="trait 6",line=2.1,cex.lab=1.0)
center1 = colMeans(X[,c(j,k)])
covmat1 = (n-1)*cov(X[,c(j,k)])/n
ell1 = ellips(covmat1, center1)
lines(ell1,lwd=1.5,col="red") # ellipse from unweighted covariance
fit2 = cwLocScat(xy,wxy)
center2 = fit2$cwMLEmu
covmat2 = fit2$cwMLEsigma
ell2 = ellips(covmat2, center2) # ellipse from cwMLE estimates
lines(ell2,lwd=1.5,col="blue")
center3 = fit2$cwMean
covmat3 = fit2$cwCov
ell3 = ellips(covmat3, center3) # ellipse from cwMean and cwCov
lines(ell3,lwd=1.5,lty=2,col="blue")
legend("topleft",c("cwMLE","cwCov","MLE"),
col=c("blue","blue","red"), lty=c(1,2,1), lwd=1.5, cex=1)
# dev.off()
# The blue ellipses, that use the cell weights, are a bit
# higher and more slanted than the red ellipse that doesn't,
# mainly due to the high cell weight of the y-coordinate of
# the point in the top right corner.