Introduction

This file contains examples of the use of the weightedEM, unpack, and cwLocScat functions. It reproduces all the figures of the report “Analyzing cellwise weighted data” by P.J. Rousseeuw.

library("cellWise")

n = 10
d = 3
A = matrix(0.7, d, d); diag(A) = 1
A

##      [,1] [,2] [,3]
## [1,]  1.0  0.7  0.7
## [2,]  0.7  1.0  0.7
## [3,]  0.7  0.7  1.0

set.seed(12345)
library("MASS")
X = mvrnorm(n, rep(0,d), A)
colnames(X) = c("X1","X2","X3")
X[1,3] = X[2,2] = X[3,1] = X[4,1] = X[6,2] = NA
X # rows 1, 2, 3, 4, 6 have NAs

##                X1         X2         X3
##  [1,]  0.81494704  0.5501005         NA
##  [2,]  1.57453449         NA  0.5526973
##  [3,]          NA -0.2420284  0.2337319
##  [4,]          NA -0.5869239  0.2996664
##  [5,] -0.24745384  0.9288529  0.9443676
##  [6,] -0.74023654         NA -2.0885170
##  [7,]  0.19707169  0.9746399  0.5190202
##  [8,] -0.06183274 -0.1193971 -0.5598499
##  [9,]  0.21050943 -0.7740109 -0.1989791
## [10,] -0.82944245 -0.9502024 -0.6871550

w = c(1,2,1,1,2,2,1,1,1,1) # rowwise weights

out = weightedEM(X,w,crit=1e-12,computeloglik=T)
out$niter # number of iteration steps taken

## [1] 131

out$mu

##          X1          X2          X3 
##  0.21182926 -0.28077406 -0.06154232

round(out$Sigma,6)

##          X1       X2       X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466

plot(1:out$niter, out$loglikhd[1:out$niter], type='l',
     lty=1, col=4, xlab='step', ylab='log(likelihood)',
     main='log(likelihood) of weighted EM iterations')

plot of chunk unnamed-chunk-3

tail(out$loglikhd) # would have NAs for computeloglik=F

## [1] -34.07189 -34.07189 -34.07189 -34.07189 -34.07189 -34.07189

out$impX # imputed data, has no NA's

##                X1          X2         X3
##  [1,]  0.81494704  0.55010045  0.7764194
##  [2,]  1.57453449  0.01172782  0.5526973
##  [3,]  0.49065646 -0.24202836  0.2337319
##  [4,]  0.75818277 -0.58692386  0.2996664
##  [5,] -0.24745384  0.92885285  0.9443676
##  [6,] -0.74023654 -2.19170090 -2.0885170
##  [7,]  0.19707169  0.97463990  0.5190202
##  [8,] -0.06183274 -0.11939713 -0.5598499
##  [9,]  0.21050943 -0.77401094 -0.1989791
## [10,] -0.82944245 -0.95020238 -0.6871550

# The weights may be multiplied by a constant:
#
(w = c(1,2,1,1,2,2,1,1,1,1)/3) # divide weights by 3

##  [1] 0.3333333 0.6666667 0.3333333 0.3333333 0.6666667 0.6666667 0.3333333
##  [8] 0.3333333 0.3333333 0.3333333

out = weightedEM(X,w,crit=1e-12,computeloglik=T)
out$niter # OK, same results:

## [1] 131

out$mu # same

##          X1          X2          X3 
##  0.21182926 -0.28077406 -0.06154232

round(out$Sigma,6) # same

##          X1       X2       X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466

tail(out$loglikhd)

## [1] -11.3573 -11.3573 -11.3573 -11.3573 -11.3573 -11.3573

# converges to -11.3573 = -34.07189 / 3


# Create an equivalent matrix y without weights, by repeating
# some rows according to their integer weights:
#
Y = X[c(1,2,2,3,4,5,5,6,6,7,8,9,10),]
dim(Y)

## [1] 13  3

Y # This gives the same results:

##                X1         X2         X3
##  [1,]  0.81494704  0.5501005         NA
##  [2,]  1.57453449         NA  0.5526973
##  [3,]  1.57453449         NA  0.5526973
##  [4,]          NA -0.2420284  0.2337319
##  [5,]          NA -0.5869239  0.2996664
##  [6,] -0.24745384  0.9288529  0.9443676
##  [7,] -0.24745384  0.9288529  0.9443676
##  [8,] -0.74023654         NA -2.0885170
##  [9,] -0.74023654         NA -2.0885170
## [10,]  0.19707169  0.9746399  0.5190202
## [11,] -0.06183274 -0.1193971 -0.5598499
## [12,]  0.21050943 -0.7740109 -0.1989791
## [13,] -0.82944245 -0.9502024 -0.6871550

out = weightedEM(Y,crit=1e-12,computeloglik=T) # OK, same
out$niter

## [1] 131

out$mu

##          X1          X2          X3 
##  0.21182926 -0.28077406 -0.06154232

round(out$Sigma,6)

##          X1       X2       X3
## X1 0.660288 0.331107 0.474936
## X2 0.331107 1.088924 0.943985
## X3 0.474936 0.943985 1.002466

tail(out$loglikhd)

## [1] -34.07189 -34.07189 -34.07189 -34.07189 -34.07189 -34.07189

# converges to -34.07189 like before.

Unpack the toy example in section 2 of the paper

X = matrix(c(2.8,5.3,4.9,7.4,
             2.3,5.7,4.3,7.2,
             2.5,5.1,4.4,7.6),nrow=3,byrow=T)
W = matrix(c(0.8,1.0,0.3,0.4,
             0.3,0.5,0.9,0.5,
             1.0,0.6,0,0.7),nrow=3,byrow=T)
rownames(X) = rownames(W) = c("A","B","C")
colnames(X) = colnames(W) = c("V1","V2","V3","V4")
n = nrow(X); d = ncol(X)
X

##    V1  V2  V3  V4
## A 2.8 5.3 4.9 7.4
## B 2.3 5.7 4.3 7.2
## C 2.5 5.1 4.4 7.6

##    V1  V2  V3  V4
## A 0.8 1.0 0.3 0.4
## B 0.3 0.5 0.9 0.5
## C 1.0 0.6 0.0 0.7

out = unpack(X,W)
cbind(out$U,out$v) # OK

##    V1  V2  V3  V4    
## A  NA 5.3  NA  NA 0.2
## A 2.8 5.3  NA  NA 0.4
## A 2.8 5.3  NA 7.4 0.1
## A 2.8 5.3 4.9 7.4 0.3
## B  NA  NA 4.3  NA 0.4
## B  NA 5.7 4.3 7.2 0.2
## B 2.3 5.7 4.3 7.2 0.3
## C 2.5  NA  NA  NA 0.3
## C 2.5  NA  NA 7.6 0.1
## C 2.5 5.1  NA 7.6 0.6

dim(out$U)

## [1] 10  4

Playing with the function cwLocScat

set.seed(12345)
n = 1000; d = 2
A = matrix(0.7, d, d); diag(A) = 1
A

##      [,1] [,2]
## [1,]  1.0  0.7
## [2,]  0.7  1.0

X = mvrnorm(n, rep(0,d), A)
head(X)

##            [,1]       [,2]
## [1,] -0.1098666  1.1895284
## [2,]  0.6233152  0.6848755
## [3,]  0.2309203 -0.4324656
## [4,] -0.1164846 -0.7197229
## [5,]  0.7061365  0.4110647
## [6,] -0.9412289 -2.4109163

W = abs(mvrnorm(n, rep(0,d), diag(rep(1,2))))
W = W/max(as.vector(W))
W[2,1] = 0
W[5,2] = 0
head(W)

##             [,1]       [,2]
## [1,] 0.151856434 0.18102767
## [2,] 0.000000000 0.32052154
## [3,] 0.120772713 0.17167154
## [4,] 0.003729069 0.32719369
## [5,] 0.327865177 0.00000000
## [6,] 0.285126341 0.01091696

fit = cwLocScat(X,W)
fit$cwMLEiter # number of iteration steps

## [1] 47

fit$cwMLEmu

##          1          2 
## 0.05972004 0.04231900

fit$cwMean

##          1          2 
## 0.06657723 0.03736100

fit$cwMLEsigma

##           1         2
## 1 0.9717504 0.6735339
## 2 0.6735339 1.0075136

fit$cwCov # similar to cwMLEsigma:

##           1         2
## 1 0.9759485 0.7398607
## 2 0.7398607 1.0299615

fit$sqrtCov # same diagonal:

##           1        2
## 1 0.9759485 0.718826
## 2 0.7188260 1.029961

Personality traits example from section 4

data("data_personality_traits")
X <- data_personality_traits$X
W <- data_personality_traits$W
cbind(X,W) # as in table in the paper

##         t1 t2   t3   t4   t5   t6   t1   t2   t3   t4   t5   t6
##  [1,]  7.0  5  7.0  5.0  5.0  5.0 0.50 0.29 0.50 0.29 0.29 0.29
##  [2,] 10.0 10 10.0  7.0  8.5  7.0 1.00 1.00 1.00 0.50 0.58 0.50
##  [3,]  5.0  5 10.0  5.0  5.0  5.0 0.29 0.29 1.00 0.29 0.29 0.29
##  [4,] 10.0 10 10.0  5.0  5.0  5.0 1.00 1.00 1.00 0.29 0.29 0.29
##  [5,]  7.0  7  8.5  5.0  5.0  5.0 0.50 0.50 0.58 0.29 0.29 0.29
##  [6,] 10.0  5  5.0  8.5  8.5  5.0 1.00 0.29 0.29 0.58 0.58 0.29
##  [7,]  5.0  7  7.0  5.0  5.0  8.5 0.29 0.50 0.50 0.29 0.29 0.58
##  [8,] 10.0 10 10.0 10.0 10.0 10.0 1.00 1.00 1.00 1.00 1.00 1.00
##  [9,]  8.5  7  8.5  5.0  5.0  5.0 0.58 0.50 0.58 0.29 0.29 0.29
## [10,]  5.0 10  5.0  7.0  5.0  7.0 0.29 1.00 0.29 0.50 0.29 0.50

out = unpack(X,W)
cbind(out$U,out$v)

##      t1 t2   t3   t4   t5   t6     
## 1   7.0 NA  7.0   NA   NA   NA 0.21
## 1   7.0  5  7.0  5.0  5.0  5.0 0.29
## 2  10.0 10 10.0   NA   NA   NA 0.42
## 2  10.0 10 10.0   NA  8.5   NA 0.08
## 2  10.0 10 10.0  7.0  8.5  7.0 0.50
## 3    NA NA 10.0   NA   NA   NA 0.71
## 3   5.0  5 10.0  5.0  5.0  5.0 0.29
## 4  10.0 10 10.0   NA   NA   NA 0.71
## 4  10.0 10 10.0  5.0  5.0  5.0 0.29
## 5    NA NA  8.5   NA   NA   NA 0.08
## 5   7.0  7  8.5   NA   NA   NA 0.21
## 5   7.0  7  8.5  5.0  5.0  5.0 0.29
## 6  10.0 NA   NA   NA   NA   NA 0.42
## 6  10.0 NA   NA  8.5  8.5   NA 0.29
## 6  10.0  5  5.0  8.5  8.5  5.0 0.29
## 7    NA NA   NA   NA   NA  8.5 0.08
## 7    NA  7  7.0   NA   NA  8.5 0.21
## 7   5.0  7  7.0  5.0  5.0  8.5 0.29
## 8  10.0 10 10.0 10.0 10.0 10.0 1.00
## 9   8.5 NA  8.5   NA   NA   NA 0.08
## 9   8.5  7  8.5   NA   NA   NA 0.21
## 9   8.5  7  8.5  5.0  5.0  5.0 0.29
## 10   NA 10   NA   NA   NA   NA 0.50
## 10   NA 10   NA  7.0   NA  7.0 0.21
## 10  5.0 10  5.0  7.0  5.0  7.0 0.29

fit = cwLocScat(X,W)
fit$cwMLEiter

## [1] 49

round(fit$cwMLEmu,2)

##   t1   t2   t3   t4   t5   t6 
## 8.82 8.72 8.98 7.40 7.53 7.37

round(fit$cwMean,2)

##   t1   t2   t3   t4   t5   t6 
## 8.73 8.61 8.87 7.09 7.16 7.09

round(fit$cwMLEsigma, 2)

##      t1   t2   t3   t4   t5   t6
## t1 3.22 1.83 1.49 1.91 2.52 1.02
## t2 1.83 3.49 1.55 1.69 1.82 2.00
## t3 1.49 1.55 2.50 0.63 1.22 0.92
## t4 1.91 1.69 0.63 3.54 3.48 2.84
## t5 2.52 1.82 1.22 3.48 3.90 2.82
## t6 1.02 2.00 0.92 2.84 2.82 3.58

round(eigen(fit$cwMLEsigma)$values, 2)

## [1] 13.13  3.24  2.18  1.25  0.31  0.11

round(fit$cwCov, 2)

##      t1   t2   t3   t4   t5   t6
## t1 3.36 2.10 1.72 2.63 3.33 1.37
## t2 2.10 3.61 1.78 2.20 2.44 2.46
## t3 1.72 1.78 2.59 0.81 1.64 1.14
## t4 2.63 2.20 0.81 3.99 4.17 3.26
## t5 3.33 2.44 1.64 4.17 4.76 3.40
## t6 1.37 2.46 1.14 3.26 3.40 4.00

round(eigen(fit$cwCov)$values,5)

## [1] 15.91356  2.99031  2.26943  1.08606  0.05570  0.00416

round(cov(X), 2) # unweighted

##      t1   t2    t3    t4   t5   t6
## t1 4.96 1.61  1.44  2.01 3.00 0.07
## t2 1.61 4.93  1.38  1.39 1.26 2.17
## t3 1.44 1.38  4.04 -0.42 0.59 0.36
## t4 2.01 1.39 -0.42  3.29 3.25 1.93
## t5 3.00 1.26  0.59  3.25 3.90 1.89
## t6 0.07 2.17  0.36  1.93 1.89 3.29

round(eigen(cov(X))$values, 2)

## [1] 11.97  4.95  4.64  2.28  0.47  0.10

Now we reproduce the figure in the paper

ellips = function(covmat, mu, quant=0.95, npoints = 120)
{ # computes points of the ellipse t(X-mu)%*%covmat%*%(X-mu) = c
  # with c = qchisq(quant,df=2)
  if (!all(dim(covmat) == c(2, 2))) stop("covmat is not 2 by 2")
  eig = eigen(covmat)
  U = eig$vectors
  R = U %*% diag(sqrt(eig$values)) %*% t(U) # square root of covmat
  angles = seq(0, 2*pi, length = npoints+1)
  xy = cbind(cos(angles),sin(angles)) # points on the unit circle
  fac = sqrt(qchisq(quant, df=2))
  scale(fac*xy%*%R, center = -mu, scale=FALSE)
}  


n = nrow(X)
j = 3; k = 6 # to plot variables t3 and t6
xy = X[,c(j,k)]
cov2cor(cov(X)[c(j,k),c(j,k)]) # unweighted correlation is 0.10

##            t3         t6
## t3 1.00000000 0.09896998
## t6 0.09896998 1.00000000

cov2cor(fit$cwMLEsigma[c(j,k),c(j,k)]) # now correlation is 0.31

##           t3        t6
## t3 1.0000000 0.3077506
## t6 0.3077506 1.0000000

(wxy = W[,c(j,k)])

##         t3   t6
##  [1,] 0.50 0.29
##  [2,] 1.00 0.50
##  [3,] 1.00 0.29
##  [4,] 1.00 0.29
##  [5,] 0.58 0.29
##  [6,] 0.29 0.29
##  [7,] 0.50 0.58
##  [8,] 1.00 1.00
##  [9,] 0.58 0.29
## [10,] 0.29 0.50

duplicated(xy) # ties: row 4 equals row 3, and row 9 equals row 5

##  [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE

wxy[3,] = wxy[3,] + wxy[4,] # add cell weights of rows 3 and 4
wxy[5,] = wxy[5,] + wxy[9,] # add cell weights of rows 5 and 9
(wxy = wxy[-c(4,9),]) # remove duplicate rows

##        t3   t6
## [1,] 0.50 0.29
## [2,] 1.00 0.50
## [3,] 2.00 0.58
## [4,] 1.16 0.58
## [5,] 0.29 0.29
## [6,] 0.50 0.58
## [7,] 1.00 1.00
## [8,] 0.29 0.50

(xy = xy[-c(4,9),]) # remove duplicate rows

##        t3   t6
## [1,]  7.0  5.0
## [2,] 10.0  7.0
## [3,] 10.0  5.0
## [4,]  8.5  5.0
## [5,]  5.0  5.0
## [6,]  7.0  8.5
## [7,] 10.0 10.0
## [8,]  5.0  7.0

# pdf("personality_cwMLE_cwCov.pdf",width=5.5,height=5.5)
myxlim = c(2,14); myylim = c(1,13)
plot(xy, pch=16, col="white", xlim=myxlim, ylim=myylim,
     xlab="",ylab="")
fac = 0.3 # for the size of the lines representing the cell weights
for(i in seq_len(nrow(xy))){
  WY = c(xy[i,1] - fac*wxy[i,1],xy[i,2]) # (WestX, Y) 
  EY = c(xy[i,1] + fac*wxy[i,1],xy[i,2]) # (EastX, Y) 
  XS = c(xy[i,1],xy[i,2] - fac*wxy[i,2]) # (X, SouthY)
  XN = c(xy[i,1],xy[i,2] + fac*wxy[i,2]) # (X, NorthY)
  lines(rbind(WY,EY),lwd=3)
  lines(rbind(XS,XN),lwd=3)
}
title(main="tolerance ellipses with and without cell weights",
      line=0.8,cex.main=1) # 1.2)
title(xlab="trait 3",line=2.1,cex.lab=1.0)
title(ylab="trait 6",line=2.1,cex.lab=1.0)
center1 = colMeans(X[,c(j,k)])
covmat1 = (n-1)*cov(X[,c(j,k)])/n                   
ell1 = ellips(covmat1, center1)
lines(ell1,lwd=1.5,col="red") # ellipse from unweighted covariance
fit2 = cwLocScat(xy,wxy)
center2 = fit2$cwMLEmu
covmat2 = fit2$cwMLEsigma
ell2 = ellips(covmat2, center2) # ellipse from cwMLE estimates
lines(ell2,lwd=1.5,col="blue")
center3 = fit2$cwMean
covmat3 = fit2$cwCov
ell3 = ellips(covmat3, center3) # ellipse from cwMean and cwCov
lines(ell3,lwd=1.5,lty=2,col="blue")
legend("topleft",c("cwMLE","cwCov","MLE"),
       col=c("blue","blue","red"), lty=c(1,2,1), lwd=1.5, cex=1)

plot of chunk unnamed-chunk-8

# dev.off()

# The blue ellipses, that use the cell weights, are a bit
# higher and more slanted than the red ellipse that doesn't,
# mainly due to the high cell weight of the y-coordinate of 
# the point in the top right corner.

cellwise weights examples

Rousseeuw, P.J.

2023-10-25

Introduction

Unpack the toy example in section 2 of the paper

Playing with the function cwLocScat

Personality traits example from section 4