data(neuroblastoma, package="neuroblastoma")
library(data.table)
nb.dt <- data.table(neuroblastoma[["profiles"]])
nb.dt[, data.i := rank(position), keyby=.(profile.id, chromosome)]
(pro.dt <- nb.dt[profile.id=="4" & chromosome %in% paste(1:10)])
#> Key: <profile.id, chromosome>
#> profile.id chromosome position logratio data.i
#> <fctr> <fctr> <int> <num> <num>
#> 1: 4 1 809681 -0.5777670 1
#> 2: 4 1 928433 -0.4500844 2
#> 3: 4 1 987423 -0.6170561 3
#> 4: 4 1 1083595 -0.8059129 4
#> 5: 4 1 1392490 -0.6826959 5
#> ---
#> 1797: 4 10 129101212 -0.2933589 129
#> 1798: 4 10 129523531 -0.2378638 130
#> 1799: 4 10 131219243 -0.2042331 131
#> 1800: 4 10 132561724 -0.2567005 132
#> 1801: 4 10 135221961 -0.1297339 133
library(ggplot2)
ggplot()+
facet_grid(. ~ chromosome, scales="free", space="free")+
scale_x_continuous(
"Position/index in data sequence",
breaks=seq(0, 1000, by=100))+
scale_y_continuous(
"logratio (data values)")+
geom_point(aes(
data.i, logratio),
data=pro.dt)
In the data set plotted above we have ten data sequences (panels from left to right), for which we can learn common HMM parameters.
pro.dt[, row := 1:.N]
all.y.vec <- pro.dt[["logratio"]]
data.list <- split(pro.dt, paste(pro.dt[["chromosome"]]))
first.row.vec <- pro.dt[data.i==1, row]
last.row.vec <- pro.dt[, .SD[data.i==.N], by=chromosome][["row"]]
n.states <- 4
log.A.mat <- log(matrix(1/n.states, n.states, n.states))
set.seed(1)
mean.vec <- rnorm(n.states)
sd.param <- 1
log.pi.vec <- log(rep(1/n.states, n.states))
Parameter initializations above. Below we initialize a matrix/array for gamma/xi which has an index sized according to the full data set (total number of observations across all ten sequences).
all.log.gamma.mat <- matrix(NA, nrow(pro.dt), n.states)
all.log.xi.array <- array(NA, c(n.states, n.states, nrow(pro.dt)))
for(chrom.i in seq_along(data.list)){
one.chrom <- data.list[[chrom.i]]
row.vec <- one.chrom[["row"]]
y.vec <- one.chrom[["logratio"]]
N.data <- length(y.vec)
log.emission.mat <- dnorm(
matrix(y.vec, N.data, n.states, byrow=FALSE),
matrix(mean.vec, N.data, n.states, byrow=TRUE),
sd.param,
log=TRUE)
fwd.list <- plotHMM::forward_interface(
log.emission.mat, log.A.mat, log.pi.vec)
log.alpha.mat <- fwd.list[["log_alpha"]]
log.beta.mat <- plotHMM::backward_interface(log.emission.mat, log.A.mat)
all.log.gamma.mat[row.vec, ] <- plotHMM::multiply_interface(
log.alpha.mat, log.beta.mat)
all.log.xi.array[,, row.vec[-N.data] ] <- plotHMM::pairwise_interface(
log.emission.mat, log.A.mat, log.alpha.mat, log.beta.mat)
}
The code above has a for loop over the ten data sequences. At the end
of each iteration row.vec
is used to define the indices (specific to
each sequence) where the results of multiply
and pairwise
are
stored.
(new.log.pi.vec <- apply(
all.log.gamma.mat[first.row.vec,]-log(length(first.row.vec)),
2, plotHMM::logsumexp))
#> [1] -1.187884 -1.101956 -1.307548 -2.381292
Above we compute the new log initial state probabilities, using
first.row.vec
to define the indices of the first data points in each
sequence.
prob.mat <- exp(all.log.gamma.mat)
(new.mean.vec <- colSums(all.y.vec*prob.mat)/colSums(prob.mat))
#> [1] -0.10342880 -0.05626308 -0.11673591 0.01231217
resid.mat <- all.y.vec-matrix(
new.mean.vec, length(all.y.vec), n.states, byrow=TRUE)
var.est <- sum(prob.mat * resid.mat^2) / sum(prob.mat)
(new.sd.param <- sqrt(var.est))
#> [1] 0.2419531
Above we compute the new emission (mean and sd) parameters.
new.log.A.mat <- plotHMM::transition_interface(
all.log.gamma.mat[-last.row.vec,],
all.log.xi.array[,, -last.row.vec])
Finally we compute a new log transition probability matrix, using
last.row.vec
to exclude the last data point in each sequence.