I'm doing some optimization in R and in connection with that I need to write a function that returns a jacobian. It's a very simple jacobian -- just zeros and ones -- but I'd like to populate it quickly and cleanly. My current code works but is very sloppy.
I have a four-dimensional array of probabilities. Index the dimensions by i, j, k, l. My constraint is that, for each i, j, k, the sum of probabilities over index l must equal 1.
I compute my constraint vector like this:
get_prob_array_from_vector <- function(prob_vector, array_dim) {
    return(array(prob_vector, array_dim))
}

constraint_function <- function(prob_vector, array_dim) {
    prob_array <- get_prob_array_from_vector(prob_vector, array_dim)
    prob_array_sums <- apply(prob_array, MARGIN=c(1, 2, 3), FUN=sum)
    return(as.vector(prob_array_sums) - 1) # Should equal zero
}
My question is: what is a clean, fast way of computing the jacobian of as.vector(apply(array(my_input_vector, array_dim), MARGIN=c(1, 2, 3), FUN=sum)) -- i.e., my constraint_function in the code above -- with respect to my_input_vector?
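To spell out the derivative being asked for: writing the constraint components as c_ijk(p) = sum_l p[i,j,k,l] - 1, the partial derivative of c_ijk with respect to p[i',j',k',l'] is 1 when (i,j,k) == (i',j',k') and 0 otherwise. So every column of the Jacobian contains exactly one 1, which is what the stopifnot checks in my sloppy solution below assert.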
Here is my sloppy solution (which I check for correctness against the jacobian function from the numDeriv package):
library(numDeriv)
array_dim <- c(5, 4, 3, 3)
get_prob_array_from_vector <- function(prob_vector, array_dim) {
    return(array(prob_vector, array_dim))
}

constraint_function <- function(prob_vector, array_dim) {
    prob_array <- get_prob_array_from_vector(prob_vector, array_dim)
    prob_array_sums <- apply(prob_array, MARGIN=c(1, 2, 3), FUN=sum)
    return(as.vector(prob_array_sums) - 1)
}
constraint_function_jacobian <- function(prob_vector, array_dim) {
    prob_array <- get_prob_array_from_vector(prob_vector, array_dim)
    jacobian <- matrix(0, Reduce("*", dim(prob_array)[1:3]), length(prob_vector))
    ## Must be a faster, cleaner way of populating jacobian
    for(i in seq_along(prob_vector)) {
        dummy_vector <- rep(0, length(prob_vector))
        dummy_vector[i] <- 1
        dummy_array <- get_prob_array_from_vector(dummy_vector, array_dim)
        dummy_array_sums <- apply(dummy_array, MARGIN=c(1, 2, 3), FUN=sum)
        jacobian_row_idx <- which(dummy_array_sums != 0, arr.ind=FALSE)
        stopifnot(length(jacobian_row_idx) == 1)
        jacobian[jacobian_row_idx, i] <- 1
    } # Is there a fast, readable one-liner that does the same as this for loop?
    stopifnot(sum(jacobian) == length(prob_vector))
    stopifnot(all(jacobian == 0 | jacobian == 1))
    return(jacobian)
}
## Example of a probability array satisfying my constraint
my_prob_array <- array(0, array_dim)
for(i in seq_len(array_dim[1])) {
    for(j in seq_len(array_dim[2])) {
        my_prob_array[i, j, , ] <- diag(array_dim[3])
    }
}
my_prob_array[1, 1, , ] <- 1 / array_dim[3]
my_prob_array[2, 1, , ] <- 0.25 * (1 / array_dim[3]) + 0.75 * diag(array_dim[3])
my_prob_vector <- as.vector(my_prob_array) # Flattened representation of my_prob_array
should_be_zero_vector <- constraint_function(my_prob_vector, array_dim)
is.vector(should_be_zero_vector)
all(should_be_zero_vector == 0) # Constraint is satisfied
## Check constraint_function_jacobian for correctness using numDeriv
jacobian_analytical <- constraint_function_jacobian(my_prob_vector, array_dim)
jacobian_numerical <- jacobian(constraint_function, my_prob_vector, array_dim=array_dim)
max(abs(jacobian_analytical - jacobian_numerical)) # Very small
My functions take prob_vector as input -- i.e., a flattened representation of my probability array -- because optimization functions require vector arguments.
It took me some time to understand what you were trying to do, but here is a proposed replacement for your constraint_function_jacobian:
enhanced <- function(prob_vector, array_dim) {
    firstdim <- Reduce("*", array_dim[1:3])
    seconddim <- length(prob_vector)
    jacobian <- matrix(0, firstdim, seconddim)
    idxs <- split(1:seconddim, cut(1:seconddim, array_dim[4], labels=FALSE))
    for (i in seq_along(idxs)) {
        diag(jacobian[, idxs[[i]] ]) <- 1
    }
    stopifnot(sum(jacobian) == length(prob_vector))
    stopifnot(all(jacobian == 0 | jacobian == 1))
    jacobian
}
Unless I'm mistaken, the Jacobian construction amounts to filling diagonals with 1; since the matrix is not square, we split its columns into array_dim[4] square blocks and fill each block's diagonal with 1.
I also dropped the conversion of prob_vector back to an array just to read its dim, since that is the same as array_dim; skipping this step is not a huge speedup, but it simplifies the code IMO.
The results match according to this test:
identical(constraint_function_jacobian(my_prob_vector, array_dim),
enhanced(my_prob_vector, array_dim))
# [1] TRUE
According to the benchmark it gives a great speedup:
microbenchmark::microbenchmark(
original=constraint_function_jacobian(my_prob_vector, array_dim),
enhanced=enhanced(my_prob_vector, array_dim), times=100)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# original 16946.979 18466.491 20150.304 19066.7410 19671.4100 28148.035 100 b
# enhanced 678.222 737.948 799.005 796.3905 834.5925 1141.773 100 a
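As a side note: since the Jacobian is just array_dim[4] identity blocks of size prod(array_dim[1:3]) placed side by side, a kronecker one-liner is another option. This is my own sketch and was not part of the benchmark above:

one_liner_jacobian <- function(prob_vector, array_dim) {
    ## [I | I | ... | I]: one identity block per level of the fourth dimension
    kronecker(matrix(1, 1, array_dim[4]), diag(prod(array_dim[1:3])))
}
identical(constraint_function_jacobian(my_prob_vector, array_dim),
          one_liner_jacobian(my_prob_vector, array_dim))
# Expected to be TRUE as well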
I have the following objects:
A: an array with x, y, z dimensions -> containing a variable (temperature)
B & C: 2 arrays with x, y dimensions -> containing indices along A's z dimension
A <- array(rnorm(n = 12*4*3*5), dim = c(4,3,5))
dimnames(A) <- list("x" = c(1:4), "y" = c(1:3), "z" = c(1:5))
B <- matrix(rep(c(2:1), 6), nrow = 4)
dimnames(B) <- list("x" = c(1:4), "y" = c(1:3))
C <- matrix(rep(c(4:5), 6), nrow = 4)
dimnames(C) <- list("x" = c(1:4), "y" = c(1:3))
I'm looking for a way to sum A across the z dimension, but only between the indices indicated by B and C.
If instead of the 3D-array I had a vector I would solve it like this:
> A <- round(c(rnorm(5)), 1)
> B <- 2 #index of first value to sum
> C <- 4 #index of last value to sum
> vindex <- seq(B,C,1)
> A
[1] 0.0 -0.9 -1.1 -1.7 -0.4
> vindex
[1] 2 3 4
> sum(A[vindex])
[1] -3.7
>
# or better with a function
> foo <- function(x, start_idx, end_idx) {
+ vidx <- seq(start_idx, end_idx, 1)
+ return(sum(x[vidx]))
+ }
>
> foo(A,B,C)
[1] -3.7
Unfortunately seq() does not accept vectors as arguments, so it is not straightforward to use this function with apply. If, as before, A is A[x,y,z] and B and C are [x,y]:
> apply(A,c(1,2),foo,B,C)
Error in seq.default(start_idx, end_idx, 1) : 'from' must be of length 1
Called from: seq.default(start_idx, end_idx, 1)
It would be great if anybody knew how to make this function workable with apply or with other clean solutions.
Thanks a lot!
This is not a very pleasant task for base R, and in the absence of a package that already does this (?), I would prefer to implement it in C++.
Logically speaking, a plain but vectorized solution to your problem could be structured as:
# initialize index array
D <- array(
    1,
    dim = c(4,3,5),
    dimnames = list(x = letters[1:4], y = letters[1:3], z = letters[1:5])
)
# set indices out of bounds to zero
E <- rep(1:5, each = 4*3)
BB <- rep(B, times = 5)
D[E < BB] <- 0
CC <- rep(C, times = 5)
D[E > CC] <- 0
# multiply with index array and sum
apply(A * D, c(1,2), sum)
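If the index-mask approach feels indirect, here is a second sketch of my own (assuming B[x, y] <= C[x, y] everywhere, as in the example data) that visits each x-y cell with mapply and sums the slice directly:

# Sum A[x, y, B[x,y]:C[x,y]] cell by cell; expand.grid varies x fastest,
# which matches column-major matrix filling
cells <- expand.grid(x = seq_len(dim(A)[1]), y = seq_len(dim(A)[2]))
res <- matrix(
    mapply(function(i, j) sum(A[i, j, B[i, j]:C[i, j]]), cells$x, cells$y),
    nrow = dim(A)[1],
    dimnames = dimnames(B)
)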
I have a formula that creates matrices. Later, with every single matrix of the set, I have to do some time-consuming stuff. So far, I'm bundling these matrices into a list with lapply(). Now, I assume operating with an array of matrices would be much faster. The thing is, I don't know how to generate the matrices directly into an array, the way I can with lapply().
I give you this example:
# matrix generating function
mxSim <- function(X, n) {
    mx = matrix(NA, nrow = n, ncol = 3,
                dimnames = list(NULL, c("d", "alpha", "beta")))
    mx[,1] = rbinom(n, 1, .375)
    mx[,2] = rnorm(n, 0, 2)
    mx[,3] = .42 * rnorm(n, 0, 6)
    return(mx)
}
# bundle matrices together
mx.lst <- lapply(1:1e1, mxSim, n = 1e4)
# some stuff to be done after, like e. g.:
lapply(mx.lst, function(m) lm(d ~ alpha + beta, as.data.frame(m)))
Could anybody give me some advice on how to do this with an array?
I've been looking at this answer, but there the matrices already have to be generated, and the only way I could make it work was by putting them into a list first again.
Enough with the hooha. Let's time it.
library(microbenchmark)
# matrix generating function
mxSim <- function(X, n) {
    mx = matrix(NA, nrow = n, ncol = 3,
                dimnames = list(NULL, c("d", "alpha", "beta")))
    mx[,1] = rbinom(n, 1, .375)
    mx[,2] = rnorm(n, 0, 2)
    mx[,3] = .42 * rnorm(n, 0, 6)
    return(mx)
}
# bundle matrices together
mx.lst <- lapply(1:1e1, mxSim, n = 1e4)
mx.array <- array(mx.lst,dim=c(2,5))
# some stuff to be done after, like e. g.:
#Timing...
some.fnc<-function(m)lm(d ~ alpha + beta, as.data.frame(m))
list.test<-microbenchmark(lapply(mx.lst, some.fnc))
array.test<-microbenchmark(apply(mx.array, MARGIN=c(1,2), some.fnc))
expr min lq mean median uq max neval
lapply: 74.8953 101.9424 173.8733 146.7186 234.7577 397.2494 100
apply: 77.2362 101.0338 174.4178 137.153 264.6854 418.7297 100
Naively applying a function over a list, as opposed to an array, gives almost identical performance.
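As a side note on the original question: array(mx.lst, dim = c(2, 5)) produces a 2 x 5 list-array whose cells hold the matrices, not a numeric 3-D array. If a true n x 3 x 10 numeric array is wanted, a sketch like the following (my addition, not benchmarked here) builds it directly:

# Generate the simulations straight into a 10000 x 3 x 10 numeric array
mx.arr <- sapply(1:1e1, mxSim, n = 1e4, simplify = "array")
dim(mx.arr)  # should be 10000 3 10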
For the sake of completeness, I made some further benchmarks with 1e3 matrices, as suggested in the comment on @SeldomSeenSlim's answer. In addition I did it with a list of data.frames, and this was a bit surprising.
Here is the function for the data.frames; for the matrix function see above.
dfSim <- function(X, n) {
    d <- rbinom(n, 1, .375)
    alpha <- rnorm(n, 0, 2)
    beta <- .42 * rnorm(n, 0, 6)
    data.frame(d, alpha, beta)
}
Bundling
mx.lst <- lapply(1:1e3, mxSim, n = 1e4)
mx.array <- array(mx.lst, dim = c(2, 500))
df.lst <- lapply(1:1e3, dfSim, n = 1e4)
And the microbenchmarks:
some.fnc <- function(m) lm(d ~ alpha + beta, as.data.frame(m))
list.test <- microbenchmark(lapply(mx.lst, some.fnc))
array.test <- microbenchmark(apply(mx.array, MARGIN = c(1, 2), some.fnc))
df.list.test <- microbenchmark(lapply(df.lst, some.fnc))
Results
Unit: seconds
expr min lq mean median uq max neval
lapply 9.658568 9.742613 9.831577 9.784711 9.911466 10.30035 100
apply 9.727057 9.951213 9.994986 10.00614 10.06847 10.22178 100
lapply(df) 9.121293 9.229912 9.286592 9.277967 9.327829 10.12548 100
Now, what does this tell us? Even with 1e3 matrices the three approaches are nearly identical; the model fitting dominates the run time, and the list of data.frames is marginally faster, presumably because as.data.frame(m) has essentially nothing to do there.
But, okay, as a bold sidenote:
microbenchmark((lapply(1:1e3, mxSim, n = 1e4)), (lapply(1:1e3, dfSim, n = 1e4)))
expr min lq mean median uq max neval cld
(lapply(mxSim)) 2.533466 2.551199 2.563864 2.555421 2.559234 2.693383 100 a
(lapply(dfSim)) 2.676869 2.695826 2.718454 2.701161 2.706249 3.293431 100 b
I am trying to implement the Metropolis-Hastings algorithm for a simple linear regression in C (without use of other libraries (boost, Eigen etc.) and without two-dimensional arrays)*. For better testing of the code/evaluation of the trace plots, I have rewritten the code for R (see below) by keeping as much of the C-code as possible.
Unfortunately, the chains don't converge. I am wondering if
there is a mistake in the implementation itself?
"just" a bad choice of proposal distributions?
Assuming the latter, I am thinking about how to find good parameters of proposal distributions (currently I have picked arbitrary values) so that the algorithm works. Even with three parameters as in this case, it is quite hard to find suitable parameters. How does one normally handle this problem if say Gibbs sampling is not an alternative?
*I want to use this code with CUDA
#### posterior distribution
logPostDensity <- function(x, y, a, b, s2, N)
{
    sumSqError = 0.0
    for(i in 1:N)
    {
        sumSqError = sumSqError + (y[i] - (a + b*x[i]))^2
    }
    return(((-(N/2)+1) * log(s2)) + ((-0.5/s2) * sumSqError))
}
# x = x values
# y = actual datapoints
# N = sample size
# m = length of chain
# sigmaProp = uniform proposal for sigma squared
# paramAProp = uniform proposal for intercept
# paramBProp = uniform proposal for slope
mcmcSampling <- function(x,y,N,m,sigmaProp,paramAProp,paramBProp)
{
    paramsA = vector("numeric",length=m) # intercept
    paramsB = vector("numeric",length=m) # slope
    s2 = vector("numeric",length=m) # sigma squared
    paramsA[1] = 0
    paramsB[1] = 0
    s2[1] = 1
    for(i in 2:m)
    {
        paramsA[i] = paramsA[i-1] + runif(1,-paramAProp,paramAProp)
        if((logPostDensity(x,y,paramsA[i],paramsB[i],s2[i-1],N)
            - logPostDensity(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N))
           < log(runif(1)))
        {
            paramsA[i] = paramsA[i-1]
        }
        paramsB[i] = paramsB[i-1] + runif(1,-paramBProp,paramBProp)
        if((logPostDensity(x,y,paramsA[i],paramsB[i],s2[i-1],N)
            - logPostDensity(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N))
           < log(runif(1)))
        {
            paramsB[i] = paramsB[i-1]
        }
        s2[i] = s2[i-1] + runif(1,-sigmaProp,sigmaProp)
        if((s2[i] < 0) || (logPostDensity(x,y,paramsA[i],paramsB[i],s2[i],N)
                           - logPostDensity(x,y,paramsA[i],paramsB[i],s2[i-1],N))
           < log(runif(1)))
        {
            s2[i] = s2[i-1]
        }
    }
    res = data.frame(paramsA,paramsB,s2)
    return(res)
}
#########################################
set.seed(321)
x <- runif(100)
y <- 2 + 5*x + rnorm(100)
summary(lm(y~x))
df <- mcmcSampling(x,y,10,5000,0.05,0.05,0.05)
par(mfrow=c(3,1))
plot(df$paramsA[3000:5000],type="l",main="intercept")
plot(df$paramsB[3000:5000],type="l",main="slope")
plot(df$s2[3000:5000],type="l",main="sigma")
There was one mistake in the intercept section (paramsA). Everything else was fine. I've implemented what Alexey suggested in his comments. Here's the solution:
pow <- function(x,y)
{
    return(x^y)
}

#### posterior distribution
posteriorDistribution <- function(x, y, a, b, s2, N)
{
    sumSqError <- 0.0
    for(i in 1:N)
    {
        sumSqError <- sumSqError + pow(y[i] - (a + b*x[i]),2)
    }
    return((-((N/2)+1) * log(s2)) + ((-0.5/s2) * sumSqError))
}
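The loop over sumSqError mirrors the intended C code, which is fine for the CUDA port; purely as an R-side convenience, a sketch of the same log density without the loop (my addition, assuming x and y have at least N elements) would be:

posteriorDistributionVec <- function(x, y, a, b, s2, N)
{
    # Vectorized equivalent of the explicit loop in posteriorDistribution
    sumSqError <- sum((y[1:N] - (a + b*x[1:N]))^2)
    return((-((N/2)+1) * log(s2)) + ((-0.5/s2) * sumSqError))
}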
# x <- x values
# y <- actual datapoints
# N <- sample size
# m <- length of chain
# sigmaProposalWidth <- width of uniform proposal dist for sigma squared
# paramAProposalWidth <- width of uniform proposal dist for intercept
# paramBProposalWidth <- width of uniform proposal dist for slope
mcmcSampling <- function(x,y,N,m,sigmaProposalWidth,paramAProposalWidth,paramBProposalWidth)
{
    desiredAcc <- 0.44
    paramsA <- vector("numeric",length=m) # intercept
    paramsB <- vector("numeric",length=m) # slope
    s2 <- vector("numeric",length=m) # sigma squared
    paramsA[1] <- 0
    paramsB[1] <- 0
    s2[1] <- 1
    accATot <- 0
    accBTot <- 0
    accS2Tot <- 0
    for(i in 2:m)
    {
        paramsA[i] <- paramsA[i-1] + runif(1,-paramAProposalWidth,paramAProposalWidth)
        accA <- 1
        if((posteriorDistribution(x,y,paramsA[i],paramsB[i-1],s2[i-1],N) -
            posteriorDistribution(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N)) < log(runif(1)))
        {
            paramsA[i] <- paramsA[i-1]
            accA <- 0
        }
        accATot <- accATot + accA
        paramsB[i] <- paramsB[i-1] + runif(1,-paramBProposalWidth,paramBProposalWidth)
        accB <- 1
        if((posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i-1],N) -
            posteriorDistribution(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N)) < log(runif(1)))
        {
            paramsB[i] <- paramsB[i-1]
            accB <- 0
        }
        accBTot <- accBTot + accB
        s2[i] <- s2[i-1] + runif(1,-sigmaProposalWidth,sigmaProposalWidth)
        accS2 <- 1
        if((s2[i] < 0) || (posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i],N) -
                           posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i-1],N)) < log(runif(1)))
        {
            s2[i] <- s2[i-1]
            accS2 <- 0
        }
        accS2Tot <- accS2Tot + accS2
        if(i%%100==0)
        {
            paramAProposalWidth <- paramAProposalWidth * ((accATot/100)/desiredAcc)
            paramBProposalWidth <- paramBProposalWidth * ((accBTot/100)/desiredAcc)
            sigmaProposalWidth <- sigmaProposalWidth * ((accS2Tot/100)/desiredAcc)
            accATot <- 0
            accBTot <- 0
            accS2Tot <- 0
        }
    }
    res <- data.frame(paramsA,paramsB,s2)
    return(res)
}
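For completeness, here is a usage sketch mirroring the test harness from the question (passing N = length(x) is my assumption; the question's original call used N = 10 with 100 data points):

# Usage sketch: same simulated data as in the question, trace plots of the last 2000 draws
set.seed(321)
x <- runif(100)
y <- 2 + 5*x + rnorm(100)
df <- mcmcSampling(x, y, length(x), 5000, 0.05, 0.05, 0.05)
par(mfrow=c(3,1))
plot(df$paramsA[3000:5000], type="l", main="intercept")
plot(df$paramsB[3000:5000], type="l", main="slope")
plot(df$s2[3000:5000], type="l", main="sigma squared")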
I have the following function which takes 4 vectors. The T vector has a given length, and the 3 other vectors (pga, Sa5Hz and Sa1Hz) all share one length, which is not necessarily equal to the length of T.
The output is a matrix with length(T) rows and length(pga) columns.
My code below seems like the perfect example of what NOT to do; however, I could not figure out a way to optimize it using an apply function. Can anyone help?
designSpectrum <- function (T, pga, Sa5Hz, Sa1Hz){
    Ts <- Sa1Hz / Sa5Hz
    #By convention, if Sa5Hz is null, set Ts as 0.
    Ts[is.nan(Ts)] <- 0
    res <- matrix(NA, nrow = length(T), ncol = length(pga))
    for (i in 1:nrow(res))
    {
        for (j in 1:ncol(res))
        {
            res[i,j] <- if(T[i] <= 0) {pga[j]}
                else if (T[i] <= 0.2 * Ts[j]) {pga[j] + T[i] * (Sa5Hz[j] - pga[j]) / (0.2 * Ts[j])}
                else if (T[i] <= Ts[j]) {Sa5Hz[j]}
                else Sa1Hz[j] / T[i]
        }
    }
    return(res)
}
Instead of doing a double for loop and processing each i and j value separately, you could use the outer function to process all of them in one shot. Since you're now processing multiple i and j values simultaneously, you could switch to the vectorized ifelse statement instead of the non-vectorized if and else statements:
designSpectrum2 <- function (T, pga, Sa5Hz, Sa1Hz) {
    Ts <- Sa1Hz / Sa5Hz
    Ts[is.nan(Ts)] <- 0
    outer(1:length(T), 1:length(pga), function(i, j) {
        ifelse(T[i] <= 0, pga[j],
               ifelse(T[i] <= 0.2 * Ts[j], pga[j] + T[i] * (Sa5Hz[j] - pga[j]) / (0.2 * Ts[j]),
                      ifelse(T[i] <= Ts[j], Sa5Hz[j], Sa1Hz[j] / T[i])))
    })
}
identical(designSpectrum(T, pga, Sa5Hz, Sa1Hz), designSpectrum2(T, pga, Sa5Hz, Sa1Hz))
# [1] TRUE
Data:
T <- -1:3
pga <- 1:3
Sa5Hz <- 2:4
Sa1Hz <- 3:5
You can see the efficiency gains by testing on rather large vectors (here I'll use an output matrix with 1 million entries):
# Larger vectors
set.seed(144)
T2 <- runif(1000, -1, 3)
pga2 <- runif(1000, -1, 3)
Sa5Hz2 <- runif(1000, -1, 3)
Sa1Hz2 <- runif(1000, -1, 3)
# Runtime comparison
all.equal(designSpectrum(T2, pga2, Sa5Hz2, Sa1Hz2), designSpectrum2(T2, pga2, Sa5Hz2, Sa1Hz2))
# [1] TRUE
system.time(designSpectrum(T2, pga2, Sa5Hz2, Sa1Hz2))
# user system elapsed
# 4.038 1.011 5.042
system.time(designSpectrum2(T2, pga2, Sa5Hz2, Sa1Hz2))
# user system elapsed
# 0.517 0.138 0.652
The approach with outer is almost 8x faster.
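Another sketch of mine (not benchmarked above): you can get the same vectorization without outer by expanding the inputs into length(T) x length(pga) matrices through recycling and applying the same nested ifelse:

designSpectrum3 <- function(T, pga, Sa5Hz, Sa1Hz) {
    Ts <- Sa1Hz / Sa5Hz
    Ts[is.nan(Ts)] <- 0
    # Rows follow T; columns follow pga / Sa5Hz / Sa1Hz / Ts
    Tm  <- matrix(T,     nrow = length(T), ncol = length(pga))
    Pm  <- matrix(pga,   nrow = length(T), ncol = length(pga), byrow = TRUE)
    S5m <- matrix(Sa5Hz, nrow = length(T), ncol = length(pga), byrow = TRUE)
    S1m <- matrix(Sa1Hz, nrow = length(T), ncol = length(pga), byrow = TRUE)
    Tsm <- matrix(Ts,    nrow = length(T), ncol = length(pga), byrow = TRUE)
    ifelse(Tm <= 0, Pm,
           ifelse(Tm <= 0.2 * Tsm, Pm + Tm * (S5m - Pm) / (0.2 * Tsm),
                  ifelse(Tm <= Tsm, S5m, S1m / Tm)))
}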
SimNo <- 10
for (i in 1:SimNo){
    z1<-rnorm(1000,0,1)
    z2<-rnorm(1000,0,1)
    z3<-rnorm(1000,0,1)
    z4<-rnorm(1000,0,1)
    z5<-rnorm(1000,0,1)
    z6<-rnorm(1000,0,1)
    X<-cbind(z1,z2,z3,z4,z5,z6)
    sx<-scale(X)/sqrt(999)
    det1<-det(t(sx)%*%sx)
    detans<-do.call(rbind,lapply(1:SimNo, function(x) ifelse(det1<1,det1,0)))
}
When I run all the commands inside the loop except the last one, I get different values of the determinant, but when I run the code with the loop all at once I get the last value of the determinant repeated for every iteration.
Please help and guide me on how to handle situations like this.
Is there a short and efficient way to write this code so that each individual variable can also be accessed?
Whenever you are repeating the same operation multiple times with no varying inputs, think about using replicate. Here you can use it twice:
SimNo <- 10
det1 <- replicate(SimNo, {
    X <- replicate(6, rnorm(1000, 0, 1))
    sx <- scale(X) / sqrt(999)
    det(t(sx) %*% sx)
})
detans <- ifelse(det1 < 1, det1, 0)
Otherwise, this is what your code should have looked like with your for loop. You needed to create a vector for storing your outputs at each loop iteration:
SimNo <- 10
detans <- numeric(SimNo)
for (i in 1:SimNo) {
    z1<-rnorm(1000,0,1)
    z2<-rnorm(1000,0,1)
    z3<-rnorm(1000,0,1)
    z4<-rnorm(1000,0,1)
    z5<-rnorm(1000,0,1)
    z6<-rnorm(1000,0,1)
    X<-cbind(z1,z2,z3,z4,z5,z6)
    sx<-scale(X)/sqrt(999)
    det1<-det(t(sx)%*%sx)
    detans[i] <- ifelse(det1<1,det1,0)
}
Edit: you asked in the comments how to access X using replicate. You would have to make replicate create and store all your X matrices in a list, then use the *apply family of functions to loop through that list to finish the computations:
X <- replicate(SimNo, replicate(6, rnorm(1000, 0, 1)), simplify = FALSE)
det1 <- sapply(X, function(x) {
    sx <- scale(x) / sqrt(999)
    det(t(sx) %*% sx)
})
detans <- ifelse(det1 < 1, det1, 0)
Here, X is now a list of matrices, so you can get e.g. the matrix for the second simulation by doing X[[2]].
SimNo <- 10
matdet <- matrix(data=NA, nrow=SimNo, ncol=1, byrow=TRUE)
for (i in 1:SimNo){
    z1<-rnorm(1000,0,1)
    z2<-rnorm(1000,0,1)
    z3<-rnorm(1000,0,1)
    z4<-rnorm(1000,0,1)
    z5<-rnorm(1000,0,1)
    z6<-rnorm(1000,0,1)
    X<-cbind(z1,z2,z3,z4,z5,z6)
    sx<-scale(X)/sqrt(999)
    det1<-det(t(sx)%*%sx)
    matdet[i] <- ifelse(det1<1,det1,0) # store this iteration's result directly
}
matdet