I was deriving NDVI (Normalized Difference Vegetation Index), which is a ratio of (NIR-R)/(NIR+R) where NIR is Near-Infrared band and R is Red band. This index ranges from -1 to 1. So I wrote a pyopencl code and here is what I have done and observed.
Python code:
import pyopencl as cl
import cv2
from PIL import Image
import numpy as np
from time import time
import matplotlib.pyplot as plt
#get kernel file
def getKernel():
kernel = open('kernel.c').read()
return kernel
#return images as numpy int32 arrays
def convToArray(im_r,im_nir):
a = np.asarray(im_r).astype(np.int32)
b = np.asarray(im_nir).astype(np.int32)
return a,b
#processing part
def getDerivation(platform,device,im_r,im_nir):
#setting device
pltfrm = cl.get_platforms()[platform]
dev = pltfrm.get_devices()[device]
cntx = cl.Context([dev])
queue = cl.CommandQueue(cntx)
#get 2Darrays
r,nir = convToArray(im_r,im_nir)
#shape of array
x = r.shape[1]
mf = cl.mem_flags
bs = time()
#input images buffer
inR = cl.Buffer(cntx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=r)
inIR = cl.Buffer(cntx,mf.READ_ONLY | mf.COPY_HOST_PTR,hostbuf=nir)
#output image buffers
ndvi = cl.Buffer(cntx,mf.WRITE_ONLY,r.nbytes)
be = time()
print("Buffering time: " + str(be-bs) + " sec")
ts = time()
#load kernel
task = cl.Program(cntx,getKernel()%(x)).build()
#execute the process
#create empty buffer to store result
Vout = np.empty_like(r)
#dump output buffers to empty arrays
te = time()
#convert arrays to gray - image compatible formate
NDVI = Vout.astype(np.uint8)
print("Processing time: " + str(te - ts) + " On: " + + " --> " +
return NDVI
def process(platform,device,im_r,im_nir):
NDVI,NDBI,NDWI = getDerivation(platform,device,im_g,im_r,im_nir,im_swir)
if __name__ == '__main__':
R = cv2.imread("BAND3.jpg",0)
NIR = cv2.imread("BAND4.jpg",0)
print(R.dtype) #returns uint8
process(0,0,R,NIR) #(0,0) is my intel gpu
kernel code(C):
__kernel void derive(__global int* inR,__global int* inIR,__global int* ndvi){
int x = get_global_id(0);
int y = get_global_id(1);
int width = %d;
int index = x + y*width;
//ndvi ratio (-1 to 1)
int a = ((inIR[index] - inR[index])/(inIR[index] + inR[index])) * (256);
a = (a < (0) ? (-1*a) : (a));
a = (a > (255) ? (255) : (a));
ndvi[index] = (a);
input image R:
input image NIR:
both the images have bit depth of 8
BUT I GET JUST A BLANK IMAGE. I wrote the result on the command line for debugging reasons initially,
command line output:
(1151, 1151)
Buffering time: 0.015959739685058594 sec
Processing time: 0.22115755081176758 On: Intel(R) OpenCL --> Intel(R) HD Graphics 520
[[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]]
Now what i think is i may not be using proper datatype for the images? also, in the kernel the line ((inIR[index] - inR[index])/(inIR[index] + inR[index])) will gives a float value, which i multiply with 256 to get a pixel value for that respective float value. So is it there the problem? Does any one know where i am going wrong?
Help is much appreciated!
Okay ... i got it. I just changed the datatype in the line a = np.asarray(im_r).astype(np.int32) in the function convToArray() to float32 and in the kernel file, i changed the parameter type to float and added int a = (int)((((float)(inIR[index] - inR[index])/(float)(inIR[index] + inR[index]))+1)*127.5); for the calculation. However, i need an explaination, why this worked and not the other way... I probably can think like, the result what we get after this calculation, int type loses data while conversion from it?
I am trying to change Karpathy's code so that it works with softmax function so that I can use it for game with more than 2 actions. However, I cannot get it to work. Can someone help point me to the right direction please? Thanks. Below is my attempt.
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import cPickle as pickle
import gym
# hyperparameters
H = 100 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.9 # discount factor for reward
decay_rate = 0.9 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False
num_action = 2
# model initialization
D = 6 # input dimensionality: 80x80 grid
if resume:
model = pickle.load(open('save.p', 'rb'))
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(num_action, H) / np.sqrt(H)
grad_buffer = { k : np.zeros_like(v) for k,v in model.iteritems() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.iteritems() } # rmsprop memory
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x)) # sigmoid "squashing" function to interval [0,1]
def softmax(w, t = 1.0):
e = np.exp(np.array(w) / t)
dist = e / np.sum(e)
return dist
def prepro(I):
""" prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
I = I[35:195] # crop
I = I[::2,::2,0] # downsample by factor of 2
I[I == 144] = 0 # erase background (background type 1)
I[I == 109] = 0 # erase background (background type 2)
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
return I.astype(np.float).ravel()
def discount_rewards(r):
""" take 1D float array of rewards and compute discounted reward """
discounted_r = np.zeros_like(r)
running_add = 0
for t in reversed(xrange(0, r.size)):
if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
running_add = running_add * gamma + r[t]
discounted_r[t] = running_add
return discounted_r
def policy_forward(x):
h =['W1'], x)
h[h<0] = 0 # ReLU nonlinearity
logp =['W2'], h)
p = softmax(logp)
return p, h # return probability of taking action 2, and hidden state
def policy_backward(eph, epdlogp):
""" backward pass. (eph is array of intermediate hidden states) """
# print eph.shape
# print epdlogp.shape
# print model['W2'].shape
# dW2 =, epdlogp).ravel()
# dh = np.outer(epdlogp, model['W2'])
# dh[eph <= 0] = 0 # backpro prelu
# dW1 =, epx)
# return {'W1':dW1, 'W2':dW2}
dW2 =, epdlogp).T
# print dW2.shape
dh =, model['W2'])
# print dh.shape
dh[eph <= 0] = 0 # backpro prelu
dW1 =, epx)
return {'W1':dW1, 'W2':dW2}
env = gym.make("Acrobot-v1")
observation = env.reset()
prev_x = None # used in computing the difference frame
xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0
while True:
if render: env.render()
# preprocess the observation, set input to network to be difference image
cur_x = observation
x = cur_x - prev_x if prev_x is not None else np.zeros(D)
prev_x = cur_x
# forward the policy network and sample an action from the returned probability
aprob, h = policy_forward(x)
action = np.argmax(aprob)
if action == 1:
action = 2
# action = 2 if np.random.uniform() > aprob[1] else 0
# print aprob
# action = 2 if np.random.uniform() < aprob else 3 # roll the dice!
# record various intermediates (needed later for backprop)
xs.append(x) # observation
hs.append(h) # hidden state
# if action == 0:
# y = [1,0,0]
# elif action == 1:
# y = [0,1,0]
# else:
# y = [0,0,1]
y = [1,0] if action == 0 else [0,1] # a "fake label"
dlogps.append(aprob-y) # grad that encourages the action that was taken to be taken (see if confused)
# step the environment and get new measurements
observation, reward, done, info = env.step(action)
reward_sum += reward
drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)
if done: # an episode finished
episode_number += 1
# stack together all inputs, hidden states, action gradients, and rewards for this episode
epx = np.vstack(xs)
eph = np.vstack(hs)
epdlogp = np.vstack(dlogps)
epr = np.vstack(drs)
xs,hs,dlogps,drs = [],[],[],[] # reset array memory
# compute the discounted reward backwards through time
discounted_epr = discount_rewards(epr)
# standardize the rewards to be unit normal (helps control the gradient estimator variance)
discounted_epr -= np.mean(discounted_epr)
discounted_epr /= np.std(discounted_epr)
epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
grad = policy_backward(eph, epdlogp)
for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch
# perform rmsprop parameter update every batch_size episodes
if episode_number % batch_size == 0:
for k,v in model.iteritems():
g = grad_buffer[k] # gradient
rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer
# boring book-keeping
running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)
if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
reward_sum = 0
observation = env.reset() # reset env
prev_x = None
When debugging, this code runs into a "nan" issue which I can't figure out how to fix.
I think the NaN problem that you mention in a comment is due to your Softmax function.
Softmax computes the exponential function, exp(x) which can easily exceed the range of single or double precision floats for moderate values of x. This would cause exp to return NaN.
The mathematical form of Softmax is:
s[i] = exp(x[i]) / (exp(x[0]) + exp(x[1]) + .. + exp(x[n-1]))
We can divide the numerator and denominator of this expression by an arbitrary value, say exp(a) without affecting the result.
s[i] = (exp(x[i])/exp(a)) / ((exp(x[0]) + exp(x[1]) + .. + exp(x[n-1])/exp(a)))
s[i] = exp(x[i]-a) / (exp(x[0]-a) + exp(x[1]-a) + .. + exp(x[n-1]-a))
If we let a = max(x) then all exponents will be zero or negative, so no call to exp will return NaN.
I don't use Python or numpy, but I imagine you could define softmax something like:
def softmax(w):
a = np.max(w)
e = np.exp(np.array(w) - a)
dist = e / np.sum(e)
return dist
I am trying to implement the Metropolis-Hastings algorithm for a simple linear regression in C (without use of other libraries (boost, Eigen etc.) and without two-dimensional arrays)*. For better testing of the code/evaluation of the trace plots, I have rewritten the code for R (see below) by keeping as much of the C-code as possible.
Unfortunately, the chains don't converge. I am wondering if
there is a mistake in the implementation itself?
"just" a bad choice of proposal distributions?
Assuming the latter, I am thinking about how to find good parameters of proposal distributions (currently I have picked arbitrary values) so that the algorithm works. Even with three parameters as in this case, it is quite hard to find suitable parameters. How does one normally handle this problem if say Gibbs sampling is not an alternative?
*I want to use this code for Cuda
#### posterior distribution
logPostDensity <- function(x, y, a, b, s2, N)
sumSqError = 0.0
for(i in 1:N)
sumSqError = sumSqError + (y[i] - (a + b*x[i]))^2
return(((-(N/2)+1) * log(s2)) + ((-0.5/s2) * sumSqError))
# x = x values
# y = actual datapoints
# N = sample size
# m = length of chain
# sigmaProp = uniform proposal for sigma squared
# paramAProp = uniform proposal for intercept
# paramBProp = uniform proposal for slope
mcmcSampling <- function(x,y,N,m,sigmaProp,paramAProp,paramBProp)
paramsA = vector("numeric",length=m) # intercept
paramsB = vector("numeric",length=m) # slope
s2 = vector("numeric",length=m) # sigma squared
paramsA[1] = 0
paramsB[1] = 0
s2[1] = 1
for(i in 2:m)
paramsA[i] = paramsA[i-1] + runif(1,-paramAProp,paramAProp)
- logPostDensity(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N))
< log(runif(1)))
paramsA[i] = paramsA[i-1]
paramsB[i] = paramsB[i-1] + runif(1,-paramBProp,paramBProp)
- logPostDensity(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N))
< log(runif(1)))
paramsB[i] = paramsB[i-1]
s2[i] = s2[i-1] + runif(1,-sigmaProp,sigmaProp)
if((s2[i] < 0) || (logPostDensity(x,y,paramsA[i],paramsB[i],s2[i],N)
- logPostDensity(x,y,paramsA[i],paramsB[i],s2[i-1],N))
< log(runif(1)))
s2[i] = s2[i-1]
res = data.frame(paramsA,paramsB,s2)
x <- runif(100)
y <- 2 + 5*x + rnorm(100)
df <- mcmcSampling(x,y,10,5000,0.05,0.05,0.05)
There was one mistake in the intercept section (paramsA). Everything else was fine. I've implemented what Alexey suggested in his comments. Here's the solution:
pow <- function(x,y)
#### posterior distribution
posteriorDistribution <- function(x, y, a, b,s2,N)
sumSqError <- 0.0
for(i in 1:N)
sumSqError <- sumSqError + pow(y[i] - (a + b*x[i]),2)
return((-((N/2)+1) * log(s2)) + ((-0.5/s2) * sumSqError))
# x <- x values
# y <- actual datapoints
# N <- sample size
# m <- length of chain
# sigmaProposalWidth <- width of uniform proposal dist for sigma squared
# paramAProposalWidth <- width of uniform proposal dist for intercept
# paramBProposalWidth <- width of uniform proposal dist for slope
mcmcSampling <- function(x,y,N,m,sigmaProposalWidth,paramAProposalWidth,paramBProposalWidth)
desiredAcc <- 0.44
paramsA <- vector("numeric",length=m) # intercept
paramsB <- vector("numeric",length=m) # slope
s2 <- vector("numeric",length=m) # sigma squared
paramsA[1] <- 0
paramsB[1] <- 0
s2[1] <- 1
accATot <- 0
accBTot <- 0
accS2Tot <- 0
for(i in 2:m)
paramsA[i] <- paramsA[i-1] + runif(1,-paramAProposalWidth,paramAProposalWidth)
accA <- 1
if((posteriorDistribution(x,y,paramsA[i],paramsB[i-1],s2[i-1],N) -
posteriorDistribution(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N)) < log(runif(1)))
paramsA[i] <- paramsA[i-1]
accA <- 0
accATot <- accATot + accA
paramsB[i] <- paramsB[i-1] + runif(1,-paramBProposalWidth,paramBProposalWidth)
accB <- 1
if((posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i-1],N) -
posteriorDistribution(x,y,paramsA[i-1],paramsB[i-1],s2[i-1],N)) < log(runif(1)))
paramsB[i] <- paramsB[i-1]
accB <- 0
accBTot <- accBTot + accB
s2[i] <- s2[i-1] + runif(1,-sigmaProposalWidth,sigmaProposalWidth)
accS2 <- 1
if((s2[i] < 0) || (posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i],N) -
posteriorDistribution(x,y,paramsA[i],paramsB[i],s2[i-1],N)) < log(runif(1)))
s2[i] <- s2[i-1]
accS2 <- 0
accS2Tot <- accS2Tot + accS2
paramAProposalWidth <- paramAProposalWidth * ((accATot/100)/desiredAcc)
paramBProposalWidth <- paramBProposalWidth * ((accBTot/100)/desiredAcc)
sigmaProposalWidth <- sigmaProposalWidth * ((accS2Tot/100)/desiredAcc)
accATot <- 0
accBTot <- 0
accS2Tot <- 0
res <- data.frame(paramsA,paramsB,s2)
I have the following function which takes 4 vectors. The T vector has a given length and all 3 other vectors (pga, Sa5Hz and Sa1Hz) have a given (identical but not necessarily equal to T) lenght.
The output is a matrix with length(T) rows and length(pga) columns.
My code below seems like the perfect example of what NOT to do, however, I could not figure out a way to optimize it using an apply function. Can anyone help?
designSpectrum <- function (T, pga, Sa5Hz, Sa1Hz){
Ts <- Sa1Hz / Sa5Hz
#By convention, if Sa5Hz is null, set Ts as 0.
Ts[is.nan(Ts)] <- 0
res <- matrix(NA, nrow = length(T), ncol = length(pga))
for (i in 1:nrow(res))
for (j in 1:ncol(res))
res[i,j] <- if(T[i] <= 0) {pga[j]}
else if (T[i] <= 0.2 * Ts[j]) {pga[j] + T[i] * (Sa5Hz[j] - pga[j]) / (0.2 * Ts[j])}
else if (T[i] <= Ts[j]) {Sa5Hz[j]}
else Sa1Hz[j] / T[i]
Instead of doing a double for loop and processing each i and j value separately, you could use the outer function to process all of them in one shot. Since you're now processing multiple i and j values simultaneously, you could switch to the vectorized ifelse statement instead of the non-vectorized if and else statements:
designSpectrum2 <- function (T, pga, Sa5Hz, Sa1Hz) {
Ts <- Sa1Hz / Sa5Hz
Ts[is.nan(Ts)] <- 0
outer(1:length(T), 1:length(pga), function(i, j) {
ifelse(T[i] <= 0, pga[j],
ifelse(T[i] <= 0.2 * Ts[j], pga[j] + T[i] * (Sa5Hz[j] - pga[j]) / (0.2 * Ts[j]),
ifelse(T[i] <= Ts[j], Sa5Hz[j], Sa1Hz[j] / T[i])))
identical(designSpectrum(T, pga, Sa5Hz, Sa1Hz), designSpectrum2(T, pga, Sa5Hz, Sa1Hz))
# [1] TRUE
T <- -1:3
pga <- 1:3
Sa5Hz <- 2:4
Sa1Hz <- 3:5
You can see the efficiency gains by testing on rather large vectors (here I'll use an output matrix with 1 million entries):
# Larger vectors
T2 <- runif(1000, -1, 3)
pga2 <- runif(1000, -1, 3)
Sa5Hz2 <- runif(1000, -1, 3)
Sa1Hz2 <- runif(1000, -1, 3)
# Runtime comparison
all.equal(designSpectrum(T2, pga2, Sa5Hz2, Sa1Hz2), designSpectrum2(T2, pga2, Sa5Hz2, Sa1Hz2))
# [1] TRUE
system.time(designSpectrum(T2, pga2, Sa5Hz2, Sa1Hz2))
# user system elapsed
# 4.038 1.011 5.042
system.time(designSpectrum2(T2, pga2, Sa5Hz2, Sa1Hz2))
# user system elapsed
# 0.517 0.138 0.652
The approach with outer is almost 10x faster.
I am aware of the ncdf package of R.
I am trying to plot wind speed and wind direction(4D) of WRFout netCDF file.
Variable as a function of (i,j,k,l).
New variables need to be created as the calculation shown in the code.
Looping through 4 for loops is taking too long. I understand advanced looping techniques such as plyr or tapply or mapply exist. But examples for these are only for 2D array/matrix.
Please suggest optimization package for 4D array.
ncin <- open.ncdf("")
imax <- 425 #No of grids in Longitude
jmax <- 200 #No of grids in Latitude
kmax <- 40 #Vertical layers
lmax <- 11 #Time
paiv <- atan(1.0)/45.0
#However, UUin and VVin are in staggerd grid from input file
#dim(UUin) is (426,200,40,11)
#dim(VVin) is (425,201,40,11)
UUin <- get.var.ncdf(ncin,"U")
VVin <- get.var.ncdf(ncin,"V")
#Initialize arrays in normal grid
UU <- array(0.0, c(imax,jmax,kmax,lmax))
VV <- array(0.0, c(imax,jmax,kmax,lmax))
Wspd <- array(0.0,c(imax,jmax,kmax,lmax))
Wdir <- array(0.0,c(imax,jmax,kmax,lmax))
for (l in 1:lmax) {
for (k in 1:kmax) {
for (j in 1:jmax) {
for (i in 1:imax) {
#Change U and V staggerd grid to regular grid
UU[i,j,k,l] <- 0.5*(UUin[i,j,k,l]+ UUin[i+1,j,k,l])
VV[i,j,k,l] <- 0.5*(VVin[i,j,k,l]+ VVin[i,j+1,k,l])
#Now calculate wind speed and direction in regular grid
Wspd[i,j,k,l] <- sqrt(UU[i,j,k,l]*UU[i,j,k,l] + VV[i,j,k,l]* VV[i,j,k,l])
if (UU[i,j,k,l] == 0.0)
{ # startif
if(VV[i,j,k,l] < 0.0)
Wdir[i,j,k,l] = 0.0
Wdir[i,j,k,l] = 180.0
Wdir[i,j,k,l] = atan2(VV[i,j,k,l], UU[i,j,k,l]) / paiv
if(UU[i,j,k,l] < 0.0) {
Wdir[i,j,k,l] = 90.0 - Wdir[i,j,k,l]
Wdir[i,j,k,l] = 270.0 - Wdir[i,j,k,l]
if (Wdir[i,j,k,l] < 0.0)
Wdir[i,j,k,l] = Wdir[i,j,k,l] + 360.0
if (Wdir[i,j,k,l] > 360.0)
Wdir[i,j,k,l] = Wdir[i,j,k,l] - 360.0
} #end of i
} #end of j
} #end of k
} #end of l