How to make softmax work with policy gradient? - artificial-intelligence

I am trying to change Karpathy's code so that it works with a softmax function, so that I can use it for games with more than 2 actions. However, I cannot get it to work. Can someone help point me in the right direction, please? Thanks. Below is my attempt.
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import cPickle as pickle
import gym
# hyperparameters
H = 100 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.9 # discount factor for reward
decay_rate = 0.9 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False
num_action = 2
# model initialization
D = 6 # input dimensionality: 80x80 grid
if resume:
model = pickle.load(open('save.p', 'rb'))
else:
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(num_action, H) / np.sqrt(H)
grad_buffer = { k : np.zeros_like(v) for k,v in model.iteritems() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.iteritems() } # rmsprop memory
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x)) # sigmoid "squashing" function to interval [0,1]
def softmax(w, t = 1.0):
e = np.exp(np.array(w) / t)
dist = e / np.sum(e)
return dist
def prepro(I):
""" prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
I = I[35:195] # crop
I = I[::2,::2,0] # downsample by factor of 2
I[I == 144] = 0 # erase background (background type 1)
I[I == 109] = 0 # erase background (background type 2)
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
return I.astype(np.float).ravel()
def discount_rewards(r):
""" take 1D float array of rewards and compute discounted reward """
discounted_r = np.zeros_like(r)
running_add = 0
for t in reversed(xrange(0, r.size)):
if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
running_add = running_add * gamma + r[t]
discounted_r[t] = running_add
return discounted_r
def policy_forward(x):
h = np.dot(model['W1'], x)
h[h<0] = 0 # ReLU nonlinearity
logp = np.dot(model['W2'], h)
p = softmax(logp)
return p, h # return probability of taking action 2, and hidden state
def policy_backward(eph, epdlogp):
""" backward pass. (eph is array of intermediate hidden states) """
# print eph.shape
# print epdlogp.shape
# print model['W2'].shape
# dW2 = np.dot(eph.T, epdlogp).ravel()
# dh = np.outer(epdlogp, model['W2'])
# dh[eph <= 0] = 0 # backpro prelu
# dW1 = np.dot(dh.T, epx)
# return {'W1':dW1, 'W2':dW2}
dW2 = np.dot(eph.T, epdlogp).T
# print dW2.shape
dh = np.dot(epdlogp, model['W2'])
# print dh.shape
dh[eph <= 0] = 0 # backpro prelu
dW1 = np.dot(dh.T, epx)
return {'W1':dW1, 'W2':dW2}
env = gym.make("Acrobot-v1")
observation = env.reset()
prev_x = None # used in computing the difference frame
xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0
while True:
if render: env.render()
# preprocess the observation, set input to network to be difference image
cur_x = observation
x = cur_x - prev_x if prev_x is not None else np.zeros(D)
prev_x = cur_x
# forward the policy network and sample an action from the returned probability
aprob, h = policy_forward(x)
action = np.argmax(aprob)
if action == 1:
action = 2
# action = 2 if np.random.uniform() > aprob[1] else 0
# print aprob
# action = 2 if np.random.uniform() < aprob else 3 # roll the dice!
# record various intermediates (needed later for backprop)
xs.append(x) # observation
hs.append(h) # hidden state
# if action == 0:
# y = [1,0,0]
# elif action == 1:
# y = [0,1,0]
# else:
# y = [0,0,1]
y = [1,0] if action == 0 else [0,1] # a "fake label"
dlogps.append(aprob-y) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)
# step the environment and get new measurements
observation, reward, done, info = env.step(action)
reward_sum += reward
drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)
if done: # an episode finished
episode_number += 1
# stack together all inputs, hidden states, action gradients, and rewards for this episode
epx = np.vstack(xs)
eph = np.vstack(hs)
epdlogp = np.vstack(dlogps)
epr = np.vstack(drs)
xs,hs,dlogps,drs = [],[],[],[] # reset array memory
# compute the discounted reward backwards through time
discounted_epr = discount_rewards(epr)
# standardize the rewards to be unit normal (helps control the gradient estimator variance)
discounted_epr -= np.mean(discounted_epr)
discounted_epr /= np.std(discounted_epr)
epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
grad = policy_backward(eph, epdlogp)
for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch
# perform rmsprop parameter update every batch_size episodes
if episode_number % batch_size == 0:
for k,v in model.iteritems():
g = grad_buffer[k] # gradient
rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer
# boring book-keeping
running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)
if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
reward_sum = 0
observation = env.reset() # reset env
prev_x = None
When debugging, this code runs into a NaN issue which I can't figure out how to fix.

I think the NaN problem that you mention in a comment is due to your Softmax function.
Softmax computes the exponential function, exp(x), which can easily exceed the range of single- or double-precision floats for moderate values of x. The overflowing exp calls return inf, and the subsequent normalization (inf / inf) then produces NaN.
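A quick illustration of that failure mode (my own snippet, not from the original post):
import numpy as np

x = np.array([1000.0, 0.0])   # a moderately large logit
e = np.exp(x)                 # overflows: array([inf, 1.])
print(e / np.sum(e))          # inf/inf -> array([nan, 0.]), with RuntimeWarnings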
Solution
The mathematical form of Softmax is:
s[i] = exp(x[i]) / (exp(x[0]) + exp(x[1]) + .. + exp(x[n-1]))
We can divide the numerator and denominator of this expression by an arbitrary value, say exp(a), without affecting the result.
s[i] = (exp(x[i]) / exp(a)) / ((exp(x[0]) + exp(x[1]) + .. + exp(x[n-1])) / exp(a))
s[i] = exp(x[i] - a) / (exp(x[0] - a) + exp(x[1] - a) + .. + exp(x[n-1] - a))
If we let a = max(x) then all exponents will be zero or negative, so no call to exp will return NaN.
I don't use Python or numpy, but I imagine you could define softmax something like:
def softmax(w):
  a = np.max(w)
  e = np.exp(np.array(w) - a)
  dist = e / np.sum(e)
  return dist
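Building on that, here is a sketch (my own, assuming you keep the rest of the training loop as it is) of how the stable softmax could retain the temperature parameter from your version, and how an action could be sampled from the resulting distribution rather than taken with argmax, as your commented-out "roll the dice" line suggests:
def softmax(w, t=1.0):
  w = np.array(w) / t
  e = np.exp(w - np.max(w))  # subtract the max for numerical stability
  return e / np.sum(e)

aprob, h = policy_forward(x)
action = np.random.choice(num_action, p=aprob)  # sample an action instead of argmax
y = np.zeros(num_action)
y[action] = 1  # one-hot "fake label" for the action actually taken
dlogps.append(y - aprob)  # grad of log p(action) w.r.t. the logits (note the sign vs. aprob - y)
Whichever sign convention you choose for dlogps has to be consistent with the sign of the advantage and of the parameter update; mixing the two is another common reason a policy never improves.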

Related

Quantum walk on 3D grid

I am trying to apply the quantum coin walk on a 3D grid, with 3 Hadamard coins. However, I can't seem to get symmetric results after 3 steps. Is it simply not possible to have a probability distribution which is symmetric with such a coin?
Thank you
PS: the implementation is based on http://susan-stepney.blogspot.com/2014/02/mathjax.html and the position vector captures a 3D grid.
PPS: Has this been attempted in Qiskit? I couldn't get a perfectly symmetric result from the hard-coded matrix for some reason...
Not sure I answered your question, but starting from the code you referenced, I only changed line 30 to ax = fig.add_subplot(111, projection = '3d') and line 3 to from mpl_toolkits.mplot3d import Axes3D:
from numpy import *
from matplotlib.pyplot import *
from mpl_toolkits.mplot3d import Axes3D
N = 100 # number of random steps
P = 2*N+1 # number of positions
coin0 = array([1, 0]) # |0>
coin1 = array([0, 1]) # |1>
C00 = outer(coin0, coin0) # |0><0|
C01 = outer(coin0, coin1) # |0><1|
C10 = outer(coin1, coin0) # |1><0|
C11 = outer(coin1, coin1) # |1><1|
C_hat = (C00 + C01 + C10 - C11)/sqrt(2.)
ShiftPlus = roll(eye(P), 1, axis=0)
ShiftMinus = roll(eye(P), -1, axis=0)
S_hat = kron(ShiftPlus, C00) + kron(ShiftMinus, C11)
U = S_hat.dot(kron(eye(P), C_hat))
posn0 = zeros(P)
posn0[N] = 1 # array indexing starts from 0, so index N is the central posn
psi0 = kron(posn0,(coin0+coin1*1j)/sqrt(2.))
psiN = linalg.matrix_power(U, N).dot(psi0)
prob = empty(P)
for k in range(P):
    posn = zeros(P)
    posn[k] = 1
    M_hat_k = kron( outer(posn,posn), eye(2))
    proj = M_hat_k.dot(psiN)
    prob[k] = proj.dot(proj.conjugate()).real
fig = figure()
ax = fig.add_subplot(111, projection = '3d')
plot(arange(P), prob)
plot(arange(P), prob, 'o')
loc = range(0, P, P // 10) #Location of ticks
xticks(loc)
xlim(0, P)
ax.set_xticklabels(range(-N, N+1, P // 10))
show()
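That script is still the 1D walk from the blog post. For the 3D question itself: if the coin is the product of three Hadamards (one per axis) and both the shift and the initial state factorise over the axes, the walk is just three independent 1D walks, so the 3D distribution is the outer product of three 1D distributions. A small sketch of that idea (my own construction, not from the post above; the (|0> + i|1>)/sqrt(2) coin state is what makes each 1D marginal symmetric):
import numpy as np

def walk_1d(N):
    """Probability distribution of an N-step 1D Hadamard walk started at the centre."""
    P = 2 * N + 1
    coin0, coin1 = np.array([1, 0]), np.array([0, 1])
    C_hat = np.array([[1, 1], [1, -1]]) / np.sqrt(2.)          # Hadamard coin
    S_hat = (np.kron(np.roll(np.eye(P), 1, axis=0), np.outer(coin0, coin0)) +
             np.kron(np.roll(np.eye(P), -1, axis=0), np.outer(coin1, coin1)))
    U = S_hat.dot(np.kron(np.eye(P), C_hat))
    posn0 = np.zeros(P)
    posn0[N] = 1
    psi0 = np.kron(posn0, (coin0 + 1j * coin1) / np.sqrt(2.))  # symmetric initial coin state
    psiN = np.linalg.matrix_power(U, N).dot(psi0)
    amp = psiN.reshape(P, 2)                                   # rows: position, cols: coin
    return (np.abs(amp) ** 2).sum(axis=1)

p = walk_1d(3)                                         # 3 steps, as in the question
prob3d = np.einsum('i,j,k->ijk', p, p, p)              # separable 3D distribution
print(np.allclose(prob3d, prob3d[::-1, ::-1, ::-1]))   # True: symmetric under inversion
If your shift entangles the axes, or the coin is a genuinely 8-dimensional operator that is not H x H x H, the walk no longer factorises and the symmetry of the distribution depends on the coin and the initial coin state you pick.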

1D-Coupled Transient Diffusion in FiPY with Reactive Boundary Condition

I would like to solve the transient diffusion equation for two compounds, A and B, as shown in the image. I think the image is the clearest way to show my problem.
Diffusion equations and boundary conditions.
As you can see, the reaction only occurs at the surface and the flux of A is equal to the flux of B, so the two equations are coupled only at the surface. The boundary condition is similar to the Robin boundary condition explained in the FiPy manual; the main difference is that a second variable appears in the boundary condition. Does anybody have an idea how to formulate this boundary condition in FiPy?
I guess I need to add some extra term to the Robin boundary condition, but I couldn't figure it out.
I really appreciate your help.
This is the code which solves the mentioned equations with a Robin boundary condition at x = 0:
-D (dC_A/dx) = -k C_A
-D (dC_B/dx) = -k C_B
With this condition I can easily use the Robin boundary condition to solve the equations, and the results seem reasonable.
"""
Question for StackOverflow
"""
#%%
from fipy import Variable, FaceVariable, CellVariable, Grid1D, TransientTerm, DiffusionTerm, Viewer, ImplicitSourceTerm
from fipy.tools import numerix
#%%
##### Model parameters
L= 8.4853e-4 # m boundary layer thickness
dx= 1e-8 # mesh size
nx = int(L/dx)+1 # number of meshes
D = 1e-9 # m^2/s diffusion coefficient
k = 1e-4 # m/s reaction coefficient R = k [c_A],
c_inf = 0. # ROBIN general condition, once can think R = k ([c_A]-[c_inf])
c_init = 1. # Initial concentration of compound A, mol/m^3
#%%
###### Meshing and variable definition
mesh = Grid1D(nx=nx, dx=dx)
c_A = CellVariable(name="c_A", hasOld=True,
                   mesh=mesh,
                   value=c_init)
c_B = CellVariable(name="c_B", hasOld=True,
                   mesh=mesh,
                   value=0.)
#%%
##### Right boundary condition
valueRight = c_init
c_A.constrain(valueRight, mesh.facesRight)
c_B.constrain(0., mesh.facesRight)
#%%
### ROBIN BC requirements, defining cellDistanceVectors
## This code is for fixing celldistance via this link:
## https://stackoverflow.com/questions/60073399/fipy-problem-with-grid2d-celltofacedistancevectors-gives-error-uniformgrid2d
MA = numerix.MA
tmp = MA.repeat(mesh._faceCenters[..., numerix.NewAxis,:], 2, 1)
cellToFaceDistanceVectors = tmp - numerix.take(mesh._cellCenters, mesh.faceCellIDs, axis=1)
tmp = numerix.take(mesh._cellCenters, mesh.faceCellIDs, axis=1)
tmp = tmp[..., 1,:] - tmp[..., 0,:]
cellDistanceVectors = MA.filled(MA.where(MA.getmaskarray(tmp), cellToFaceDistanceVectors[:, 0], tmp))
#%%
##### Defining mask and Robin BC at left boundary
mask = mesh.facesLeft
Gamma0 = D
Gamma = FaceVariable(mesh=mesh, value=Gamma0)
Gamma.setValue(0., where=mask)
dPf = FaceVariable(mesh=mesh,
                   value=mesh._faceToCellDistanceRatio * cellDistanceVectors)
n = mesh.faceNormals
a = FaceVariable(mesh=mesh, value=k, rank=1)
b = FaceVariable(mesh=mesh, value=D, rank=0)
g = FaceVariable(mesh=mesh, value= k * c_inf, rank=0)
RobinCoeff = (mask * Gamma0 * n / (-dPf.dot(a)+b))
#%%
#### Making a plot
viewer = Viewer(vars=(c_A, c_B),
                datamin=-0.2, datamax=c_init * 1.4)
viewer.plot()
#%% Time step and simulation time definition
time = Variable()
t_simulation = 4 # seconds
timeStepDuration = .05
steps = int(t_simulation/timeStepDuration)
#%% PDE Equations
eqcA = (TransientTerm(var=c_A) == DiffusionTerm(var=c_A, coeff=Gamma)
        + (RobinCoeff * g).divergence
        - ImplicitSourceTerm(var=c_A, coeff=(RobinCoeff * a.dot(-n)).divergence))
eqcB = (TransientTerm(var=c_B) == DiffusionTerm(var=c_B, coeff=Gamma)
        - (RobinCoeff * g).divergence
        + ImplicitSourceTerm(var=c_B, coeff=(RobinCoeff * a.dot(-n)).divergence))
#%% A loop for solving PDE equations
while time() <= t_simulation:
    time.setValue(time() + timeStepDuration)
    c_B.updateOld()
    c_A.updateOld()
    res1 = res2 = 1e10
    viewer.plot()
    while (res1 > 1e-6) & (res2 > 1e-6):
        res1 = eqcA.sweep(var=c_A, dt=timeStepDuration)
        res2 = eqcB.sweep(var=c_B, dt=timeStepDuration)
It's possible to solve this as a fully implicit system. The code below simplifies the problem to have a unity domain size and diffusion coefficient. k is set to 0.2. It captures the analytical solution quite well with some caveats (see below).
from fipy import (
    CellVariable,
    TransientTerm,
    DiffusionTerm,
    ImplicitSourceTerm,
    Grid1D,
    Viewer,
)
L = 1.0
nx = 1000
dx = L / nx
konstant = 0.2
coeff = 1.0
mesh = Grid1D(nx=nx, dx=dx)
var_a = CellVariable(mesh=mesh, value=1.0, hasOld=True)
var_b = CellVariable(mesh=mesh, value=0.0, hasOld=True)
var_a.constrain(1.0, mesh.facesRight)
var_b.constrain(0.0, mesh.facesRight)
coeff_mask = ~mesh.facesLeft * coeff
boundary_coeff = konstant * (mesh.facesLeft * mesh.faceNormals).divergence
eqn_a = TransientTerm(var=var_a) == DiffusionTerm(
    coeff_mask, var=var_a
) - ImplicitSourceTerm(boundary_coeff, var=var_a) + ImplicitSourceTerm(
    boundary_coeff, var=var_b
)
eqn_b = TransientTerm(var=var_b) == DiffusionTerm(
    coeff_mask, var=var_b
) - ImplicitSourceTerm(boundary_coeff, var=var_b) + ImplicitSourceTerm(
    boundary_coeff, var=var_a
)
eqn = eqn_a & eqn_b
for _ in range(5):
    var_a.updateOld()
    var_b.updateOld()
    eqn.sweep(dt=1e10)
Viewer((var_a, var_b)).plot()
print("var_a[0] (expected):", (1 + konstant) / (1 + 2 * konstant))
print("var_b[0] (expected):", konstant / (1 + 2 * konstant))
print("var_a[0] (actual):", var_a[0])
print("var_b[0] (actual):", var_b[0])
input("wait")
Note the following:
As written, the boundary condition is only first-order accurate, which doesn't really matter for this problem but might hurt you in higher dimensions. There might be ways to fix this, such as using a small cell near the boundary or adding an explicit second-order correction for the boundary condition.
The equations are coupled here. If they were uncoupled, it would probably require loads of iterations to reach equilibrium.
It did require a few iterations to reach equilibrium, but it shouldn't. That's probably due to the solver not converging adequately without a few tries; it might be that the coupled equations are somewhat badly conditioned.
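If you need the transient behaviour from the original question rather than the steady state, the same coupled system can be stepped with a finite dt and a couple of sweeps per step. This is my own extension of the snippet above, with the step size and counts picked arbitrarily for the unit-sized demo problem:
dt = 0.001                      # assumed time step for the unit-sized problem
for step in range(100):
    var_a.updateOld()
    var_b.updateOld()
    for sweep in range(3):      # a few sweeps per step so the coupled boundary terms converge
        res = eqn.sweep(dt=dt)
Viewer((var_a, var_b)).plot()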

How can I concatenate three 2D arrays which contain hue, saturation and intensity values in their respective arrays and display that as an image?

I am new to image processing and Python. As you can see from my code, I managed to convert my RGB image to HSI using the different formulas that I found.
I stored the values of hue, saturation and intensity in three different arrays; that is also in the code down below. How can I concatenate those three arrays and display the concatenated result as an image?
import math
from PIL import Image

img = Image.open("D:\\Texture analysis\\trees-clolorful-aerial-view-wallpaper.jpg")
rgb_img = img.convert('RGB')
row, col = img.size
print(row, col)

i = j = 0
satValue = 0
inValue = 0
hueValue = 0
squareValue = 0
hueArray = [[0 for x in range(row)] for y in range(col)]
satArray = [[0 for x in range(row)] for y in range(col)]
inArray = [[0 for x in range(row)] for y in range(col)]
division = 0
denominator = 0
numerator = 0
radAngle = 0
degAngle = 0
product = 0
sqr = 0
count = 0
uCount = 0

while i < row:
    j = 0
    while j < col:
        red, green, blue = rgb_img.getpixel((i, j))
        hRed = sRed = iRed = red
        hGreen = sGreen = iGreen = green
        hBlue = sBlue = iBlue = blue
        # =========================Saturation Calculation==============================
        if sRed == 0 and sGreen == 0 and sBlue == 0:
            satValue = 0
            satArray[i][j] = 0
        else:
            if (sRed < sGreen) and (sRed < sBlue):
                satValue = 1 - (((3) * (sRed)) / (sRed + sGreen + sBlue))
                satArray[i][j] = satValue
                # print(satValue)
            elif (sGreen < sRed) and (sGreen < sBlue):
                satValue = 1 - (((3) * (sGreen)) / (sRed + sGreen + sBlue))
                satArray[i][j] = satValue
                # print(satValue)
            else:
                satValue = 1 - (((3) * (sBlue)) / (sRed + sGreen + sBlue))
                satArray[i][j] = satValue
                # print(satValue)
        # =============================================================================
        # ==========================Intensity Calculation==============================
        inValue = (iRed + iGreen + iBlue) / 3
        inArray[i][j] = inValue
        count += 1
        print(inValue, count)
        # =============================================================================
        # =============================Hue Calculation=================================
        product = (hRed - hBlue) * (hGreen - hBlue)
        sqr = (hRed - hGreen) * (hRed - hGreen)
        denominator = math.sqrt(sqr + product)
        if denominator != 0:
            numerator = ((hRed - hGreen) + (hRed - hBlue)) / 2
            division = numerator / denominator
            radAngle = math.acos(division)
            degAngle = math.degrees(radAngle)
            if hBlue <= hGreen:
                hueValue = degAngle
                hueArray[i][j] = hueValue
            elif hBlue > hGreen:
                hueValue = 360 - degAngle
                hueArray[i][j] = hueValue
        elif denominator == 0:
            hueValue = 0
            hueArray[i][j] = hueValue
        # print(hueValue, count)
        # =============================================================================
        j += 1
    i += 1
print(i, j)
PS. You will be seeing a lot of my amateur code in the future as well :D
I can see what's going wrong now that I am back at a computer. You probably tried this:
#!/usr/bin/env python3
from PIL import Image
img = Image.open('start.png')
hsvimg = img.convert('HSV')
hsvimg.save('result.png')
And if you do that, you actually get an error message:
OSError: cannot write mode HSV as PNG
because PNG images are always in the sRGB colourspace, so it correctly declines to write your HSV image. The thing is, though, that the colourspace conversion actually worked and the values in the image really are the HSV values that you want. You can check this with:
img.getpixel((X,Y))
and
hsvimg.getpixel((X,Y))
where X and Y are any random coordinates you like. You will see that the latter is always the correct HSV representation of the former's RGB colour.
I am not sure what you are trying to do overall, so I can't really advise properly, but one thing you could do is "lie through your teeth" and tell PIL/Pillow that the image is RGB even though you know it is HSV. So if you do:
hsvimg = img.convert('HSV')
hsvimg.mode='RGB' # Tell PIL image is RGB
hsvimg.save('result.png')
it will save an image but it, and all other viewers, will show your Hue as Red, your Saturation as Green and your Value as Blue.
I am guessing you have other processing to do, and this is only an intermediate aspect of your processing, so it probably won't matter and you can probably carry on and do your processing and convert back at the end and save to an sRGB PNG file without needing to lie.
In answer to your actual question, you can split and merge channels like this with PIL/Pillow:
# Split and recombine with PIL
r, g, b = img.split()
merged = Image.merge(mode='RGB', bands=(r, g, b))
Or, if you prefer Numpy which is often faster:
import numpy as np

# Open image as Numpy array
img = np.array(Image.open('start.png'))

# Split into 3 channels/arrays/bands
r = img[:, :, 0]
g = img[:, :, 1]
b = img[:, :, 2]

# Recombine to single image
merged = np.dstack((r, g, b))
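To tie this back to the hueArray, satArray and inArray lists from the question, here is a rough sketch (mine, not part of the answer above) that rescales the three channels to 0-255, stacks them and lets PIL interpret them. Note that PIL's HSV "value" is not exactly the same thing as HSI intensity, so treat this as a quick visual check rather than a faithful HSI viewer:
import numpy as np
from PIL import Image

hue = np.array(hueArray) / 360.0 * 255.0    # hue was computed in degrees
sat = np.array(satArray) * 255.0            # saturation was computed in the range 0-1
inten = np.array(inArray)                   # intensity is already roughly 0-255

hsi = np.dstack((hue, sat, inten)).astype(np.uint8)   # shape (rows, cols, 3)
hsv_img = Image.frombytes('HSV', (hsi.shape[1], hsi.shape[0]), hsi.tobytes())
hsv_img.convert('RGB').show()               # or .save('result.png')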

TypeError: 'numpy.ndarray' object is not callable

I have seen other people asking something similar, but I cannot figure out the problem anyway.
I am trying to translate a MATLAB code to Python, and I have a problem after the following line in the for loop:
dx = abs(np.diff(g_coord(num)))
Below you have the code up to that loop. Any help will be appreciated. I really tried to fix it by myself but unsuccessfully. Sorry if it is a stupid mistake. The MATLAB lines are kept as Python comments in case it helps.
import numpy as np
from scipy.sparse import lil_matrix
# physical parameters
seconds_per_yr = 60*60*24*365; # number of seconds in one year
lx = 10000 ; #length of spatial domain (m)
Cp = 1e3 ; # rock heat capacity (J/kg/K)
rho = 2700 ; # rock density (kg/mˆ3)
K = 3.3 ; # bulk thermal conductivity (W/m/K)
kappa = K/(Cp*rho); # thermal diffusivity (mˆ2/s)
Tb = 0 ; # temperatures at boundaries (o C)
A = 2.6e-6 ; # heat production (W/mˆ3)
H = A/(rho*Cp); # heat source term (o K/s) % numerical parameters
dt = 1000*seconds_per_yr ; # time step (s)
ntime = 5000 ; # number of time steps
nels = 40 ; # total number of elements
nod = 2 ; # number of nodes per element
nn = nels+1 # total number of nodes
dx = lx/nels ; # element size
g_coord = np.arange(0, lx+1, dx)#[0:dx:lx]
bcdof = np.array([1, nn]); #[ 1 nn ] ; boundary nodes
bcval = np.array([Tb, Tb]); #[ Tb Tb ] ; # boudary values
g_num = np.zeros((nod, nels), float); #zeros(nod,nels) ;
g_num[0,:]=np.arange(1, nn); #g_num(1,:) = [1:nn-1] ;
g_num[1,:]=np.arange(2, nn+1); #g_num(2,:) = [2:nn] ;
# initialise matrices and vectors
ff = np.zeros((nn,1), float); # system load vector
b = np.zeros((nn,1), float); # system rhs vector
lhs=lil_matrix((nn, nn)) #lhs = sparse(nn,nn); system lhs matrix
rhs=lil_matrix((nn, nn)) #rhs = sparse(nn,nn); system rhs matrix
displ = np.zeros((nn,1), float); # initial temperature (o C)
#-----------------------------------------------------
# matrix assembly
#-----------------------------------------------------
# Matlab version of the loop
#-----------------------------------------------------
#for iel=1:nels # loop over all elements
# num = g_num(:,iel) ; # retrieve equation number
# dx = abs(diff(g_coord(num))) ; # length of element
# MM = dx*[1/3 1/6 ; 1/6 1/3 ] ;# mass matrix
# KM = [kappa/dx -kappa/dx ; -kappa/dx kappa/dx ]; #diffn matrix
# F = dx*H*[1/2 ; 1/2] ; # load vector
# lhs(num,num) = lhs(num,num) + MM/dt + KM ; # assemble lhs
# rhs(num,num) = rhs(num,num) + MM/dt ; # assemble rhs
# ff(num) = ff(num) + F ; # assemble load
#end # end of element loop
#Python version of the loop
#-----------------------------------------------------
for iel in range(0, nels): # loop over all elements
    num = g_num[:,iel] # retrieve equation number
    #print(num)
    dx = abs(np.diff(g_coord[num])) # length of element
    MM = dx*(np.array([[1/3, 1/6],[1/6, 1/3]])) # mass matrix
    KM = np.array([[kappa/dx, -kappa/dx],[-kappa/dx, kappa/dx]])
    F = dx*H*(np.array([1/2, 1/2])).reshape(-1,1) # load vector
    lhs[num,num] = lhs[num,num] + MM/dt + KM # assemble lhs
    rhs[num,num] = rhs[num,num] + MM/dt # assemble rhs
    ff[num] = ff[num] + F # assemble load
The error seems to be because num is an array of floats, which cannot be used as an index.
Simply do:
dx = abs(np.diff(g_coord[np.int32(num)]))
However, it raises another error a few lines later because num is a 2-element array. You know what the code should do, which I do not; if you have more issues, you can comment below or edit your question with the first problem solved.
Also, I noticed that you left all the ; at the end of the lines as in MATLAB. You do not need these in Python. And there is no need to specify float when you create the matrices of zeros; they are float by default.
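For what it's worth, here is a sketch of how I would guess the assembly loop is meant to look once the indexing is sorted out; this is my reading of the commented MATLAB original, not the answerer's code. It builds g_num as integers (shifted to 0-based indexing), takes dx as a scalar, and uses np.ix_ so that the lhs[num, num]-style assignments address the full 2x2 block the way MATLAB's lhs(num,num) does. Dense arrays are used for lhs and rhs just to keep the sketch simple:
g_num = np.zeros((nod, nels), dtype=int)
g_num[0, :] = np.arange(0, nn - 1)     # first node of each element (0-based)
g_num[1, :] = np.arange(1, nn)         # second node of each element

lhs = np.zeros((nn, nn))               # dense here just for the sketch
rhs = np.zeros((nn, nn))
ff = np.zeros((nn, 1))

for iel in range(nels):                                 # loop over all elements
    num = g_num[:, iel]                                 # the two node numbers of this element
    dx = abs(np.diff(g_coord[num]))[0]                  # element length as a scalar
    MM = dx * np.array([[1/3, 1/6], [1/6, 1/3]])        # mass matrix
    KM = np.array([[kappa/dx, -kappa/dx],
                   [-kappa/dx, kappa/dx]])              # diffusion matrix
    F = dx * H * np.array([[1/2], [1/2]])               # element load vector
    block = np.ix_(num, num)                            # 2x2 block of the global matrices
    lhs[block] += MM/dt + KM                            # assemble lhs
    rhs[block] += MM/dt                                 # assemble rhs
    ff[num] += F                                        # assemble load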

Repeating utility values in Value Iteration (Markov Decision Process)

I am trying to implement the value iteration algorithm for a Markov Decision Process in Python. I have one implementation, but it gives me many repeated values for the utilities. My transition matrix is quite sparse; probably this is causing the problem, but I am not very sure that this assumption is correct. How should I correct this?
The code might be pretty shoddy. I am very new to value iteration, so please help me identify problems with my code. The reference code is this: http://carlo-hamalainen.net/stuff/mdpnotes/. I have used the ipod_mdp.py code file. Here is a snippet of my implementation:
num_of_states = 470  # total number of states

# initialization
V1 = [0.25] * num_of_states
get_target_index = state_index[(u'48.137654', u'11.579949')]  # each state is a location
# print "The target index is ", get_target_index
V1[get_target_index] = -100  # assigning least cost to the target state
V2 = [0.0] * num_of_states
policy = [0.0] * num_of_states
count = 0.0

while max([abs(V1[i] - V2[i]) for i in range(num_of_states)]) > 0.001:
    print max([abs(V1[i] - V2[i]) for i in range(num_of_states)])
    print count
    for s in range(num_of_states):  # for each state
        # initialize minimum action to the first action in the list
        min_action = actions_index[actions[0]]  # initialize - get the action index for the first iteration
        min_action_cost = cost[s, actions_index[actions[0]]]  # initialize the cost
        for w in range(num_of_states):
            if (s, state_index[actions[0]], w) in transitions:  # if this transition exists in the matrix - non-zero value
                min_action_cost += 0.9 * transitions[s, state_index[actions[0]], w] * V1[w]
            else:
                min_action_cost += 0.9 * 0.001 * V1[w]  # if not - give it a small value of 0.001 instead of 0.0
        # get the minimum action cost for the state
        for a in actions:
            this_cost = cost[s, actions_index[a]]
            for w in range(num_of_states):
                # if index_state[w] != 'm':
                if (s, state_index[a], w) in transitions:
                    this_cost += 0.9 * transitions[s, state_index[a], w] * V1[w]
                else:
                    this_cost += 0.9 * 0.001 * V1[w]
            if this_cost < min_action_cost:
                min_action = actions_index[a]
                min_action_cost = this_cost
        V2[s] = min_action_cost
        policy[s] = min_action
    V1, V2 = V2, V1  # swap
    count += 1
Thank you very much.
I am not sure I understand your code fully. I will just leave my implementation here in case someone needs it.
import numpy as np

def valueIteration(R, P, discount, threshold):
    V = np.copy(R)
    old_V = np.copy(V)
    error = float("inf")
    while error > threshold:
        old_V, V = (V, old_V)
        max_values = np.dot(P, old_V).max(axis=1)
        np.copyto(V, R + discount * max_values)
        error = np.linalg.norm(V - old_V)
    return V

S = 30
A = 4
R = np.zeros(S)
# Goal state S-1
R[S-2] = 1
P = np.random.rand(S, A, S)
# Goal state goes to dwell state
P[S-2, :, :] = 0
P[S-2, :, S-1] = 1
P[S-1, :, :] = 0
P[S-1, :, S-1] = 1
for s in range(S-2):  # goal and dwell states do not need normalization
    for a in range(A):
        P[s, a, :] /= P[s, a, :].sum()
V = valueIteration(R, P, 0.97, 0.001)
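For reference, each pass of that while loop is the Bellman optimality backup V(s) <- R(s) + discount * max_a sum_s' P[s, a, s'] * V_old(s'): np.dot(P, old_V) computes the expectation over s' for every (s, a) pair at once, and .max(axis=1) then takes the best action for each state. Repeated utilities are not necessarily a bug, by the way: states with identical rewards and (near-)identical transition rows will legitimately converge to the same value, and the small constant fallback probability in your code makes many rows look alike.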
