Fitting a linear regression with scipy.stats; error in array shapes - arrays

I have written some code to read a data file using pandas and process the data with numpy. This results in some NaNs in the numpy array. I mask those out so that I can apply a linear regression fit with scipy.stats:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
def makeArray(band):
    """
    Return the magnitudes of one wavelength band as a float numpy array.

    Takes as argument a string naming a wavelength band (e.g. 'K'); the
    column ``band + '_mag'`` is read from the module-level DataFrame ``df``.
    Invalid values (where invalid == -999) are replaced with NaNs.

    The values are copied into a new float array, so ``df`` itself is left
    untouched and integer-valued columns can receive NaN as well.
    """
    array_name = band + '_mag'
    # An integer array cannot store NaN, so force a float dtype before
    # overwriting the sentinel values.
    array = np.array(df[array_name], dtype=float)
    array[array == -999] = np.nan
    return array
# Read data file
# Column names for the space-separated data file, which is read without a
# header row (names are supplied explicitly below).
fields = ['no', 'NED', 'z', 'obj_type','S_21', 'power', 'SI_flag',
'U_mag', 'B_mag', 'V_mag', 'R_mag', 'K_mag', 'W1_mag',
'W2_mag', 'W3_mag', 'W4_mag', 'L_UV', 'Q', 'flag_uv']
# Subset of `fields` holding the magnitude columns (not used further in the
# lines shown here).
magnitudes = ['U_mag', 'B_mag', 'V_mag', 'R_mag', 'K_mag', 'W1_mag',
'W2_mag', 'W3_mag', 'W4_mag']
df = pd.read_csv('todo.dat', sep = ' ',
names = fields, index_col = False)
# Define axes for processing
redshifts = np.array(df['z'])
# Natural log of the K-band magnitudes; entries that were -999 are NaN here.
y = np.log(makeArray('K'))
# True where y is NaN (i.e. this marks the INVALID entries).
mask = np.isnan(y)
plt.scatter(redshifts, y, label = ('K'), s = 2, color = 'r')
# NOTE(review): y[mask] keeps only the NaN entries, so it is shorter than
# `redshifts`; linregress then receives two arrays of different length, which
# is the ValueError quoted below. Selecting the valid points from BOTH arrays
# (redshifts[~mask], y[~mask]) would give matching shapes.
slope, intercept, r_value, p_value, std_err = stats.linregress(redshifts, y[mask])
# Fitted line evaluated at every redshift; it is computed but not plotted in
# the lines shown here.
fit = slope*redshifts + intercept
plt.legend()
plt.show()
but the lines where I calculate the stats parameters and the fit line (third- and fourth-to-last lines) give me the following error:
Traceback (most recent call last):
File "<ipython-input-77-ec9f43cdfa9b>", line 1, in <module>
runfile('C:/Users/Jeremy/Dropbox/Notes/Postgrad/Masters Research/VUW/QSOs/read_csv.py', wdir='C:/Users/Jeremy/Dropbox/Notes/Postgrad/Masters Research/VUW/QSOs')
File "C:\Users\Jeremy\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
execfile(filename, namespace)
File "C:\Users\Jeremy\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Jeremy/Dropbox/Notes/Postgrad/Masters Research/VUW/QSOs/read_csv.py", line 35, in <module>
slope, intercept, r_value, p_value, std_err = stats.linregress(redshifts, y[mask])
File "C:\Users\Jeremy\Anaconda3\lib\site-packages\scipy\stats\_stats_mstats_common.py", line 92, in linregress
ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
File "C:\Users\Jeremy\Anaconda3\lib\site-packages\numpy\lib\function_base.py", line 2865, in cov
X = np.vstack((X, y))
File "C:\Users\Jeremy\Anaconda3\lib\site-packages\numpy\core\shape_base.py", line 234, in vstack
return _nx.concatenate([atleast_2d(_m) for _m in tup], 0)
ValueError: all the input array dimensions except for the concatenation axis must match exactly
The variables are shaped like:
so I'm not sure what the error means, or how to fix it. Is there a way around this? Or perhaps another module I can use instead of scipy.stats that will allow me to fit a linear regression?

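The problem is that y[mask] has a different length from redshifts: mask = np.isnan(y) is True only where y is NaN, so y[mask] contains just the NaN entries while redshifts still holds every point.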
Below is a simple example piece of code to show the issue..
import numpy as np
na = np.array
# Ten samples, three of which are missing (NaN).
y = na([np.nan, 4, 5, 6, 7, 8, np.nan, 9, 10, np.nan])
mask = np.isnan(y)
# Boolean indexing keeps only the True positions, so the masked selection is
# as long as the number of NaNs, not as long as y.
n_total = y.shape[0]
n_masked = int(np.count_nonzero(mask))
print(n_total, n_masked)
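You will have to substitute values for the NaN values in y (or, alternatively, drop those points from both arrays with the inverted mask, i.e. redshifts[~mask] and y[~mask]), with something like: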
print('old y: ', y)
# Overwrite every masked (NaN) position in one vectorized assignment.
y[mask] = 1000 # or whatever value you decide on
print('new y: ', y)
Full example code...
import numpy as np
na = np.array
# Ten samples, three of which are missing (NaN).
y = na([np.nan, 4, 5, 6, 7, 8, np.nan, 9, 10, np.nan])
mask = np.isnan(y)
# The masked selection holds only the NaN entries, hence the shorter length.
print(len(y), len(y[mask]))
print('old y: ', y)
# Replace the NaN entries in place; the array keeps its original length.
y[mask] = 1000 # or whatever value you decide on
print('new y: ', y)
print(len(y))

Related

How to apply textwrap.wrap as ufunc on xarray.Dataarray

I am desperately trying to split strings within an xarray.Dataarray.
What should happen to every element of the array is e.g.
"aaabbbccc" --> [aaa, bbb, ccc]
Fortunately, such a function already exists in the textwrap library, but applying it to my Dataarray is a different story:
xds = riox.open_rasterio(fp_output_tmp_mlsieved, chunks = "auto")
<xarray.DataArray (band: 1, y: 2, x: 2)>
dask.array<transpose, shape=(1, 2, 2), dtype=<U18, chunksize=(1, 2, 2), chunktype=numpy.ndarray>
Coordinates:
* band (band) int64 1
* x (x) float64 3.077e+06 3.077e+06 ... 3.077e+06 3.077e+06
* y (y) float64 1.865e+06 1.865e+06 ... 1.865e+06 1.865e+06
spatial_ref int64 0
Loaded it looks like this:
array([[['000000000000000000', '000000000000000000'],
['000000000000000000', '000000000000000000']]], dtype='<U18')
I think a solution is to apply it with xr.apply_ufunc(). I have managed to do that with a simpler numpy function before, but with wrap() all I get is a bunch of errors. I think the main issue is that it is not a vectorized numpy function, and second that I can't get the dimensions to work out. My latest try looks like this:
def decompressor(s, l):
    """Split the single string held in ``s`` into pieces of width ``l``.

    ``s.item()`` extracts the scalar from ``s``; the result of ``wrap`` is
    returned as a numpy array. ``wrap`` is presumably ``textwrap.wrap``
    (its import is not shown in this snippet) -- TODO confirm.
    """
    return np.array(wrap(s.item(), l))
def ufunc_decompressor(s, l):
    """Apply ``decompressor`` to ``s`` through ``xr.apply_ufunc``.

    ``s`` is the array to process and ``l`` the chunk width that is passed on
    to ``decompressor`` and used for the output string dtype.

    NOTE(review): "band" is declared as both input and output core dimension
    and listed in ``exclude_dims``. With ``dask="parallelized"`` the size of
    an output core dimension that changes may have to be declared explicitly
    (e.g. via ``dask_gufunc_kwargs``/``output_sizes``); this could be related
    to the ``KeyError: 'dim0'`` reported for this call -- confirm against the
    xarray documentation.
    """
    return xr.apply_ufunc(
        decompressor,
        s, l,
        # unicode strings of at most l characters
        output_dtypes=[np.dtype(f"U{l}")],
        # s is consumed along "band"; l is passed without core dimensions
        input_core_dims=[["band"],[]],
        output_core_dims=[["band"]],
        exclude_dims=set(("band",)),
        dask="parallelized",
        # decompressor works on one element at a time (it calls s.item())
        vectorize=True
    )
xds_split = ufunc_decompressor(xds, 3).load()
What I get is a cryptic error:
File "/home/.../miniconda3/envs/postproc/lib/python3.10/site-packages/dask/array/gufunc.py", line 489, in <genexpr>
core_output_shape = tuple(core_shapes[d] for d in ocd)
KeyError: 'dim0'

NameError: name 'sequence' is not defined

I am using jupyter notebook.
I am trying to split the sequence into multiple samples.
running the following code gives me NameError.
# univariate data preparation
from numpy import array
# split a univariate sequence into samples
def split_sequence(sequence, n_steps):
    """Cut a univariate sequence into (window, next value) samples.

    Each sample pairs ``n_steps`` consecutive items with the item that
    directly follows them. Returns two numpy arrays: the windows and the
    matching follow-up values.
    """
    total = len(sequence)
    # A window is only usable if the element right after it still exists;
    # the count can never exceed the number of start positions.
    n_samples = max(0, min(total, total - n_steps))
    windows = [sequence[start:start + n_steps] for start in range(n_samples)]
    targets = [sequence[start + n_steps] for start in range(n_samples)]
    return array(windows), array(targets)
# input series: 10, 20, ..., 90
raw_seq = list(range(10, 100, 10))
# number of time steps per window
n_steps = 3
# build the (window, next value) samples
X, y = split_sequence(raw_seq, n_steps)
# show every window next to the value that follows it
for window, target in zip(X, y):
    print(window, target)

Plotting a list vs a list of arrays with matplotlib

Let's say I have two lists a and b, whereas one is a list of arrays
a = [1200, 1400, 1600, 1800]
b = [array([ 1.84714754, 4.94204658, 11.61580355, ..., 17.09772144,
17.09537562, 17.09499705]), array([ 3.08541849, 5.11338795, 10.26957508, ..., 16.90633304,
16.90417909, 16.90458781]), array([ 4.61916789, 4.58351918, 4.37590053, ..., -2.76705271,
-2.46715664, -1.94577492]), array([7.11040853, 7.79529924, 8.48873734, ..., 7.78736448, 8.47749987,
9.36040364])]
The shape of both is said to be (4,)
If I now try to plot these via plt.scatter(a, b)
I get an error I can't relate to: ValueError: setting an array element with a sequence.
At the end I want a plot where per n-th value in a a set of values stored as n-th array in b shall be plotted.
I'm pretty sure I've done this before, but I can't get this working.
Any ideas? ty
You need to adjust the elements in a to match the elements in b
# number of values stored in each sub-array of b
len_b = [len(sub_array) for sub_array in b]
# repeat the i-th entry of a once for every value in the i-th sub-array of b
a = [repeat_a for i,repeat_a in enumerate(a) for _ in range(len_b[i])]
# convert list of array to just list of values
# NOTE(review): np.ravel only yields one flat list of values when all
# sub-arrays have the same length; for sub-arrays of different lengths
# np.concatenate(b) is reported further down to give the desired result.
b = np.ravel(b).tolist()
# check if lengths are same
assert len(a) == len(b)
# if yes, now this should work
plt.scatter(a,b)
I am afraid repetition it is. If all lists in b have the same length, you can use numpy.repeat:
import numpy as np
import matplotlib.pyplot as plt
#fake data
np.random.seed(123)
a = [1200, 1400, 1600, 1800]
# 4 rows (one per entry of a) with 11 random integers each
b = np.random.randint(1, 100, (4, 11)).tolist()
# each entry of a is repeated len(b[0]) times so that there is one x value
# per y value; this relies on every row of b having the same length
plt.scatter(np.repeat(a, len(b[0])), b)
plt.show()
If you are not sure and want to be on the safe side, list comprehension it is.
import numpy as np
import matplotlib.pyplot as plt
#fake data
np.random.seed(123)
a = [1200, 1400, 1600, 1800]
# 4 rows (one per entry of a) with 11 random integers each
b = np.random.randint(1, 100, (4, 11)).tolist()
# for the i-th entry of a, build a list repeating it len(b[i]) times, i.e.
# the repeat count is taken from the matching row of b rather than from b[0]
plt.scatter([[x]*len(b[i]) for i, x in enumerate(a)], b)
plt.show()
The output is the same:
Referring to the suggestion of @sai, I tried
import numpy as np
# three arrays of different lengths (5 + 4 + 2 = 11 values in total)
arr0 = np.array([1, 2, 3, 4, 5])
arr1 = np.array([6, 7, 8, 9])
arr2 = np.array([10, 11])
old_b = [arr0, arr1, arr2]
# attempt to flatten the list of arrays into one list of values
b = np.ravel(old_b).tolist()
# the poster reports a length of 3 here instead of the expected 11
print(len(b))
Which will give me length 3 instead of the length 11 I expected. How can I collapse a list of arrays to a single list?
edit:
b = np.concatenate(old_b).ravel().tolist()
will lead to the desired result. Thanks all.

how to random generate a sequence of list that are unobserved before in python 3

Assume I have the code in Python 3
X, Y, Z = 10, 20, 30
data = [[1,3,6],[8,15,29],[8,9,19]] # observe data
Then how can I randomly generate n (not very large) data elements that are not in data?
Condition: the element [a,b,c] must be not in data and 0<a<X, 0<b<Y, 0<c<Z
[1,3,5] is good since it is not in data and its element satisfy the Condition
[11,3,6] is bad since it does not satisfy the Condition, 11>10
For example, when n=4, I want a list of element that are not duplicate
newdata = [[1,6,6],[8,17,25],[2,6,11], [4,6,12]]
This should do it:
from random import randrange
X, Y, Z = 10, 20, 30
data = [[1,3,6],[8,15,29],[8,9,19]]
n = 4
# Tuples are hashable, so observed and generated elements can both live in
# sets and be tested for membership cheaply.
observed = {tuple(row) for row in data}
# Number of admissible elements (0 < a < X, 0 < b < Y, 0 < c < Z) that are
# not already observed; asking for more than that could never terminate.
free = (X - 1) * (Y - 1) * (Z - 1) - sum(
    1 for a, b, c in observed if 0 < a < X and 0 < b < Y and 0 < c < Z
)
if n > free:
    raise ValueError("n exceeds the number of unobserved elements")
newdata = set()
# Loop on the size of the result rather than on a fixed number of draws: a
# draw that repeats an earlier one does not grow the set and must be retried.
while len(newdata) < n:
    # randrange excludes the upper bound, which gives the strict a < X etc.
    candidate = (randrange(1, X), randrange(1, Y), randrange(1, Z))
    if candidate not in observed:
        newdata.add(candidate)
print(newdata)
Example result:
newdata = [(9, 9, 11), (10, 10, 4), (7, 6, 23), (2, 10, 4)]
It took a small effort, but this seems to work:
from random import randrange
from pprint import pprint
X, Y, Z = 10, 20, 30
data = [[1,3,6],[8,15,29],[8,9,19]]
# Interactive loop: keeps asking for a count and printing that many new lists
# until the user interrupts it.
while 1:
    newData = []
    # Only a non-integer entry should trigger a retry. A bare ``except`` would
    # also swallow Ctrl-C, leaving no way out of this endless loop.
    try:
        n = int(input("How many lists do you want: "))
    except ValueError:
        print("Please enter an integer.\n")
        continue
    for i in range(n):
        # Redraw until the candidate is neither observed nor generated before.
        # randrange excludes the upper bound, so 0 < a < X etc. holds.
        newList = [randrange(1, X), randrange(1, Y), randrange(1, Z)]
        while (newList in data) or (newList in newData):
            newList = [randrange(1, X), randrange(1, Y), randrange(1, Z)]
        newData.append(newList)
    pprint(newData)
This works by creating an empty list, getting a value for n, then entering a loop of exactly n iterations. It then creates a new list that satisfies the requirements. If the new list is in the observed data list, it just does it again and again until it isn't in the data. Then it adds this data to the output list and repeats the process until the for loop breaks (after n iterations).
There may be a better way of doing it, but this does the trick.
In case X, Y, Z are not too large you can just create all possible combinations and then sample from this pool:
import itertools as it
import random
x, y, z = 10, 20, 30
# Every coordinate has to be strictly positive (0 < a < x, ...), so the
# ranges start at 1 instead of 0.
pool = it.product(range(1, x), range(1, y), range(1, z))
data = [(1, 3, 6), (8, 15, 29), (8, 9, 19)]
# Remove the observed elements from the set of all admissible ones.
pool = set(pool) - set(data)
n = 4
# random.sample requires a sequence: passing a set was deprecated in
# Python 3.9 and raises TypeError from 3.11 on. Sorting also makes the result
# reproducible for a fixed random seed.
newdata = random.sample(sorted(pool), n)
For higher performance you can use Numpy and the fact that the tuples can be converted to integer and back by simply enumerating them (as z, y, x gets enumerated):
import numpy as np
x, y, z = 100, 200, 300
n = 1000
data = [[1,3,6],[8,15,29],[8,9,19]]
# Flat index of (a, b, c) in an x*y*z grid, with c varying fastest.
forbidden = [i[0]*y*z + i[1]*z + i[2] for i in data]
pool = np.arange(x*y*z)
# Keep only indices whose three coordinates are all strictly positive
# (0 < a, 0 < b, 0 < c); the upper bounds hold by construction of the grid.
mask = (pool // (y*z) > 0) & ((pool // z) % y > 0) & (pool % z > 0)
# Drop the already observed elements.
mask[forbidden] = False
pool = pool[mask]
newdata = np.random.choice(pool, n, replace=False)
# Invert the flat index. The middle coordinate must be reduced modulo y,
# otherwise it still contains the contribution of the first coordinate.
newdata = [(i // (y*z), (i // z) % y, i % z) for i in newdata]

scipy curve_fit with arrays TypeError: only length-1 arrays can be converted to Python scalars

I am trying to create the curve fit with scipy for the energy eigenvalues calculated from a 4x4 Hamiltonian matrix. In the following error "energies" corresponds to the function in which I define the Hamiltonian, "xdata" is an array given after and out of the function and corresponds to k, and "e" is the energy eigenvalues that I get.
The error seems to be at the Hamiltonian matrix. However if I run the code without the curve_fit everything works fine.
I have also tried using np.array according to other questions I found here but again it doesn't work.
If I give a specific xdata in the curve fit, like xdata[0], the code works, but it doesn't help me much since I want the fit using all values.
Does anyone know what is the problem? Thank you all in advance!
Traceback (most recent call last):
File "fitest.py", line 70, in <module>
popt, pcov = curve_fit(energies,xdata, e)#,
File "/eb/software/Python/2.7.12-intel-2016b/lib/python2.7/site-packages/scipy/optimize/minpack.py", line 651, in curve_fit
res = leastsq(func, p0, args=args, full_output=1, **kwargs)
File "/eb/software/Python/2.7.12-intel-2016b/lib/python2.7/site-packages/scipy/optimize/minpack.py", line 377, in leastsq
shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
File "/eb/software/Python/2.7.12-intel-2016b/lib/python2.7/site-packages/scipy/optimize/minpack.py", line 26, in _check_func
res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
File "/eb/software/Python/2.7.12-intel-2016b/lib/python2.7/site-packages/scipy/optimize/minpack.py", line 453, in _general_function
return function(xdata, *params) - ydata
File "fitest.py", line 23, in energies
[ 0.0, 0.0, 0.0, ep-2*V4*cos(kpt*d) ]],dtype=complex)
TypeError: only length-1 arrays can be converted to Python scalars
Code:
from numpy import sin, cos, array
from scipy.optimize import curve_fit
from numpy import *
from numpy.linalg import *
def energies(kpt, a=1.0, b=2.0, c=3.0, f=4.0):
    """Return the eigenvalues of a 4x4 Hamiltonian evaluated at ``kpt``.

    ``a``, ``b`` and ``c`` scale the base coupling ``v0`` to give V1, V2 and
    V3. The matrix is built with ``dtype=complex``, so every entry must be a
    scalar; the answer below explains that passing an array for ``kpt`` is
    what raises the TypeError.

    NOTE(review): ``f`` is never used, and V4 is computed from ``d`` (the
    constant 1.0 defined below) -- possibly ``f*v0`` was intended; confirm
    with the author.
    """
    # on-site energies used on the diagonal
    e1=-15.0
    e2=-10.0
    # constant multiplying kpt inside cos/sin and setting v0
    d=1.0
    v0=(-2.0/d**2)
    V1=a*v0
    V2=b*v0
    V3=c*v0
    V4=d*v0
    # labels only; not used in the computation below
    basis=('|S, s>', '|S,px>', '|S, py>', '|S,pz>')
    h=array([[ e1-2*V1*cos(kpt*d), -2*V2*1j*sin(kpt*d), 0.0, 0.0 ],
    [ 2*V2*1j*sin(kpt*d), e2-2*V3*cos(kpt*d), 0.0, 0.0],
    [ 0.0, 0.0, e2-2*V4*cos(kpt*d), 0.0],
    [ 0.0, 0.0, 0.0, e2-2*V4*cos(kpt*d) ]],dtype=complex)
    # eigh returns eigenvalues and eigenvectors; only the eigenvalues are used
    e,psi=eigh(h)
    return e
# NOTE: this script uses Python 2 print statements.
# Evaluate the eigenvalues at two single k values (scalar input works).
print energies(kpt=0.0)
k2=0.4*2*pi/2.05
print energies(kpt=k2)
# Both k values collected in one array, used as the x data of the fit.
xdata = array([0.0,k2])
print xdata
# curve_fit calls energies with the whole xdata array at once.
# NOTE(review): `e` is not defined at module level in the code shown (it is
# only a local name inside energies) -- confirm where the y data comes from.
popt, pcov = curve_fit(energies, xdata, e)
print " "
print popt
print " "
Your problem has nothing to do with your fit, you run into the same problem, if you perform
print energies(xdata)
The reason for this error message is that you put an array kpt into h as an array element and then tell numpy, to transform this array kpt into a complex number. Numpy is kind enough to transform an array of length one into a scalar, which then can be transformed into a complex number. This explains, why you didn't get an error message with xdata[0]. You can easily reproduce your problem like this
# NOTE: this snippet uses Python 2 print statements.
import numpy as np
#all fine with an array of length one
xa = np.asarray([1])
# the length-one array xa is placed as a single element of a nested list
a = np.asarray([[xa, 2, 3], [4, 5, 6]])
print a
print a.astype(complex)
#can't apply dtype = complex to an array with two elements
xb = np.asarray([1, 2])
# same construction, but the embedded array now has two elements
b = np.asarray([[xb, 2, 3], [4, 5, 6]])
print b
print b.astype(complex)
Idk, what you were trying to achieve with your energies function, so I can only speculate, what you were aiming at, when constructing the h array. Maybe a 3D array like this?
# NOTE: this snippet uses a Python 2 print statement.
kpt = np.asarray([1, 2, 3])
# one 4x4 complex matrix per k value: shape (len(kpt), 4, 4), zero-filled
h = np.zeros(16 * len(kpt), dtype = complex).reshape(len(kpt), 4, 4)
# fill individual matrix elements for all k values at once
h[:, 0, 0] = 2 * kpt + 1
h[:, 0, 1] = kpt ** 2
h[:, 3, 2] = np.sin(kpt)
print h

Resources