How can I run GridSearchCV in dask_ml despite this error?

This is my code in Google Colab:
import cupy as cp
import numpy as np
import joblib
import dask_ml.model_selection as dcv

def ParamSelection(X, Y, nfolds):
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'kernel': ['linear'], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
    svc = svm.SVC()
    grid_search = dcv.GridSearchCV(svc, param_grid, cv=nfolds)
    grid_search.fit(X, Y)
    print(grid_search.best_params_)
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    return grid_search.best_estimator_

svc = ParamSelection(X_train.astype(cp.int_), y_train.astype(cp.int_), 10)
I get this error:
TypeError Traceback (most recent call last)
<ipython-input-163-56196d6a31bd> in <module>()
15 return grid_search.best_estimator_
16
---> 17 svc = ParamSelection(X_train.astype(cp.int_), y_train.astype(cp.int_), 10)
18
9 frames
/usr/local/lib/python3.7/site-packages/cudf/core/frame.py in __array__(self, dtype)
1677 def __array__(self, dtype=None):
1678 raise TypeError(
-> 1679 "Implicit conversion to a host NumPy array via __array__ is not "
1680 "allowed, To explicitly construct a GPU array, consider using "
1681 "cupy.asarray(...)\nTo explicitly construct a "
TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed, To explicitly construct a GPU array, consider using cupy.asarray(...)
To explicitly construct a host array, consider using .to_array()
For train_test_split I use the function from:
from dask_ml.model_selection import train_test_split
I don't really know where the problem is.
Any suggestions?

Somewhere in its internals, Dask-ML is likely calling np.asarray on the GPU data (a cuPy array or cuDF object). Implicitly triggering a GPU-to-CPU (device-to-host) transfer like this is not permitted, so an error is thrown.
If you instead use CPU-based data with a cuML estimator, this should work as expected.
import cupy as cp
import dask_ml.model_selection as dcv
from sklearn.datasets import make_classification
from cuml import svm

X, y = make_classification(
    n_samples=100
)

def ParamSelection(X, Y, nfolds):
    param_grid = {'C': [0.001, 10, 100], 'gamma': [0.001, 100]}
    svc = svm.SVC()
    grid_search = dcv.GridSearchCV(svc, param_grid, cv=nfolds)
    grid_search.fit(X, Y)
    print(grid_search.best_params_)
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    return grid_search.best_estimator_

svc = ParamSelection(X, y, 2)
{'C': 10, 'gamma': 0.001}
SVC()
0.8399999737739563
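If you prefer to keep starting from the cuDF data, another option is to do the device-to-host conversion explicitly yourself before fitting, which is what the error message is asking for. A minimal hedged sketch (assuming X_train and y_train are cuDF objects, as the cudf frame in the traceback suggests):
import numpy as np

# Hedged sketch: explicit device-to-host conversion before calling ParamSelection.
# X_train / y_train are assumed to be cuDF objects (see the cudf frame in the traceback).
X_host = X_train.to_pandas().values   # cuDF -> pandas -> host NumPy array
y_host = y_train.to_pandas().values

svc = ParamSelection(X_host.astype(np.int_), y_host.astype(np.int_), 10)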

Related

Tensorflow: convert PrefetchDataset to BatchDataset

With the latest Tensorflow version 2.3.1 I am trying to follow the basic text classification example at: https://www.tensorflow.org/tutorials/keras/text_classification. Instead of creating the dataset from a directory as the example does, I am using a csv file:
SELECT_COLUMNS = ['SentimentText','Sentiment']
LABEL_COLUMN = 'Sentiment'
LABELS = [0, 1]

def get_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=3, # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset

all_data = get_dataset(data_path, select_columns=SELECT_COLUMNS)
As a result I get:
type(all_data)
tensorflow.python.data.ops.dataset_ops.PrefetchDataset
The example loads data from a directory with:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)
And gets a dataset of another type:
type(raw_train_ds)
tensorflow.python.data.ops.dataset_ops.BatchDataset
Now when I try to standardise and vectorise the data with the functions from the example:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(string.punctuation),
                                    '')

max_features = 10000
sequence_length = 250

vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)
and apply them to my dataset, I get an error:
# Make a text-only dataset (without labels), then call adapt
train_text = all_data.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-20-1f1fc445912d> in <module>
1 # Make a text-only dataset (without labels), then call adapt
2 train_text = all_data.map(lambda x, y: x)
----> 3 vectorize_layer.adapt(train_text)
/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/layers/preprocessing/text_vectorization.py in adapt(self, data, reset_state)
378 shape = dataset_ops.get_legacy_output_shapes(data)
379 if not isinstance(shape, tensor_shape.TensorShape):
--> 380 raise ValueError("The dataset passed to 'adapt' must contain a single "
381 "tensor value.")
382 if shape.rank == 0:
ValueError: The dataset passed to 'adapt' must contain a single tensor value.
How to convert PrefetchDataset to BatchDataset ?
You could use the tf.stack method to pack the features into a single tensor. The function below is from the Custom training: walkthrough tutorial in Tensorflow.
def pack_features_vector(features, labels):
    features = tf.stack(list(features.values()), axis=1)
    return features, labels

all_data = get_dataset(data_path, select_columns=SELECT_COLUMNS)
train_dataset = all_data.map(pack_features_vector)
train_text = train_dataset.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
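Since the CSV here has only a single feature column, a hedged alternative (assuming the 'SentimentText' column name from SELECT_COLUMNS above) is to select that one string tensor from the feature dict instead of stacking:
# Hedged alternative: make_csv_dataset yields (features_dict, label) pairs,
# so with one feature column we can just pick it by name.
train_text = all_data.map(lambda features, label: features['SentimentText'])
vectorize_layer.adapt(train_text)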

Increase speed creation for masked xarray file

I am currently trying to crop a rectangular xarray file to the shape of a country using a mask grid. Below you can find my current solution (with simpler and smaller arrays). The code works and I get the desired mask based on 1s and 0s. The problem lies in the fact that the code, when run on a real country shape (larger and more complex), takes over 30 minutes to run.

Since I am using very basic operations here, like nested for loops, I also tried different alternatives, like a list approach. However, when timing the process, it did not improve on the code below. I wonder if there is a faster way to obtain this mask (vectorization?) or if I should approach the problem in a different way (I tried exploring xarray's properties, but have not found anything that tackles this issue yet).
Code below:
import geopandas as gpd
from shapely.geometry import Polygon, Point
import pandas as pd
import numpy as np
import xarray as xr
df = pd.read_csv('Brazil_borders.csv',index_col=0)
lats = np.array([-20, -5, -5, -20,])
lons = np.array([-60, -60, -30, -30])
lats2 = np.array([-10.25, -10.75, -11.25, -11.75, -12.25, -12.75, -13.25, -13.75,
-14.25, -14.75, -15.25, -15.75, -16.25, -16.75, -17.25, -17.75,
-18.25, -18.75, -19.25, -19.75, -20.25, -20.75, -21.25, -21.75,
-22.25, -22.75, -23.25, -23.75, -24.25, -24.75, -25.25, -25.75,
-26.25, -26.75, -27.25, -27.75, -28.25, -28.75, -29.25, -29.75,
-30.25, -30.75, -31.25, -31.75, -32.25, -32.75])
lons2 = np.array([-61.75, -61.25, -60.75, -60.25, -59.75, -59.25, -58.75, -58.25,
-57.75, -57.25, -56.75, -56.25, -55.75, -55.25, -54.75, -54.25,
-53.75, -53.25, -52.75, -52.25, -51.75, -51.25, -50.75, -50.25,
-49.75, -49.25, -48.75, -48.25, -47.75, -47.25, -46.75, -46.25,
-45.75, -45.25, -44.75, -44.25])
points = []
for i in range(len(lats)):
    _ = [lats[i], lons[i]]
    points.append(_)
poly_proj = Polygon(points)

mask = np.zeros((len(lats2), len(lons2)))  # Mask with the dataset's shape and size.
for i in range(len(lats2)):  # Iteration to verify if a given coordinate is within the polygon's area
    for j in range(len(lons2)):
        grid_point = Point(lats2[i], lons2[j])
        if grid_point.within(poly_proj):
            mask[i][j] = 1
bool_final = mask
bool_final
The alternative based on a list-comprehension approach, but with even worse processing time (according to timeit):
lats = np.array([-20, -5, -5, -20,])
lons = np.array([-60, -60, -30, -30])
lats2 = np.array([-10.25, -10.75, -11.25, -11.75, -12.25, -12.75, -13.25, -13.75,
-14.25, -14.75, -15.25, -15.75, -16.25, -16.75, -17.25, -17.75,
-18.25, -18.75, -19.25, -19.75, -20.25, -20.75, -21.25, -21.75,
-22.25, -22.75, -23.25, -23.75, -24.25, -24.75, -25.25, -25.75,
-26.25, -26.75, -27.25, -27.75, -28.25, -28.75, -29.25, -29.75,
-30.25, -30.75, -31.25, -31.75, -32.25, -32.75])
lons2 = np.array([-61.75, -61.25, -60.75, -60.25, -59.75, -59.25, -58.75, -58.25,
-57.75, -57.25, -56.75, -56.25, -55.75, -55.25, -54.75, -54.25,
-53.75, -53.25, -52.75, -52.25, -51.75, -51.25, -50.75, -50.25,
-49.75, -49.25, -48.75, -48.25, -47.75, -47.25, -46.75, -46.25,
-45.75, -45.25, -44.75, -44.25])
points = []
for i in range(len(lats)):
    _ = [lats[i], lons[i]]
    points.append(_)
poly_proj = Polygon(points)

grid_point = [Point(lats2[i], lons2[j]) for i in range(len(lats2)) for j in range(len(lons2))]
mask = [1 if grid_point[i].within(poly_proj) else 0 for i in range(len(grid_point))]
bool_final2 = np.reshape(mask, (len(lats2), len(lons2)))
Thank you in advance!
Based on this answer from snowman2, I created this simple function that provides a much faster solution by using geopandas and rioxarray. Instead of using a list of latitudes and longitudes, one has to use a shapefile with the desired shape to be masked (Instructions for GeoDataFrame creation from list of coordinates).
import xarray as xr
import geopandas as gpd
import rioxarray
from shapely.geometry import mapping

def mask_shape_border(DS, shape_shp):  # Inputs are the dataset to be cropped and the address of the mask file (.shp)
    if 'lat' in DS:  # Some datasets use lat/lon, others latitude/longitude
        DS.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
    elif 'latitude' in DS:
        DS.rio.set_spatial_dims(x_dim="longitude", y_dim="latitude", inplace=True)
    else:
        print("Error: check latitude and longitude variable names.")
    DS.rio.write_crs("epsg:4326", inplace=True)
    mask = gpd.read_file(shape_shp, crs="epsg:4326")
    DS_clipped = DS.rio.clip(mask.geometry.apply(mapping), mask.crs, drop=False)
    return DS_clipped
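For the vectorization question asked above specifically, a hedged sketch that drops the nested loops entirely is to test all grid points at once with matplotlib's Path (reusing lats, lons, lats2 and lons2 from the question; boundary-point handling may differ slightly from shapely's within):
import numpy as np
from matplotlib.path import Path

# Hedged vectorized sketch of the same 0/1 mask, reusing the arrays defined in the question.
verts = np.column_stack((np.append(lats, lats[0]), np.append(lons, lons[0])))  # close the polygon explicitly
poly_path = Path(verts)
lat_grid, lon_grid = np.meshgrid(lats2, lons2, indexing='ij')                  # every grid coordinate at once
grid_points = np.column_stack((lat_grid.ravel(), lon_grid.ravel()))
bool_final_fast = poly_path.contains_points(grid_points).reshape(len(lats2), len(lons2)).astype(int)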

Python, face_recognition convert string to array

I want to convert a variable to a string and then to an array that I can use to compare, but I don't know how to do that.
my code:
import face_recognition
import numpy as np
a = face_recognition.load_image_file('C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191115_10_32_24_Pro.jpg') # my picture 1
b = face_recognition.load_image_file('C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191115_09_48_56_Pro.jpg') # my picture 2
c = face_recognition.load_image_file(
'C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191115_09_48_52_Pro.jpg') # my picture 3
d = face_recognition.load_image_file('C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\ziv sion.jpg') # my picture 4
e = face_recognition.load_image_file(
'C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191120_17_46_40_Pro.jpg') # my picture 5
f = face_recognition.load_image_file(
'C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191117_16_19_11_Pro.jpg') # my picture 6
a = face_recognition.face_encodings(a)[0]
b = face_recognition.face_encodings(b)[0]
c = face_recognition.face_encodings(c)[0]
d = face_recognition.face_encodings(d)[0]
e = face_recognition.face_encodings(e)[0]
f = face_recognition.face_encodings(f)[0]
Here I tried to convert the variable to a string
str_variable = str(a)
array_variable = np.array(str_variable)
my_face = a, b, c, d, e, f, array_variable
while True:
    new = input('path: ')
    print('Recognizing...')
    unknown = face_recognition.load_image_file(new)
    unknown_encodings = face_recognition.face_encodings(unknown)[0]
The program cannot use the variable:
results = face_recognition.compare_faces(array_variable, unknown_encodings, tolerance=0.4)
print(results)

recognize_times = int(results.count(True))
if (3 <= recognize_times):
    print('hello boss!')
    my_face = *my_face, unknown_encodings
please help me
The error shown:
Traceback (most recent call last):
File "C:/Users/zivsi/PycharmProjects/AI/pytt.py", line 37, in <module>
results = face_recognition.compare_faces(my_face, unknown_encodings, tolerance=0.4)
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-
packages\face_recognition\api.py", line 222, in compare_faces
return list(face_distance(known_face_encodings, face_encoding_to_check) <= tolerance)
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-packages\face_recognition\api.py", line 72, in face_distance
return np.linalg.norm(face_encodings - face_to_compare, axis=1)
ValueError: operands could not be broadcast together with shapes (7,) (128,)
First of all, the array_variable should actually be a list of the known encodings and not a numpy array.
Also, you do not need str().
Now, in your case, if the input images, i.e. a, b, c, d, e, f, do NOT have the same dimensions, the error will persist. You cannot compare images that have different sizes using this function. The reason is that the comparison is based on distance, and distance is defined on vectors of the same length.
Here is a working simple example using the photos from https://github.com/ageitgey/face_recognition/tree/master/examples:
import face_recognition
import numpy as np
from PIL import Image, ImageDraw
from IPython.display import display
# Load a sample picture and learn how to recognize it.
obama_image = face_recognition.load_image_file("obama.jpg")
obama_face_encoding = face_recognition.face_encodings(obama_image)[0]
# Load a second sample picture and learn how to recognize it.
biden_image = face_recognition.load_image_file("biden.jpg")
biden_face_encoding = face_recognition.face_encodings(biden_image)[0]
array_variable = [obama_face_encoding,biden_face_encoding] # list of known encodings
# compare the list with the biden_face_encoding
results = face_recognition.compare_faces(array_variable, biden_face_encoding, tolerance=0.4)
print(results)
[False, True] # True means match, False mismatch
# False: coming from obama_face_encoding VS biden_face_encoding
# True: coming from biden_face_encoding VS biden_face_encoding
To run it go here: https://beta.deepnote.com/project/09705740-31c0-4d9a-8890-269ff1c3dfaf#
Documentation: https://face-recognition.readthedocs.io/en/latest/face_recognition.html
EDIT
To save the known encodings you can use numpy.save
np.save('encodings',biden_face_encoding) # save
load_again = np.load('encodings.npy') # load again
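To persist several known encodings at once, a small hedged sketch using the array_variable list from above is to stack them into one array first:
# Hedged sketch: save/load the whole list of known encodings in a single file.
known = np.stack(array_variable)                      # shape (n_people, 128)
np.save('known_encodings', known)
known_again = list(np.load('known_encodings.npy'))    # back to a list of 128-d encodings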

Error when Importing keras in embedded python in C

I'm trying to embed Python in my C application. I downloaded the package from the official Python website and managed to do a simple Hello World.
Now I want to go deeper and use some Python libraries like numpy, keras, tensorflow...
I'm working with Python 3.5.4. I installed all the needed packages on my PC with pip3:
pip3 install keras
pip3 install tensorflow
...
Then I created my script and launched it in a Python environment; it works fine:
Python:
# Importing the libraries
#
import numpy as np
import pandas as pd
dataset2 = pd.read_csv('I:\RNA\dataset19.csv')
X_test = dataset2.iloc[:, 0:228].values
y_test = dataset2.iloc[:, 228].values
# 2.
import pickle
sc = pickle.load(open('I:\RNA\isVerb_sc', 'rb'))
X_test = sc.transform(X_test)
# 3.
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
classifier = Sequential()
classifier.add(Dense(units = 114, kernel_initializer = 'uniform', activation = 'relu', input_dim = 228))
classifier.add(Dropout(p = 0.3))
classifier.add(Dense(units = 114, kernel_initializer = 'uniform', activation = 'relu'))
classifier.add(Dropout(p = 0.3))
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
classifier.load_weights('I:\RNA\isVerb_weights.h5')
y_pred = classifier.predict(X_test)
y_pred1 = (y_pred > 0.5)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred1)
But when I execute the same script in a C environment with embedded Python, it didn't work:
At first I executed my script directly with PyRun_SimpleFile with no luck, so I sliced it into multiple instructions with PyRun_SimpleString to pinpoint the problem:
C:
result = PyRun_SimpleString("import numpy as np"); // result = 0 (ok)
result = PyRun_SimpleString("import pandas as pd"); // result = 0 (ok)
...
result = PyRun_SimpleString("import pickle"); // result = 0 (ok)
... (all insctruction above works)
result = PyRun_SimpleString("import keras"); // result = -1 !!
... (all under this failed)
But there is not a single stack trace about this error. I tried this but I just got:
"Here's the output: (null)"
My initialization of Python in C seems correct since other libraries import fine:
// Python
wchar_t *stdProgramName = L"I:\\LIBs\\cpython354";
Py_SetProgramName(stdProgramName);
wchar_t *stdPythonHome = L"I:\\LIBs\\cpython354";
Py_SetPythonHome(stdPythonHome);
wchar_t *stdlib = L"I:\\LIBs\\cpython354;I:\\LIBs\\cpython354\\Lib\\python35.zip;I:\\LIBs\\cpython354\\Lib;I:\\LIBs\\cpython354\\DLLs;I:\\LIBs\\cpython354\\Lib\\site-packages";
Py_SetPath(stdlib);
// Initialize Python
Py_Initialize();
When inside a Python cmd, the line import keras takes some time (about 3 s) but works (there is a warning, but I found no harm in it):
>>> import keras
I:\LIBs\cpython354\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
Using TensorFlow backend.
>>>
I'm at a loss now; I don't know where to look since there is no stack trace.
It seems that when you import keras, it executes this line:
sys.stderr.write('Using TensorFlow backend.\n')
but sys.stderr is not defined in Python embedded on Windows.
A simple correction is to define sys.stderr, for example:
import sys

class CatchOutErr:
    def __init__(self):
        self.value = ''
    def write(self, txt):
        self.value += txt

catchOutErr = CatchOutErr()
sys.stderr = catchOutErr
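In the embedded setup from the question, this snippet can be run via PyRun_SimpleString before the import keras call; afterwards catchOutErr.value holds whatever Keras wrote to stderr (here, "Using TensorFlow backend."), and it is also a convenient place to look for later error output.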

TypeError: ufunc 'add' did not contain a loop

I use Anaconda and gdsCAD and get an error even though all packages are installed correctly.
Everything was installed as explained here: http://pythonhosted.org/gdsCAD/
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
My imports look like this (In the end I imported everything):
import numpy as np
from gdsCAD import *
import matplotlib.pyplot as plt
My example code looks like this:
something = core.Elements()
box=shapes.Box( (5,5),(1,5),0.5)
core.default_layer = 1
core.default_colors = 2
something.add(box)
something.show()
My error message looks like this:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-2f90b960c1c1> in <module>()
31 puffer_wafer = shapes.Circle((0.,0.), puffer_wafer_radius, puffer_line_thickness)
32 bp.add(puffer_wafer)
---> 33 bp.show()
34 wafer = shapes.Circle((0.,0.), wafer_radius, wafer_line_thickness)
35 bp.add(wafer)
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in _show(self)
80 ax.margins(0.1)
81
---> 82 artists=self.artist()
83 for a in artists:
84 a.set_transform(a.get_transform() + ax.transData)
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in artist(self, color)
952 art=[]
953 for p in self:
--> 954 art+=p.artist()
955 return art
956
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in artist(self, color)
475 poly = lines.buffer(self.width/2.)
476
--> 477 return [descartes.PolygonPatch(poly, lw=0, **self._layer_properties(self.layer))]
478
479
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in _layer_properties(layer)
103 # Default colors from previous versions
104 colors = ['k', 'r', 'g', 'b', 'c', 'm', 'y']
--> 105 colors += matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15))
106 color = colors[layer % len(colors)]
107 return {'color': color}
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
gdsCAD has been a pain, from the shapely install to this plotting issue.
This issue is caused by the wrong datatype being passed to the colors function. It can be solved by editing the following line in core.py:
colors += matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15))
to
colors += list(matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15)))
If you don't know where core.py is located, just type:
from gdsCAD import *
core
This will give you the path of the core.py file. Good luck!
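A small hedged alternative that also works outside the interactive prompt is to print the module's __file__ attribute:
from gdsCAD import core

# Hedged alternative: print the location of core.py directly.
print(core.__file__)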
Well first, I'd ask that you please include actual code, as the 'example code' in the file is obviously different based on the traceback. When debugging, the details matter, and I need to be able to actually run the code.
You obviously have a data type problem. Chances are pretty good it's in the variables here:
puffer_wafer = shapes.Circle((0.,0.), puffer_wafer_radius, puffer_line_thickness)
I had the same error thrown when I was running a call to Pandas. I changed the data to str(data) and the code worked.
I don't know if this helps, as I am fairly new to this myself, but I had a similar error and found that it is due to a type-casting issue, as suggested by the previous answer. I can't see from the example in the question exactly what you are trying to do. Below is a small example of my issue and solution. My code is making a simple Random Forest model using scikit-learn.
Here is an example that will give the error; it is caused by the third-to-last line, concatenating the results to write to file.
import scipy
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, metrics, cross_validation
Data = pd.read_csv("Free_Energy_exp.csv", sep=",")
Data = Data.fillna(Data.mean()) # replace the NA values with the mean of the descriptor
header = Data.columns.values # Ues the column headers as the descriptor labels
Data.head()
test_name = "Test.csv"
npArray = np.array(Data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(X,y, random_state=0)
# Predictions results initialised
RFpredictions = []
RF = RandomForestRegressor(n_estimators = 10, max_features = 5, max_depth = 5, random_state=0)
RF.fit(XTrain, yTrain) # Train the model
print("Training R2 = %5.2f" % RF.score(XTrain,yTrain))
RFpreds = RF.predict(XTest)
with open(test_name,'a') as fpred :
    lenpredictions = len(RFpreds)
    lentrue = yTest.shape[0]
    if lenpredictions == lentrue :
        fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
        for i in range(0,lenpredictions) :
            fpred.write(RFpreds[i]+",,"+yTest[i]+",\n")
    else :
        print "ERROR - names, prediction and true value array size mismatch."
This leads to an error of:
Traceback (most recent call last):
File "min_example.py", line 40, in <module>
fpred.write(RFpreds[i]+",,"+yTest[i]+",\n")
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
The solution is to wrap each variable with str() on the third-to-last line before writing to file. No other changes to the code have been made from the above.
import scipy
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, metrics, cross_validation
Data = pd.read_csv("Free_Energy_exp.csv", sep=",")
Data = Data.fillna(Data.mean()) # replace the NA values with the mean of the descriptor
header = Data.columns.values # Ues the column headers as the descriptor labels
Data.head()
test_name = "Test.csv"
npArray = np.array(Data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(X,y, random_state=0)
# Predictions results initialised
RFpredictions = []
RF = RandomForestRegressor(n_estimators = 10, max_features = 5, max_depth = 5, random_state=0)
RF.fit(XTrain, yTrain) # Train the model
print("Training R2 = %5.2f" % RF.score(XTrain,yTrain))
RFpreds = RF.predict(XTest)
with open(test_name,'a') as fpred :
    lenpredictions = len(RFpreds)
    lentrue = yTest.shape[0]
    if lenpredictions == lentrue :
        fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
        for i in range(0,lenpredictions) :
            fpred.write(str(RFpreds[i])+",,"+str(yTest[i])+",\n")
    else :
        print "ERROR - names, prediction and true value array size mismatch."
These examples are from a larger piece of code, so I hope they are clear enough.
