Tensorflow: convert PrefetchDataset to BatchDataset
With latest Tensorflow version 2.3.1I am trying to follow basic text classification example at: https://www.tensorflow.org/tutorials/keras/text_classification. Instead of creating dataset from directory as example does, I am using a csv file:
SELECT_COLUMNS = ['SentimentText','Sentiment']
LABEL_COLUMN = 'Sentiment'
LABELS = [0, 1]
def get_dataset(file_path, **kwargs):
dataset = tf.data.experimental.make_csv_dataset(
file_path,
batch_size=3, # Artificially small to make examples easier to show.
label_name=LABEL_COLUMN,
na_value="?",
num_epochs=1,
ignore_errors=True,
**kwargs)
return dataset
all_data = get_dataset(data_path, select_columns=SELECT_COLUMNS)
As a result I get:
type(all_data)
tensorflow.python.data.ops.dataset_ops.PrefetchDataset
Example loads data from directory with:
batch_size = 32
seed = 42
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
'aclImdb/train',
batch_size=batch_size,
validation_split=0.2,
subset='training',
seed=seed)
And gets dataset of another type:
type(raw_train_ds)
tensorflow.python.data.ops.dataset_ops.BatchDataset
Now when I try to standardise and vectorise data with functions from example:
def custom_standardization(input_data):
lowercase = tf.strings.lower(input_data)
stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
return tf.strings.regex_replace(stripped_html,
'[%s]' % re.escape(string.punctuation),
'')
max_features = 10000
sequence_length = 250
vectorize_layer = TextVectorization(
standardize=custom_standardization,
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
And apply them to my dataset I get error:
# Make a text-only dataset (without labels), then call adapt
train_text = all_data.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-20-1f1fc445912d> in <module>
1 # Make a text-only dataset (without labels), then call adapt
2 train_text = all_data.map(lambda x, y: x)
----> 3 vectorize_layer.adapt(train_text)
/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/layers/preprocessing/text_vectorization.py in adapt(self, data, reset_state)
378 shape = dataset_ops.get_legacy_output_shapes(data)
379 if not isinstance(shape, tensor_shape.TensorShape):
--> 380 raise ValueError("The dataset passed to 'adapt' must contain a single "
381 "tensor value.")
382 if shape.rank == 0:
ValueError: The dataset passed to 'adapt' must contain a single tensor value.
How to convert PrefetchDataset to BatchDataset ?
You could use tf.stack method to pack the features into a single array. The below function is from Custom training: walkthrough in Tensorflow.
def pack_features_vector(features, labels):
features = tf.stack(list(features.values()), axis=1)
return features, labels
all_data = get_dataset(data_path, select_columns=SELECT_COLUMNS)
train_dataset = all_data.map(pack_features_vector)
train_text = train_dataset.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
Related
I'm trying to load the Cornell dataset from PyTorch Geometric to train my Graph Neural Network. I want to apply a mask but I achieve this error (also on Chameleon, Wisconsin, Texas datasets). My Dataset class works perfectly with all the datasets of Planetoid that are mono dimensional tensors, probable bidimensional tensors give problem.
I insert my code that can be ruined on Colab without problems.
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cu113.html
import torch_geometric
from torch_geometric.datasets import Planetoid, WebKB
from torch_geometric.utils import to_dense_adj, to_undirected, remove_self_loops
class Dataset(object):
def __init__(self, name):
super(Dataset, self).__init__()
self.name = name
if (name == 'Cora'):
dataset = Planetoid(root='/tmp/Cora', name='Cora', split="full")
if(name == 'Citeseer'):
dataset = Planetoid(root='/tmp/Cora', name='Citeseer', split="full")
if(name == 'PubMed'):
dataset = Planetoid(root='/tmp/Cora', name='Pubmed', split="full")
if(name == 'Cornell'):
dataset = WebKB(root='/tmp/WebKB', name='Cornell')
self.data = dataset[0]
print(self.data)
self.train_mask = self.data.train_mask
self.valid_mask = self.data.val_mask
self.test_mask = self.data.test_mask
def train_val_test_split(self):
train_x = self.data.x[self.data.train_mask]
train_y = self.data.y[self.data.train_mask]
valid_x = self.data.x[self.data.val_mask]
valid_y = self.data.y[self.data.val_mask]
test_x = self.data.x[self.data.test_mask]
test_y = self.data.y[self.data.test_mask]
return train_x, train_y, valid_x, valid_y, test_x, test_y
def get_fullx(self):
return self.data.x
def get_edge_index(self):
return self.data.edge_index
def get_adjacency_matrix(self):
# We will ignore this for the first part
adj = to_dense_adj(self.data.edge_index)[0]
return adj
The error that I achieve is in the title and is obtained in this snippet:
cornell_dataset = Dataset(name = 'Cornell')
train_x, train_y, valid_x, valid_y, test_x, test_y = cornell_dataset.train_val_test_split()
# check and confirm our data shapes match our expectations
print(f"Train shape x: {train_x.shape}, y: {train_y.shape}")
print(f"Val shape x: {valid_x.shape}, y: {valid_y.shape}")
print(f"Test shape x: {test_x.shape}, y: {test_y.shape}")
I want to convert a variable to a string and then to an array that I can use to compare, but i dont know how to do that.
my code:
import face_recognition
import numpy as np
a = face_recognition.load_image_file('C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191115_10_32_24_Pro.jpg') # my picture 1
b = face_recognition.load_image_file('C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191115_09_48_56_Pro.jpg') # my picture 2
c = face_recognition.load_image_file(
'C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191115_09_48_52_Pro.jpg') # my picture 3
d = face_recognition.load_image_file('C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\ziv sion.jpg') # my picture 4
e = face_recognition.load_image_file(
'C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191120_17_46_40_Pro.jpg') # my picture 5
f = face_recognition.load_image_file(
'C:\\Users\zivsi\OneDrive\תמונות\סרט צילום\WIN_20191117_16_19_11_Pro.jpg') # my picture 6
a = face_recognition.face_encodings(a)[0]
b = face_recognition.face_encodings(b)[0]
c = face_recognition.face_encodings(c)[0]
d = face_recognition.face_encodings(d)[0]
e = face_recognition.face_encodings(e)[0]
f = face_recognition.face_encodings(f)[0]
Here I tried to convert the variable to a string
str_variable = str(a)
array_variable = np.array(str_variable)
my_face = a, b, c, d, e, f, array_variable
while True:
new = input('path: ')
print('Recognizing...')
unknown = face_recognition.load_image_file(new)
unknown_encodings = face_recognition.face_encodings(unknown)[0]
The program cannot use the variable:
results = face_recognition.compare_faces(array_variable, unknown_encodings, tolerance=0.4)
print(results)
recognize_times = int(results.count(True))
if (3 <= recognize_times):
print('hello boss!')
my_face = *my_face, unknown_encodings
please help me
The error shown:
Traceback (most recent call last):
File "C:/Users/zivsi/PycharmProjects/AI/pytt.py", line 37, in <module>
results = face_recognition.compare_faces(my_face, unknown_encodings, tolerance=0.4)
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-
packages\face_recognition\api.py", line 222, in compare_faces
return list(face_distance(known_face_encodings, face_encoding_to_check) <= tolerance)
File "C:\Users\zivsi\AppData\Local\Programs\Python\Python36\lib\site-packages\face_recognition\api.py", line 72, in face_distance
return np.linalg.norm(face_encodings - face_to_compare, axis=1)
ValueError: operands could not be broadcast together with shapes (7,) (128,)
First of all, the array_variable should actually be a list of the known encodings and not a numpy array.
Also you do not need str.
Now, in your case, if the input images i.e., a,b,c,d,f,e do NOT have the same dimensions, the error will persist. You can not compare images that have different sizes using this function. The reason is that the comparison is based on the distance and distance is defined on vectors of the same length.
Here is a working simple example using the photos from https://github.com/ageitgey/face_recognition/tree/master/examples:
import face_recognition
import numpy as np
from PIL import Image, ImageDraw
from IPython.display import display
# Load a sample picture and learn how to recognize it.
obama_image = face_recognition.load_image_file("obama.jpg")
obama_face_encoding = face_recognition.face_encodings(obama_image)[0]
# Load a second sample picture and learn how to recognize it.
biden_image = face_recognition.load_image_file("biden.jpg")
biden_face_encoding = face_recognition.face_encodings(biden_image)[0]
array_variable = [obama_face_encoding,biden_face_encoding] # list of known encodings
# compare the list with the biden_face_encoding
results = face_recognition.compare_faces(array_variable, biden_face_encoding, tolerance=0.4)
print(results)
[False, True] # True means match, False mismatch
# False: coming from obama_face_encoding VS biden_face_encoding
# True: coming from biden_face_encoding VS biden_face_encoding
To run it go here: https://beta.deepnote.com/project/09705740-31c0-4d9a-8890-269ff1c3dfaf#
Documentation: https://face-recognition.readthedocs.io/en/latest/face_recognition.html
EDIT
To save the known encodings you can use numpy.save
np.save('encodings',biden_face_encoding) # save
load_again = np.load('encodings.npy') # load again
I'm adapting a script.py to achieve transfer learning. I find many script to retrain a model by TFRecord files, but none of them worked for me bacause of something about TF2.0 and contrib, so I'm trying to convert a script to adapt to TF2 and to my model.
This is my script at the moment:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
keras = tf.keras
EPOCHS = 1
# Data preprocessing
import pathlib
#data_dir = tf.keras.utils.get_file(origin="/home/pi/venv/raccoon_dataset/", fname="raccoons_dataset")
#data_dir = pathlib.Path(data_dir)
data_dir = "/home/pi/.keras/datasets/ssd_mobilenet_v1_coco_2018_01_28/saved_model/saved_model.pb"
######################
# Read the TFRecords #
######################
def imgs_input_fn(filenames, perform_shuffle=False, repeat_count=1, batch_size=1):
def _parse_function(serialized):
features = \
{
'image': tf.io.FixedLenFeature([], tf.string),
'label': tf.io.FixedLenFeature([], tf.int64)
}
# Parse the serialized data so we get a dict with our data.
parsed_example = tf.io.parse_single_example(serialized=serialized,
features=features)
print("\nParsed example:\n", parsed_example, "\nEnd of parsed example:\n")
# Get the image as raw bytes.
image_shape = tf.stack([300, 300, 3])
image_raw = parsed_example['image']
label = tf.cast(parsed_example['label'], tf.float32)
# Decode the raw bytes so it becomes a tensor with type.
image = tf.io.decode_raw(image_raw, tf.uint8)
image = tf.cast(image, tf.float32)
image = tf.reshape(image, image_shape)
#image = tf.subtract(image, 116.779) # Zero-center by mean pixel
#image = tf.reverse(image, axis=[2]) # 'RGB'->'BGR'
d = dict(zip(["image"], [image])), [label]
return d
dataset = tf.data.TFRecordDataset(filenames=filenames)
# Parse the serialized data in the TFRecords files.
# This returns TensorFlow tensors for the image and labels.
#print("\nDataset before parsing:\n",dataset,"\n")
dataset = dataset.map(_parse_function)
#print("\nDataset after parsing:\n",dataset,"\n")
if perform_shuffle:
# Randomizes input using a window of 256 elements (read into memory)
dataset = dataset.shuffle(buffer_size=256)
dataset = dataset.repeat(repeat_count) # Repeats dataset this # times
dataset = dataset.batch(batch_size) # Batch size to use
print("\nDataset batched:\n", dataset, "\nEnd dataset\n")
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
print("\nIterator shape:\n", tf.compat.v1.data.get_output_shapes(iterator),"\nEnd\n")
#print("\nIterator:\n",iterator.get_next(),"\nEnd Iterator\n")
batch_features, batch_labels = iterator.get_next()
return batch_features, batch_labels
raw_train = tf.compat.v1.estimator.TrainSpec(input_fn=imgs_input_fn(
"/home/pi/venv/raccoon_dataset/data/train.record",
perform_shuffle=True,
repeat_count=5,
batch_size=20),
max_steps=1)
and this is the resulting screen:
Parsed example:
{'image': <tf.Tensor 'ParseSingleExample/ParseSingleExample:0' shape=() dtype=string>, 'label': <tf.Tensor 'ParseSingleExample/ParseSingleExample:1' shape=() dtype=int64>}
End of parsed example:
Dataset batched:
<BatchDataset shapes: ({image: (None, 300, 300, 3)}, (None, 1)), types: ({image: tf.float32}, tf.float32)>
End dataset
Iterator shape:
({'image': TensorShape([None, 300, 300, 3])}, TensorShape([None, 1]))
End
2019-11-20 14:01:14.493817: W tensorflow/core/framework/op_kernel.cc:1622] OP_REQUIRES failed at example_parsing_ops.cc:240 : Invalid argument: Feature: image (data type: string) is required but could not be found.
2019-11-20 14:01:14.495019: W tensorflow/core/framework/op_kernel.cc:1622] OP_REQUIRES failed at iterator_ops.cc:929 : Invalid argument: {{function_node __inference_Dataset_map__parse_function_27}} Feature: image (data type: string) is required but could not be found.
[[{{node ParseSingleExample/ParseSingleExample}}]]
Traceback (most recent call last):
File "transfer_learning.py", line 127, in <module>
batch_size=20),
File "transfer_learning.py", line 107, in imgs_input_fn
batch_features, batch_labels = iterator.get_next()
File "/home/pi/venv/lib/python3.7/site-packages/tensorflow_core/python/data/ops/iterator_ops.py", line 737, in get_next
return self._next_internal()
File "/home/pi/venv/lib/python3.7/site-packages/tensorflow_core/python/data/ops/iterator_ops.py", line 651, in _next_internal
output_shapes=self._flat_output_shapes)
File "/home/pi/venv/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_dataset_ops.py", line 2673, in iterator_get_next_sync
_six.raise_from(_core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: {{function_node __inference_Dataset_map__parse_function_27}} Feature: image (data type: string) is required but could not be found.
[[{{node ParseSingleExample/ParseSingleExample}}]] [Op:IteratorGetNextSync]
I don't know what I'm doing wrong.
I have used the example described here (http://openmdao.readthedocs.org/en/1.5.0/usr-guide/tutorials/doe-drivers.html?highlight=driver) to show my problem. I want to use the same approach for one component were "params" are array and no longer float . See example below
from openmdao.api import IndepVarComp, Group, Problem, ScipyOptimizer, ExecComp, DumpRecorder, Component
from openmdao.drivers.latinhypercube_driver import LatinHypercubeDriver, OptimizedLatinHypercubeDriver
import numpy as np
class Paraboloid(Component):
""" Evaluates the equation f(x,y) = (x-3)^2 + xy + (y+4)^2 - 3 """
def __init__(self):
super(Paraboloid, self).__init__()
self.add_param('x', val=0.0)
self.add_param('y', val=0.0)
self.add_output('f_xy', val=0.0)
def solve_nonlinear(self, params, unknowns, resids):
"""f(x,y) = (x-3)^2 + xy + (y+4)^2 - 3
"""
x = params['x']
y = params['y']
unknowns['f_xy'] = (x-3.0)**2 + x*y + (y+4.0)**2 - 3.0
def linearize(self, params, unknowns, resids):
#""" Jacobian for our paraboloid."""
x = params['x']
y = params['y']
J = {}
J['f_xy', 'x'] = 2.0*x - 6.0 + y
J['f_xy', 'y'] = 2.0*y + 8.0 + x
return J
class ParaboloidArray(Component):
""" Evaluates the equation f(x,y) = (x-3)^2 + xy + (y+4)^2 - 3 """
def __init__(self):
super(ParaboloidArray, self).__init__()
self.add_param('X', val=np.array([0., 0.]))
self.add_output('f_xy', val=0.0)
def solve_nonlinear(self, params, unknowns, resids):
"""f(x,y) = (x-3)^2 + xy + (y+4)^2 - 3
"""
x = params['X'][0]
y = params['y'][1]
unknowns['f_xy'] = (x-3.0)**2 + x*y + (y+4.0)**2 - 3.0
top = Problem()
root = top.root = Group()
root.add('p1', IndepVarComp('x', 50.0), promotes=['*'])
root.add('p2', IndepVarComp('y', 50.0), promotes=['*'])
root.add('comp', Paraboloid(), promotes=['*'])
top.driver = OptimizedLatinHypercubeDriver(num_samples=4, seed=0, population=20, generations=4, norm_method=2)
top.driver.add_desvar('x', lower=-50.0, upper=50.0)
top.driver.add_desvar('y', lower=-50.0, upper=50.0)
top.driver.add_objective('f_xy')
top.setup()
top.run()
top.cleanup()
###########################
print("case float ok")
top = Problem()
root = top.root = Group()
root.add('p1', IndepVarComp('X', np.array([50., 50.])), promotes=['*'])
root.add('comp', ParaboloidArray(), promotes=['*'])
top.driver = OptimizedLatinHypercubeDriver(num_samples=4, seed=0, population=20, generations=4, norm_method=2)
top.driver.add_desvar('X', lower=np.array([-50., -50.]), upper=np.array([50., 50.]))
top.driver.add_objective('f_xy')
top.setup()
top.run()
top.cleanup()
I obtain the following error :
Traceback (most recent call last):
File "C:\Program Files (x86)\Wing IDE 101 5.0\src\debug\tserver\_sandbox.py", line 102, in <module>
File "D:\tlefeb\Anaconda2\Lib\site-packages\openmdao\core\problem.py", line 1038, in run
self.driver.run(self)
File "D:\tlefeb\Anaconda2\Lib\site-packages\openmdao\drivers\predeterminedruns_driver.py", line 108, in run
for run in runlist:
File "D:\tlefeb\Anaconda2\Lib\site-packages\openmdao\drivers\latinhypercube_driver.py", line 57, in _build_runlist
design_var_buckets = self._get_buckets(bounds['lower'], bounds['upper'])
File "D:\tlefeb\Anaconda2\Lib\site-packages\openmdao\drivers\latinhypercube_driver.py", line 101, in _get_buckets
bucket_walls = np.linspace(low, high, self.num_samples + 1)
File "D:\tlefeb\Anaconda2\Lib\site-packages\numpy\core\function_base.py", line 102, in linspace
if step == 0:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Did I misunderstood something in my way of coding ?
I get a different error than you, using the the latest OpenMDAO master, but I get an error non-the-less. There isn't anything wrong with the mode, but rather there are some bugs with using array variables for DOEs. I've added a bug-fix story to the OpenMDAO backlog, which we'll hopefully be able to deal with in the next couple weeks. We'd gladly accept a pull request if you develop a fix before we get to it though.
I use Anaconda and gdsCAD and get an error when all packages are installed correctly.
Like explained here: http://pythonhosted.org/gdsCAD/
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
My imports look like this (In the end I imported everything):
import numpy as np
from gdsCAD import *
import matplotlib.pyplot as plt
My example code looks like this:
something = core.Elements()
box=shapes.Box( (5,5),(1,5),0.5)
core.default_layer = 1
core.default_colors = 2
something.add(box)
something.show()
My error message looks like this:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-2f90b960c1c1> in <module>()
31 puffer_wafer = shapes.Circle((0.,0.), puffer_wafer_radius, puffer_line_thickness)
32 bp.add(puffer_wafer)
---> 33 bp.show()
34 wafer = shapes.Circle((0.,0.), wafer_radius, wafer_line_thickness)
35 bp.add(wafer)
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in _show(self)
80 ax.margins(0.1)
81
---> 82 artists=self.artist()
83 for a in artists:
84 a.set_transform(a.get_transform() + ax.transData)
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in artist(self, color)
952 art=[]
953 for p in self:
--> 954 art+=p.artist()
955 return art
956
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in artist(self, color)
475 poly = lines.buffer(self.width/2.)
476
--> 477 return [descartes.PolygonPatch(poly, lw=0, **self._layer_properties(self.layer))]
478
479
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in _layer_properties(layer)
103 # Default colors from previous versions
104 colors = ['k', 'r', 'g', 'b', 'c', 'm', 'y']
--> 105 colors += matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15))
106 color = colors[layer % len(colors)]
107 return {'color': color}
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
The gdsCAD has been a pain from shapely install to this plotting issue.
This issue is because of wrong datatype being passed to colors function. It can be solved by editing the following line in core.py
colors += matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15))
to
colors += list(matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15)))
If you dont know where the core.py is located. Just type in:
from gdsCAD import *
core
This will give you the path of core.py file. Good luck !
Well first, I'd ask that you please include actual code, as the 'example code' in the file is obviously different based on the traceback. When debugging, the details matter, and I need to be able to actually run the code.
You obviously have a data type problem. Chances are pretty good it's in the variables here:
puffer_wafer = shapes.Circle((0.,0.), puffer_wafer_radius, puffer_line_thickness)
I had the same error thrown when I was running a call to Pandas. I changed the data to str(data) and the code worked.
I don't know if this helps I am fairly new to this myself, but I had a similar error and found that it is due to a type casting issue as suggested by previous answer. I can't see from the example in the question exactly what you are trying to do. Below is a small example of my issue and solution. My code is making a simple Random Forest model using scikit learn.
Here is an example that will give the error and it is caused by the third to last line, concatenating the results to write to file.
import scipy
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, metrics, cross_validation
Data = pd.read_csv("Free_Energy_exp.csv", sep=",")
Data = Data.fillna(Data.mean()) # replace the NA values with the mean of the descriptor
header = Data.columns.values # Ues the column headers as the descriptor labels
Data.head()
test_name = "Test.csv"
npArray = np.array(Data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(X,y, random_state=0)
# Predictions results initialised
RFpredictions = []
RF = RandomForestRegressor(n_estimators = 10, max_features = 5, max_depth = 5, random_state=0)
RF.fit(XTrain, yTrain) # Train the model
print("Training R2 = %5.2f" % RF.score(XTrain,yTrain))
RFpreds = RF.predict(XTest)
with open(test_name,'a') as fpred :
lenpredictions = len(RFpreds)
lentrue = yTest.shape[0]
if lenpredictions == lentrue :
fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
for i in range(0,lenpredictions) :
fpred.write(RFpreds[i]+",,"+yTest[i]+",\n")
else :
print "ERROR - names, prediction and true value array size mismatch."
This leads to an error of;
Traceback (most recent call last):
File "min_example.py", line 40, in <module>
fpred.write(RFpreds[i]+",,"+yTest[i]+",\n")
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
The solution is to make each variable a str() type on the third to last line then write to file. No other changes to then code have been made from the above.
import scipy
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, metrics, cross_validation
Data = pd.read_csv("Free_Energy_exp.csv", sep=",")
Data = Data.fillna(Data.mean()) # replace the NA values with the mean of the descriptor
header = Data.columns.values # Ues the column headers as the descriptor labels
Data.head()
test_name = "Test.csv"
npArray = np.array(Data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(X,y, random_state=0)
# Predictions results initialised
RFpredictions = []
RF = RandomForestRegressor(n_estimators = 10, max_features = 5, max_depth = 5, random_state=0)
RF.fit(XTrain, yTrain) # Train the model
print("Training R2 = %5.2f" % RF.score(XTrain,yTrain))
RFpreds = RF.predict(XTest)
with open(test_name,'a') as fpred :
lenpredictions = len(RFpreds)
lentrue = yTest.shape[0]
if lenpredictions == lentrue :
fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
for i in range(0,lenpredictions) :
fpred.write(str(RFpreds[i])+",,"+str(yTest[i])+",\n")
else :
print "ERROR - names, prediction and true value array size mismatch."
These examples are from a larger code so I hope the examples are clear enough.