Numpy Stack :: all input arrays must have the same shape - arrays

I was trying to calculate the initial embeddings for the whole DataFrame, which is the first step in implementing my GNN, which is heterogeneous in nature. I used a Twitter message dataset for the task and loaded it in the following way:
# Dataset directory; the same location is used for loading and for saving.
load_path = '/Users/hemangjiwnani/Desktop/Projects/Paper1/KPGNN/datasets/Twitter/'
save_path = '/Users/hemangjiwnani/Desktop/Projects/Paper1/KPGNN/datasets/Twitter/'
# Load the dataset, which is stored as two separate .npy parts.
p_part1 = load_path + '68841_tweets_multiclasses_filtered_0722_part1.npy'
p_part2 = load_path + '68841_tweets_multiclasses_filtered_0722_part2.npy'
#"./datasets/Twitter/68841_tweets_multiclasses_filtered_0722_part1.npy"
# NOTE(review): allow_pickle=True lets np.load unpickle Python objects; only use it on trusted files.
df_np_part1 = np.load(p_part1, allow_pickle=True)
df_np_part2 = np.load(p_part2, allow_pickle=True)
Then I created a DataFrame from it with the following code:
df_np = np.concatenate((df_np_part1, df_np_part2), axis = 0) # axis=0 appends the rows of part 2 below those of part 1 (row-wise, not column-wise)
print("Loaded data.")
# Wrap the combined array in a DataFrame, naming its 16 columns.
df = pd.DataFrame(data=df_np, columns=["event_id", "tweet_id", "text", "user_id", "created_at", "user_loc",\
"place_type", "place_full_name", "place_country_code", "hashtags", "user_mentions", "image_urls", "entities",
"words", "filtered_words", "sampled_words"])
print("Data converted to dataframe.")
print(df.shape)
print(df.head(5))
This produced the following output:
Loaded data.
Data converted to dataframe.
(68841, 16)
event_id ... sampled_words
0 0 ... []
1 0 ... []
2 0 ... []
3 0 ... []
4 0 ... []
[5 rows x 16 columns]
The function below raises an error at its return statement:
def documents_to_features(df):
    """Return the spaCy document vectors of ``df.filtered_words``, stacked along a new first axis.

    Each row's word list is joined with spaces, run through the
    ``en_core_web_sm`` pipeline, and the resulting ``.vector`` is kept.

    Raises:
        ValueError: if the per-row vectors do not all have the same shape
            (this is the error reported below).
    """
    nlp = spacy.load("en_core_web_sm")
    #nlp = en_core_web_lg.load()
    features = df.filtered_words.apply(lambda x: nlp(' '.join(x)).vector).values
    # NOTE(review): presumably rows with an empty filtered_words list yield a
    # vector of a different length -- verify with {f.shape for f in features}.
    return np.stack(features, axis=0)  # <-- the ValueError is raised here
ERROR
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-2e8bc5e83009> in <module>()
----> 1 d_features = documents_to_features(df)
2 print("Document features generated.")
3 t_features = df_to_t_features(df)
4 print("Time features generated.")
5 combined_features = np.concatenate((d_features, t_features), axis=1)
1 frames
<ipython-input-9-b772a7744232> in documents_to_features(df)
3 #nlp = en_core_web_lg.load()
4 features = df.filtered_words.apply(lambda x: nlp(' '.join(x)).vector).values
----> 5 return np.stack(features, axis=0)
<__array_function__ internals> in stack(*args, **kwargs)
/usr/local/lib/python3.7/dist-packages/numpy/core/shape_base.py in stack(arrays, axis, out)
425 shapes = {arr.shape for arr in arrays}
426 if len(shapes) != 1:
--> 427 raise ValueError('all input arrays must have the same shape')
428
429 result_ndim = arrays[0].ndim + 1
ValueError: all input arrays must have the same shape

Related

Getting No such file or directory [[{{node ReadFile}}]] [[IteratorGetNext]] [Op:__inference_train_function_9137] error

This may have a simple answer, but I am currently building a neural network with Keras and ran into this problem with the following code:
EPOCHS = 50
callbacks = [
    # Multiply the learning rate by 0.1 when val_loss has not improved for 10 epochs.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='min', min_delta=0.0001),
    # Save only the weights of the best model (by val_loss) to 'weights.tf'.
    tf.keras.callbacks.ModelCheckpoint(
        'weights.tf', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True),
    # Stop when val_loss has not improved for 15 epochs and restore the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', min_delta=0, patience=15, verbose=1, restore_best_weights=True)
]
history = model.fit(
    train_ds,
    validation_data=val_ds,
    verbose=1,
    callbacks=callbacks,
    epochs=EPOCHS,
)
# Reload the checkpointed weights before the final evaluation.
model.load_weights('weights.tf')
model.evaluate(val_ds)
Output:
Epoch 1/50
NotFoundError Traceback (most recent call last)
<ipython-input-15-265d39d703c7> in <module>
10 ]
11
---> 12 history = model.fit(
13 train_ds,
14 validation_data=val_ds,
1 frames
/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
52 try:
53 ctx.ensure_initialized()
---> 54 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
55 inputs, attrs, num_outputs)
56 except core._NotOkStatusException as e:
NotFoundError: Graph execution error:
train/60377.jpg; No such file or directory
[[{{node ReadFile}}]]
[[IteratorGetNext]] [Op:__inference_train_function_9137]
Here's my data:
FairFace Dataset from Kaggle
Here's how I preprocessed (through code I borrowed) the images from the FairFace dataset.
IMG_SIZE = 224
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 224
NUM_CLASSES = len(labels_map)
# Dataset creation: one-hot encode the race labels and pair them with the file paths.
y_train = tf.keras.utils.to_categorical(train.race, num_classes=NUM_CLASSES, dtype='float32')
y_val = tf.keras.utils.to_categorical(val.race, num_classes=NUM_CLASSES, dtype='float32')
# NOTE(review): the paths in train.file look relative (the traceback shows
# 'train/60377.jpg'); they presumably must resolve from the current working
# directory or be joined with the dataset root -- verify.
train_ds = tf.data.Dataset.from_tensor_slices((train.file, y_train)).shuffle(len(y_train))
val_ds = tf.data.Dataset.from_tensor_slices((val.file, y_val))
assert len(train_ds) == len(train.file) == len(train.race)
assert len(val_ds) == len(val.file) == len(val.race)
# Helper that reads, decodes and resizes one image (not used by the pipeline below).
def map_fn(path, label):
    """Read the JPEG at ``path``, decode it and resize it to IMG_SIZE x IMG_SIZE."""
    image = tf.io.decode_jpeg(tf.io.read_file(path))
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    return image, label
# Read and decode the files
train_ds = train_ds.map(lambda path, lbl: (tf.io.decode_jpeg(tf.io.read_file(path)), lbl), num_parallel_calls=AUTOTUNE)
val_ds = val_ds.map(lambda path, lbl: (tf.io.decode_jpeg(tf.io.read_file(path)), lbl), num_parallel_calls=AUTOTUNE)
# Resize each image, then batch. The training set must be mapped from
# train_ds; it was previously built from val_ds, which replaced the training
# data with the validation data.
train_ds = train_ds.map(lambda imgs, lbls: (tf.image.resize(imgs, (IMG_SIZE, IMG_SIZE)), lbls), num_parallel_calls=AUTOTUNE)
val_ds = val_ds.map(lambda imgs, lbls: (tf.image.resize(imgs, (IMG_SIZE, IMG_SIZE)), lbls), num_parallel_calls=AUTOTUNE)
train_ds = train_ds.batch(BATCH_SIZE)
val_ds = val_ds.batch(BATCH_SIZE)
# Performance enhancement - prefetch
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)
I tried changing the jpg file name but to no avail.

Tensorflow: convert PrefetchDataset to BatchDataset

Tensorflow: convert PrefetchDataset to BatchDataset
With the latest TensorFlow version (2.3.1), I am trying to follow the basic text classification example at: https://www.tensorflow.org/tutorials/keras/text_classification. Instead of creating the dataset from a directory as the example does, I am using a CSV file:
SELECT_COLUMNS = ['SentimentText','Sentiment']
LABEL_COLUMN = 'Sentiment'
LABELS = [0, 1]
def get_dataset(file_path, **kwargs):
    """Build a dataset from the CSV file at ``file_path``.

    The column named by ``LABEL_COLUMN`` is passed as the label name. Any
    extra keyword arguments (e.g. ``select_columns``) are forwarded to
    ``tf.data.experimental.make_csv_dataset``.
    """
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=3,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset
all_data = get_dataset(data_path, select_columns=SELECT_COLUMNS)
As a result I get:
type(all_data)
tensorflow.python.data.ops.dataset_ops.PrefetchDataset
Example loads data from directory with:
batch_size = 32
seed = 42
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
'aclImdb/train',
batch_size=batch_size,
validation_split=0.2,
subset='training',
seed=seed)
And gets dataset of another type:
type(raw_train_ds)
tensorflow.python.data.ops.dataset_ops.BatchDataset
Now when I try to standardise and vectorise data with functions from example:
def custom_standardization(input_data):
    """Lowercase the text, replace ``<br />`` tags with a space and remove punctuation."""
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    # Build a character class from every ASCII punctuation character, escaped for the regex.
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(string.punctuation),
                                    '')
max_features = 10000
sequence_length = 250
vectorize_layer = TextVectorization(
standardize=custom_standardization,
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
And apply them to my dataset I get error:
# Make a text-only dataset (without labels), then call adapt
train_text = all_data.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-20-1f1fc445912d> in <module>
1 # Make a text-only dataset (without labels), then call adapt
2 train_text = all_data.map(lambda x, y: x)
----> 3 vectorize_layer.adapt(train_text)
/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/layers/preprocessing/text_vectorization.py in adapt(self, data, reset_state)
378 shape = dataset_ops.get_legacy_output_shapes(data)
379 if not isinstance(shape, tensor_shape.TensorShape):
--> 380 raise ValueError("The dataset passed to 'adapt' must contain a single "
381 "tensor value.")
382 if shape.rank == 0:
ValueError: The dataset passed to 'adapt' must contain a single tensor value.
How can I convert a PrefetchDataset to a BatchDataset?
You could use tf.stack method to pack the features into a single array. The below function is from Custom training: walkthrough in Tensorflow.
def pack_features_vector(features, labels):
    """Stack the values of the ``features`` mapping into one tensor; ``labels`` pass through unchanged."""
    features = tf.stack(list(features.values()), axis=1)
    return features, labels
all_data = get_dataset(data_path, select_columns=SELECT_COLUMNS)
train_dataset = all_data.map(pack_features_vector)
train_text = train_dataset.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

Complex number in Fenics

I am currently trying to solve a complex-valued PDE with Fenics in a jupyter notebook but I am having trouble when I try to use a complex number in Fenics.
Here is how I've defined the variational problem:
u = TrialFunction(V)
v = TestFunction(V)
a = (inner(grad(u[0]), grad(v[0])) + inner(grad(u[1]), grad(v[1])))*dx + sin(lat)*(u[0]*v[1]-u[1]*v[0])*dx+1j*((-inner(grad(u[0]), grad(v[1])) + inner(grad(u[1]), grad(v[0])))*dx + (sin(lat)*(u[0]*v[0]-u[1]*v[1])*dx))
f = Constant((1.0,1.0))
b = (v[0]*f[0]+f[1]*v[1])*ds+1j*((f[1]*v[0]-f[0]*v[1])*ds)
I got the following error message:
AttributeError Traceback (most recent call last)
<ipython-input-74-7760afa5a395> in <module>()
1 u = TrialFunction(V)
2 v = TestFunction(V)
----> 3 a = (inner(grad(u[0]), grad(v[0])) + inner(grad(u[1]), grad(v[1])))*dx + sin(lat)*(u[0]*v[1]-u[1]*v[0])*dx+1j*((-inner(grad(u[0]), grad(v[1])) + inner(grad(u[1]), grad(v[0])))*dx + (sin(lat)*(u[0]*v[0]-u[1]*v[1])*dx)
4 f = Constant((0.0,0.0))
5 b = (v[0]*f[0]+f[1]*v[1])*ds+1j*((f[1]*v[0]-f[0]*v[1])*ds)
~/anaconda3_420/lib/python3.5/site-packages/ufl/form.py in __rmul__(self, scalar)
305 "Multiply all integrals in form with constant scalar value."
306 # This enables the handy "0*form" or "dt*form" syntax
--> 307 if is_scalar_constant_expression(scalar):
308 return Form([scalar*itg for itg in self.integrals()])
309 return NotImplemented
~/anaconda3_420/lib/python3.5/site-packages/ufl/checks.py in is_scalar_constant_expression(expr)
84 if is_python_scalar(expr):
85 return True
---> 86 if expr.ufl_shape:
87 return False
88 return is_globally_constant(expr)
AttributeError: 'complex' object has no attribute 'ufl_shape'
Could someone please help me?
By the way, FEniCS might not be the best tool for solving complex-valued PDEs, so I would welcome your suggestions for such problems.

Manipulating character arrays quickly in R data.table [duplicate]

This question already has answers here:
Faster way to read fixed-width files
(4 answers)
Closed 4 years ago.
I have a huge dataset (14 GB, 200 million rows) consisting of a single character vector. I read it with fread (which took > 30 mins on a 48-core, 128 GB server). Each string contains concatenated information on various fields. For instance, the first row of my table looks like:
2014120900000001091500bbbbcompany_name00032401
where the first 8 characters represent date in YYYYMMDD format, next 8 characters are id, next 6 the time in HHMMSS format and then next 16 are name (prefixed with b's) and the last 8 are price (2 decimal places).
I need to transfer the above 1 column data.table into 5 columns: date, id, time, name, price.
For the above character vector that will turn out to be: date = "2014-12-09", id = 1, time = "09:15:00", name = "company_name", price = 324.01
I am looking for a (very) fast and efficient dplyr / data.table solution. Right now I am doing it with using substr:
date = as.Date(substr(d, 1, 8), "%Y%m%d");
and it's taking forever to execute!
Update: With readr::read_fwf I am able to read the file in 5-10 mins. Apparently, the reading is faster than fread. Below is the code:
f = "file_name";
num_cols = 5;
col_widths = c(8,8,6,16,8);
col_classes = "ciccn";
col_names = c("date", "id", "time", "name", "price");
# takes 5-10 mins
data = readr::read_fwf(file = f, col_positions = readr::fwf_widths(col_widths, col_names), col_types = col_classes, progress = T);
setDT(data);
# object.size(data) / 2^30; # 17.5 GB
A possible solution:
library(data.table)
library(stringi)
widths <- c(8,8,6,16,8)
sp <- c(1, cumsum(widths[-length(widths)]) + 1)
ep <- cumsum(widths)
DT[, lapply(seq_along(sp), function(i) stri_sub(V1, sp[i], ep[i]))]
which gives:
V1 V2 V3 V4 V5
1: 20141209 00000001 091500 bbbbcompany_name 00032401
Including some additional processing to get the desired result:
DT[, lapply(seq_along(sp), function(i) stri_sub(V1, sp[i], ep[i]))
][, .(date = as.Date(V1, "%Y%m%d"),
id = as.integer(V2),
time = as.ITime(V3, "%H%M%S"),
name = sub("^(bbbb)","",V4),
price = as.numeric(V5)/100)]
which gives:
date id time name price
1: 2014-12-09 1 09:15:00 company_name 324.01
But you are actually reading a fixed-width file, so you could also consider read.fwf from base R or read_fwf from readr, or write your own fread.fwf function like I did a while ago:
# Read a fixed-width file with fread and split every line into one column per field.
#   file   - file to read
#   widths - field widths, in order
#   enc    - encoding passed on to fread (default "UTF-8")
fread.fwf <- function(file, widths, enc = "UTF-8") {
# Start position of each field: 1, then one past the cumulative width of the preceding fields.
sp <- c(1, cumsum(widths[-length(widths)]) + 1)
# End position of each field: the cumulative width up to and including that field.
ep <- cumsum(widths)
# NOTE(review): sep = "\n" presumably makes fread return each whole line in a single column (V1) - confirm.
# stri_sub then cuts field i out of V1 using sp[i] and ep[i].
fread(file = file, header = FALSE, sep = "\n", encoding = enc)[, lapply(seq_along(sp), function(i) stri_sub(V1, sp[i], ep[i]))]
}
Used data:
DT <- data.table(V1 = "2014120900000001091500bbbbcompany_name00032401")
Maybe your solution is not so bad.
I am using this data:
df <- data.table(text = rep("2014120900000001091500bbbbcompany_name00032401", 100000))
Your solution:
> system.time(df[, .(date = as.Date(substr(text, 1, 8), "%Y%m%d"),
+ id = as.integer(substr(text, 9, 16)),
+ time = substr(text, 17, 22),
+ name = substr(text, 23, 38),
+ price = as.numeric(substr(text, 39, 46))/100)])
user system elapsed
0.17 0.00 0.17
#Jaap solution:
> library(data.table)
> library(stringi)
>
> widths <- c(8,8,6,16,8)
> sp <- c(1, cumsum(widths[-length(widths)]) + 1)
> ep <- cumsum(widths)
>
> system.time(df[, lapply(seq_along(sp), function(i) stri_sub(text, sp[i], ep[i]))
+ ][, .(date = as.Date(V1, "%Y%m%d"),
+ id = as.integer(V2),
+ time = V3,
+ name = sub("^(bbbb)","",V4),
+ price = as.numeric(V5)/100)])
user system elapsed
0.20 0.00 0.21
An attempt with read.fwf:
> setClass("myDate")
> setAs("character","myDate", function(from) as.Date(from, format = "%Y%m%d"))
> setClass("myNumeric")
> setAs("character","myNumeric", function(from) as.numeric(from)/100)
>
> ff <- function(x) {
+ file <- textConnection(x)
+ read.fwf(file, c(8, 8, 6, 16, 8),
+ col.names = c("date", "id", "time", "name", "price"),
+ colClasses = c("myDate", "integer", "character", "character", "myNumeric"))
+ }
>
> system.time(df[, as.list(ff(text))])
user system elapsed
2.33 6.15 8.49
All outputs are the same.
Maybe try using matrix with numeric instead of data.frame. Aggregation should take less time.

TypeError: ufunc 'add' did not contain a loop

I use Anaconda and gdsCAD and get an error even though all packages are installed correctly.
Like explained here: http://pythonhosted.org/gdsCAD/
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
My imports look like this (In the end I imported everything):
import numpy as np
from gdsCAD import *
import matplotlib.pyplot as plt
My example code looks like this:
something = core.Elements()
box=shapes.Box( (5,5),(1,5),0.5)
core.default_layer = 1
core.default_colors = 2
something.add(box)
something.show()
My error message looks like this:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-2f90b960c1c1> in <module>()
31 puffer_wafer = shapes.Circle((0.,0.), puffer_wafer_radius, puffer_line_thickness)
32 bp.add(puffer_wafer)
---> 33 bp.show()
34 wafer = shapes.Circle((0.,0.), wafer_radius, wafer_line_thickness)
35 bp.add(wafer)
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in _show(self)
80 ax.margins(0.1)
81
---> 82 artists=self.artist()
83 for a in artists:
84 a.set_transform(a.get_transform() + ax.transData)
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in artist(self, color)
952 art=[]
953 for p in self:
--> 954 art+=p.artist()
955 return art
956
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in artist(self, color)
475 poly = lines.buffer(self.width/2.)
476
--> 477 return [descartes.PolygonPatch(poly, lw=0, **self._layer_properties(self.layer))]
478
479
C:\Users\rpilz\AppData\Local\Continuum\Anaconda2\lib\site-packages\gdscad-0.4.5-py2.7.egg\gdsCAD\core.pyc in _layer_properties(layer)
103 # Default colors from previous versions
104 colors = ['k', 'r', 'g', 'b', 'c', 'm', 'y']
--> 105 colors += matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15))
106 color = colors[layer % len(colors)]
107 return {'color': color}
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
The gdsCAD has been a pain from shapely install to this plotting issue.
This issue is caused by a data type mismatch when the colormap array is added to the colors list. It can be solved by editing the following line in core.py:
colors += matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15))
to
colors += list(matplotlib.cm.gist_ncar(np.linspace(0.98, 0, 15)))
If you don't know where core.py is located, just type:
from gdsCAD import *
core
This will give you the path of core.py file. Good luck !
Well first, I'd ask that you please include actual code, as the 'example code' in the file is obviously different based on the traceback. When debugging, the details matter, and I need to be able to actually run the code.
You obviously have a data type problem. Chances are pretty good it's in the variables here:
puffer_wafer = shapes.Circle((0.,0.), puffer_wafer_radius, puffer_line_thickness)
I had the same error thrown when I was running a call to Pandas. I changed the data to str(data) and the code worked.
I don't know if this helps, as I am fairly new to this myself, but I had a similar error and found that it is due to a type-casting issue, as suggested by a previous answer. I can't see from the example in the question exactly what you are trying to do. Below is a small example of my issue and its solution. My code builds a simple Random Forest model using scikit-learn.
Here is an example that will give the error and it is caused by the third to last line, concatenating the results to write to file.
import scipy
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, metrics, cross_validation
Data = pd.read_csv("Free_Energy_exp.csv", sep=",")
Data = Data.fillna(Data.mean()) # replace the NA values with the mean of the descriptor
header = Data.columns.values # Ues the column headers as the descriptor labels
Data.head()
test_name = "Test.csv"
npArray = np.array(Data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(X,y, random_state=0)
# Predictions results initialised
RFpredictions = []
RF = RandomForestRegressor(n_estimators = 10, max_features = 5, max_depth = 5, random_state=0)
RF.fit(XTrain, yTrain) # Train the model
print("Training R2 = %5.2f" % RF.score(XTrain,yTrain))
RFpreds = RF.predict(XTest)
with open(test_name,'a') as fpred :
lenpredictions = len(RFpreds)
lentrue = yTest.shape[0]
if lenpredictions == lentrue :
fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
for i in range(0,lenpredictions) :
fpred.write(RFpreds[i]+",,"+yTest[i]+",\n")
else :
print "ERROR - names, prediction and true value array size mismatch."
This leads to an error of;
Traceback (most recent call last):
File "min_example.py", line 40, in <module>
fpred.write(RFpreds[i]+",,"+yTest[i]+",\n")
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
The solution is to convert each variable to a string with str() on the third-to-last line before writing it to the file. No other changes to the code have been made from the above.
import scipy
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, metrics, cross_validation
Data = pd.read_csv("Free_Energy_exp.csv", sep=",")
Data = Data.fillna(Data.mean()) # replace the NA values with the mean of the descriptor
header = Data.columns.values # Ues the column headers as the descriptor labels
Data.head()
test_name = "Test.csv"
npArray = np.array(Data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(X,y, random_state=0)
# Predictions results initialised
RFpredictions = []
RF = RandomForestRegressor(n_estimators = 10, max_features = 5, max_depth = 5, random_state=0)
RF.fit(XTrain, yTrain) # Train the model
print("Training R2 = %5.2f" % RF.score(XTrain,yTrain))
RFpreds = RF.predict(XTest)
with open(test_name,'a') as fpred :
lenpredictions = len(RFpreds)
lentrue = yTest.shape[0]
if lenpredictions == lentrue :
fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
for i in range(0,lenpredictions) :
fpred.write(str(RFpreds[i])+",,"+str(yTest[i])+",\n")
else :
print "ERROR - names, prediction and true value array size mismatch."
These examples are from a larger code so I hope the examples are clear enough.

Resources