I am trying to partition a 2D numpy array into 2 separate numpy arrays based on the contents
of a particular column. This is my code:
import numpy as np
import pandas as pd

@profile
def partition_data(arr, target_colm):
    total_colms = arr.shape[1]
    target_data = arr[:, target_colm]
    type1_data = []
    type2_data = []
    for i in range(arr.shape[0]):
        if target_data[i] == 0:  # if value == 0, put the row in the first array
            type1_data = np.append(type1_data, arr[i])
        else:
            type2_data = np.append(type2_data, arr[i])
    type1_data = np.array(type1_data).reshape(int(len(type1_data)/total_colms), total_colms)
    type2_data = np.array(type2_data).reshape(int(len(type2_data)/total_colms), total_colms)
    return type1_data, type2_data

d = pd.read_csv('data.csv').values
x, y = partition_data(d, 7)  # check values of the 7th column
Note: For my experiment, I have used an array of shape (14359, 42).
Now, when I profile this function with the kernprof line profiler, I get the following results.
Wrote profile results to code.py.lprof
Timer unit: 1e-06 s
Total time: 7.3484 s
File: code2.py
Function: part_data at line 8
Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     8                                           @profile
     9                                           def part_data(arr,target_col):
    10         1          7.0      7.0      0.0      total_colms = arr.shape[1]
    11         1         14.0     14.0      0.0      target_data = arr[:,target_col]
    12         1          2.0      2.0      0.0      type1_data = []
    13         1          1.0      1.0      0.0      type2_data = []
    14      5161      40173.0      7.8      0.5      for i in range(arr.shape[0]):
    15      5160      39225.0      7.6      0.5          if target_data[i]==6:
    16      4882    7231260.0   1481.2     98.4              type1_data = np.append(type1_data,arr[i])
    17                                                   else:
    18       278      33915.0    122.0      0.5              type2_data = np.append(type2_data,arr[i])
    19         1       3610.0   3610.0      0.0      type1_data = np.array(type1_data).reshape(int(len(type1_data)/total_colms),total_colms)
    20         1        187.0    187.0      0.0      type2_data = np.array(type2_data).reshape(int(len(type2_data)/total_colms),total_colms)
    21         1          3.0      3.0      0.0      return type1_data, type2_data
Here, line 16 alone accounts for 98.4% of the total time. In the future, the real data I will work with will be much bigger.
Can anyone suggest a faster method of partitioning a numpy array?
This should make it a lot faster:
def partition_data_vectorized(arr, target_colm):
    total_colms = arr.shape[1]
    target_data = arr[:, target_colm]
    mask = target_data == 0
    type1_data = arr[mask, :]
    type2_data = arr[~mask, :]
    return (
        type1_data.reshape(int(type1_data.size / total_colms), total_colms),
        type2_data.reshape(int(type2_data.size / total_colms), total_colms))
Some timings:
# Generate some sample inputs:
arr = np.random.rand(10000, 42)
arr[:, 7] = np.random.randint(0, 10, 10000)
%timeit c, d = partition_data_vectorized(arr, 7)
# 2.09 ms ± 200 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit a, b = partition_data(arr, 7)
# 4.07 s ± 102 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
This is 2000 times faster than the non-vectorized calculation!
Comparing the results:
np.all(b == d)
# Out: True
np.all(a == c)
# Out: True
So the results are identical, and the function is roughly 2000 times faster simply from replacing the for-loop and the repeated np.append-based array creation with vectorized operations.
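As a further simplification (a sketch, not part of the original answer): boolean-mask indexing along the first axis already returns 2-D arrays, so the reshape calls can be dropped entirely:
def partition_data_simple(arr, target_colm):
    # boolean row mask; arr[mask] keeps all columns, so no reshape is needed
    mask = arr[:, target_colm] == 0
    return arr[mask], arr[~mask]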
Related
We have to perform the following operation around 400,000 times so I'm searching for the most efficient solution. I have tried several things but I'm curious whether there are even better approaches :)
Data example
We can use the following code to generate an example test set
import random
import numpy as np

random.seed(10)
np.random.seed(10)

def test_str():
    n = 10000000
    arr = np.random.randint(10000, size=n)
    sign = np.random.choice(['+', '-'], size=n)
    return 'ID1' + '\t' + ' '.join(["{}{}".format(a, b) for a, b in zip(arr, sign)])
Which looks like ID1\t7688+ 737+ 677+ 1508- 9251-......
The code where it is all about :)
Copy the code from Google Colab (P.S. running it there gave me a TypingError, whereas it ran fine on my machine), or just see the functions below.
General function
From this Numba issue, but based on @armamut's answer this may introduce a lot of overhead with Numba, making native NumPy apparently faster.
import numba as nb

@nb.jit(nopython=True)
def str_to_int(s):
    final_index, result = len(s) - 1, 0
    for i, v in enumerate(s):
        result += (ord(v) - 48) * (10 ** (final_index - i))
    return result
Approach 1
@nb.jit(nopython=True)
def process_number(numb, identifier, i):
    sign = 1 if numb[-1] == '+' else -1
    return str_to_int(numb[:-1]), sign, i, identifier

@nb.jit(nopython=True)
def expand1(data):
    identifier, l = data.split('\t')
    identifier = str_to_int(identifier[-1])
    numbers = l.split()
    # init empty numpy array
    arr = np.empty(shape=(len(numbers), 4), dtype=np.int64)
    # fill the array row by row
    for i, numb in enumerate(numbers):
        arr[i, :] = process_number(numb, identifier, i)
    return arr
Approach 2
@nb.jit(nopython=True)
def expand2(data):
    identifier, l = data.split('\t')
    identifier = str_to_int(identifier[-1])
    numbers = l.split()
    size = len(numbers)
    numbs = [str_to_int(numb[:-1]) for numb in numbers]
    signs = [1 if numb[-1] == '+' else -1 for numb in numbers]
    arr = np.empty(shape=(size, 4), dtype=np.int64)
    arr[:, 0] = numbs
    arr[:, 1] = signs
    arr[:, 2] = np.arange(0, size)
    arr[:, 3] = np.repeat(identifier, size)
    return arr
Approach 3
@nb.jit(nopython=True)
def expand3(data):
    identifier, l = data.split('\t')
    identifier = str_to_int(identifier[-1])
    numbers = l.split()
    arr = np.empty(shape=(len(numbers), 4), dtype=np.int64)
    for i, numb in enumerate(numbers):
        arr[i, :] = str_to_int(numb[:-1]), 1 if numb[-1] == '+' else -1, i, identifier
    return arr
Answer approach
def expand4(t):
    identifier, l = t.split('\t')
    identifier = int(identifier[-1])
    numbers = np.array([int(k[:-1]) for k in l.split(' ')])
    signs = np.array([(k[-1] == '+') for k in l.split(' ')]) * 2 - 1
    N = len(numbers)
    arr = np.empty(shape=(N, 4), dtype=np.int64)
    arr[:, 0] = numbers
    arr[:, 1] = signs
    arr[:, 2] = identifier
    arr[:, 3] = np.arange(N)
    return arr
Test results:
Expand 1
72.7 ms ± 177 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)
Expand 2
27.9 ms ± 67.1 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)
Expand 3
8.81 ms ± 20.3 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)
Expand 4 (answer approach)
429 µs ± 63.4 µs per loop (mean ± std. dev. of 7 runs, 5 loops each)
I cannot replicate your code, as I also got an "ord" is not implemented error from Numba.
But why are you using Numba at all? Your str_to_int operation is very expensive and not well suited to vectorization. Why not (without Numba):
def expand(t):
    identifier, l = t.split('\t')
    identifier = int(identifier[-1])
    numbers = np.array([int(k[:-1]) for k in l.split(' ')])
    signs = np.array([(k[-1] == '+') for k in l.split(' ')]) * 2 - 1
    N = len(numbers)
    arr = np.empty(shape=(N, 4), dtype=np.int64)
    arr[:, 0] = numbers
    arr[:, 1] = signs
    arr[:, 2] = identifier
    arr[:, 3] = np.arange(N)
    return arr
t = test_str()
%timeit expand(t)
>>>
1.01 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
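If the list-comprehension parsing ever becomes the bottleneck, here is a further variant as a rough sketch (not benchmarked here) that pushes the sign handling into NumPy's string routines; the variable names follow the answer's code:
def expand_npstr(t):
    identifier, l = t.split('\t')
    tokens = np.array(l.split(' '))
    # vectorized sign test and sign stripping via numpy.char
    signs = np.where(np.char.endswith(tokens, '+'), 1, -1)
    numbers = np.char.rstrip(tokens, '+-').astype(np.int64)
    N = len(numbers)
    arr = np.empty(shape=(N, 4), dtype=np.int64)
    arr[:, 0] = numbers
    arr[:, 1] = signs
    arr[:, 2] = int(identifier[-1])
    arr[:, 3] = np.arange(N)
    return arr
Whether this actually beats the list comprehensions would need to be measured; the np.char routines still loop over Python string objects internally.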
Basically I have two 1-D numpy arrays, let's call them x and y, both of the same length. I want to essentially get the result x1*y1 + x2*y2 + ... + xn*yn. Obviously I could do this with a for loop, but is there a built-in method or something where I can do this in one line?
What you are trying to compute is known as an 'inner product' and, in the case of two vectors, is called a 'dot product'. NumPy has built-in functions for computing both, and they are optimized for speed compared to the simple (x*y).sum() solution.
import numpy as np
a = np.array([1, 2, 3])
b = np.array([3, 2, 1])
print(np.inner(a, b))
# 10
print(np.dot(a, b))
# 10
Some timing results are shown below, with a and b being vectors of 1000 elements drawn with np.random.randn:
np.dot(a, b) # 920 ns ± 9.9 ns
np.inner(a, b) # 1.1 µs ± 83.5 ns
(a*b).sum() # 4.2 µs ± 62.9 ns
np.sum(a*b) # 5.7 µs ± 170 ns
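For completeness, two more equivalent one-liners (a sketch, not from the original answer): for 1-D arrays, the @ operator and np.einsum compute the same inner product:
print(a @ b)                     # matmul operator; for 1-D inputs this is the dot product
# 10
print(np.einsum('i,i->', a, b))  # explicit sum over the shared index
# 10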
You can use sum(x*y) or (x*y).sum(); they're equivalent.
I have a function distance that takes a natural number as input and returns a 1-D array of length 199. My goal is to merge all the arrays distance(0), ..., distance(499). My code to do so is as follows:
import numpy as np
np.random.seed(42)
n = 200
d = 500
sample = np.random.uniform(size = [n, d])
def distance(i):
    value = list(sample[i, 0:3])
    temp = value - sample[(i + 1):n, 0:3]
    return np.sqrt(np.sum(temp**2, axis=1))
temp = [distance(i) for i in range(n - 1)]
result = [j for i in temp for j in i]
Because I work with large d, I want to optimize as well as possible. I would like to ask for a faster way to merge such arrays.
Thank you so much!
If you are just trying to compute the pairwise distance:
from scipy.spatial.distance import cdist
dist = cdist(sample[:,:3], sample[:,:3])
Of course you get back a symmetric array with all pairwise distances. To get your result, you can do:
result = dist[np.triu_indices(n,k=1)]
Regarding the broadcasting comment, cdist will do something similar to this:
dist = np.sum((sample[None,:,:3]-sample[:,None,:3])**2, axis=-1)**0.5
For reference, below is the run time for each:
%%timeit -n 100
temp = [distance(i) for i in range(n - 1)]
result = [j for i in temp for j in i]
6.41 ms ± 197 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit -n 100
temp = [distance(i) for i in range(n - 1)]
result = np.hstack(temp)
4.86 ms ± 295 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit -n 100
temp = [distance(i) for i in range(n - 1)]
result = np.concatenate(temp)
4.28 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit -n 100
dist = np.sum((sample[None,:,:3]-sample[:,None,:3])**2, axis=-1)**0.5
result = dist[np.triu_indices(n,k=1)]
1.47 ms ± 61 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit -n 100
dist = cdist(sample[:,:3], sample[:,:3])
result = dist[np.triu_indices(n,k=1)]
415 µs ± 26.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
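As an aside (a sketch, not part of the original answer): scipy also provides pdist, which returns the condensed upper-triangle distances directly, so the triu_indices step can be skipped:
from scipy.spatial.distance import pdist

# condensed distance vector: pairwise distances for i < j, the same values as
# dist[np.triu_indices(n, k=1)] in the cdist version above
result = pdist(sample[:, :3])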
I have an enormous 1D numpy array of booleans w and an increasing list of indices i, which splits w into len(i)+1 subarrays. A toy example is:
w=numpy.array([True,False,False,False,True,True,True,True,False,False])
i=numpy.array([0,0,2,5,5,8,8])
I wish to compute a numpy array wi, whose i-th entry is 1 if the i-th subarray contains a True and 0 otherwise. In other words, the i-th entry of wi is the logical 'or' of the elements of the i-th subarray of w. In our example, the output is:
[0 0 1 1 0 1 0 0]
This is achieved with the code:
wi=numpy.fromiter(map(numpy.any,numpy.split(w,i)),int)
Is there a more efficient way of doing this or is this optimal as far as memory is concerned?
P.S. related post
For efficiency (memory and performance), use np.bitwise_or.reduceat as it keeps the output in boolean -
In [10]: np.bitwise_or.reduceat(w,np.r_[0,i])
Out[10]: array([ True, True, False, True, False, False])
To get an int output, view it as int -
In [11]: np.bitwise_or.reduceat(w,np.r_[0,i]).view('i1')
Out[11]: array([1, 1, 0, 1, 0, 0], dtype=int8)
Here's an all-weather solution -
def slice_reduce_or(w, i):
    valid = i < len(w)
    invalidc = (~valid).sum()
    i = i[valid]
    mi = np.r_[i[:-1] != i[1:], True]
    pp = i[mi]
    p1 = np.bitwise_or.reduceat(w, pp)
    N = len(i) + 1
    out = np.zeros(N + invalidc, dtype=bool)
    out[1:N][mi] = p1
    out[0] = w[:i[0]].any()
    return out.view('i1')
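A usage sketch on the toy example from the question (assuming the arrays are exactly as posted):
w = np.array([True, False, False, False, True, True, True, True, False, False])
i = np.array([0, 0, 2, 5, 5, 8, 8])
wi = slice_reduce_or(w, i)
# array([0, 0, 1, 1, 0, 1, 0, 0], dtype=int8), matching the expected output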
Let's try np.add.reduceat:
wi = np.add.reduceat(w,np.r_[0,i]).astype(bool)
output:
array([1, 1, 0, 1, 0, 0])
And performance:
%timeit -n 100 wi = np.add.reduceat(w,np.r_[0,i]).astype(bool).astype(int)
21.7 µs ± 7.86 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit -n 100 wi=np.fromiter(map(np.any,np.split(w,i)),int)
44.5 µs ± 7.79 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
So we're looking at about 2x speed here.
So I have a list with 5,000,000 integers, and I want to convert the list to a numpy array. I tried the following code:
numpy.array( list )
But it is very slow.
I benchmarked this operation 100 times and looped over the list 100 times; there is not much difference.
Any good idea how to make it faster?
If you have Cython you can create a function that is definitely faster. But just a warning: it will crash if there are invalid elements inside your list (non-integers or integers that are too big).
I use the IPython magics here (%load_ext cython and %%cython); the point is to show what the function looks like, not how to compile Cython code (it's not hard, and Cython's "how-to-compile" documentation is quite good).
%load_ext cython
%%cython
cimport cython
import numpy as np
@cython.boundscheck(False)
cpdef to_array(list inp):
    cdef long[:] arr = np.zeros(len(inp), dtype=long)
    cdef Py_ssize_t idx
    for idx in range(len(inp)):
        arr[idx] = inp[idx]
    return np.asarray(arr)
And the timings:
import numpy as np
def other(your_list):  # the approach from @Damian Lattenero in the other answer
    ret = np.zeros(shape=(len(your_list)), dtype=int)
    np.copyto(ret, your_list)
    return ret
inp = list(range(1000000))
%timeit np.array(inp)
# 315 ms ± 5.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.array(inp, dtype=int)
# 311 ms ± 2.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit other(inp)
# 316 ms ± 3.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit to_array(inp)
# 23.4 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
So it's more than 10 times faster.
I think this is fast; I checked the times:
import numpy as np
import time
start_time = time.time()
number = 1
elements = 10000000
your_list = [number] * elements
ret = np.zeros(shape=(len(your_list)))
np.copyto(ret, your_list)
print("--- %s seconds ---" % (time.time() - start_time))
--- 0.7615997791290283 seconds ---
Make a big list of small integers; use the numpy crutch:
In [619]: arr = np.random.randint(0,256, 5000000)
In [620]: alist = arr.tolist()
In [621]: timeit alist = arr.tolist() # just for reference
10 loops, best of 3: 108 ms per loop
And time for plain list iteration (doesn't do anything)
In [622]: timeit [i for i in alist]
10 loops, best of 3: 193 ms per loop
Make an array of specified dtype
In [623]: arr8 = np.array(alist, 'uint8')
In [624]: timeit arr8 = np.array(alist, 'uint8')
1 loop, best of 3: 508 ms per loop
We can get a 2x improvement with fromiter; evidently it does less checking. np.array will work even if the list is a mix of numbers and strings. It also handles lists of lists etc.
In [625]: timeit arr81 = np.fromiter(alist, 'uint8')
1 loop, best of 3: 249 ms per loop
The advantage of working with arrays becomes apparent when we do math across the whole thing:
In [628]: timeit arr8.sum()
100 loops, best of 3: 6.93 ms per loop
In [629]: timeit sum(alist)
10 loops, best of 3: 74.4 ms per loop
In [630]: timeit 2*arr8
100 loops, best of 3: 6.89 ms per loop
In [631]: timeit [2*i for i in alist]
1 loop, best of 3: 465 ms per loop
It's well known that working with arrays is faster than with lists, but that there is a significant 'startup' overhead.
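One more small tweak (a sketch, not from the original answer): np.fromiter accepts a count argument, which lets it preallocate the output instead of growing it as it iterates:
# preallocating the result with count; may shave a bit off the fromiter timing above
arr82 = np.fromiter(alist, dtype='uint8', count=len(alist))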