How to check if a numpy array is inside a Python sequence? - arrays

I'd like to check if a given array is inside a regular Python sequence (list, tuple, etc). For example, consider the following code:
import numpy as np
xs = np.array([1, 2, 3])
ys = np.array([4, 5, 6])
myseq = (xs, 1, True, ys, 'hello')
I would expect that simple membership checking with in would work, e.g.:
>>> xs in myseq
True
But apparently it fails if the element I'm trying to find isn't at the first position of myseq, e.g.:
>>> ys in myseq
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
So how can I perform this check?
If possible I'd like to do this without having to cast myseq into a numpy array or any other sort of data structure.

You can use any with the test that is appropriate:
import numpy as np
xs = np.array([1, 2, 3])
ys = np.array([4, 5, 6])
zs = np.array([7, 8, 9])
myseq = (xs, 1, True, ys, 'hello')
def arr_in_seq(arr, seq):
    """Return True if an array equal to ``arr`` is an element of ``seq``.

    Elements are filtered by type first, so members that are not of the same
    type as ``arr`` (ints, bools, strings, ...) are never compared against it.
    Matching uses ``np.array_equal``, which requires identical shape and
    values; an element that is merely broadcast-compatible with ``arr``
    (e.g. ``array([1])`` versus ``array([1, 1, 1])``) is not a match.

    Parameters:
        arr: the array to look for.
        seq: any iterable of arbitrary objects.

    Returns:
        bool: True if some element of ``seq`` has the type of ``arr`` and
        equals it element-wise, otherwise False.
    """
    arr_type = type(arr)
    return any(
        isinstance(elem, arr_type) and np.array_equal(elem, arr)
        for elem in seq
    )
Testing:
for x in (xs,ys,zs):
print(arr_in_seq(x,myseq))
True
True
False

This is probably not the most beautiful or fastest solution, but I think it works:
import numpy as np
def array_in_tuple(array, tpl):
    """Return True if ``tpl`` contains a NumPy array equal to ``array``.

    Only ``np.ndarray`` elements are compared, using ``np.array_equal``
    (same shape and same values); every other element is skipped, so mixed
    sequences of arrays, numbers and strings are safe to search.

    Parameters:
        array: the array to look for.
        tpl: any iterable of arbitrary objects (tuple, list, generator, ...).

    Returns:
        bool: True on the first matching element, False if none matches.
    """
    return any(
        isinstance(item, np.ndarray) and np.array_equal(array, item)
        for item in tpl
    )
xs = np.array([1, 2, 3])
ys = np.array([4, 5, 6])
myseq = (xs, 1, True, ys, 'hello')
print(array_in_tuple(xs, myseq), array_in_tuple(ys, myseq), array_in_tuple(np.array([7, 8, 9]), myseq))

Related

Dictionary output has array inside

I am working through one of the online tutorials, which takes nine numbers and creates a dictionary with statistics about them. Below is the code with the input data, as well as the result:
import numpy as np
a = [0, 1, 2, 3, 4, 5, 6, 7, 8]
arr = np.array(a).reshape(3, 3).astype(int)
result = {
"mean": [],
"variance": [],
"standard deviation": [],
"max": [],
"min": [],
"sum": []
}
# Creating a function1
def calculate1(a):
    """Return the mean of the module-level ``arr`` along axis ``a``."""
    return arr.mean(axis=a)
result["mean"].append(calculate1(0))
result["mean"].append(calculate1(1))
result["mean"].append(calculate1(None))
# Creating a function2
def calculate2(a):
    """Return the variance of the module-level ``arr`` along axis ``a``."""
    return arr.var(axis=a)
result["variance"].append(calculate2(0))
result["variance"].append(calculate2(1))
result["variance"].append(calculate2(None))
# Creating a function3
def calculate3(a):
    """Return the standard deviation of the module-level ``arr`` along axis ``a``."""
    return arr.std(axis=a)
result["standard deviation"].append(calculate3(0))
result["standard deviation"].append(calculate3(1))
result["standard deviation"].append(calculate3(None))
# Creating a function4
def calculate4(a):
    """Return the maximum of the module-level ``arr`` along axis ``a``."""
    return arr.max(axis=a)
result["max"].append(calculate4(0))
result["max"].append(calculate4(1))
result["max"].append(calculate4(None))
# Creating a function5
def calculate5(a):
    """Return the minimum of the module-level ``arr`` along axis ``a``."""
    return arr.min(axis=a)
result["min"].append(calculate5(0))
result["min"].append(calculate5(1))
result["min"].append(calculate5(None))
# Creating a function6
def calculate6(a):
    """Return the sum of the module-level ``arr`` along axis ``a``."""
    return arr.sum(axis=a)
result["sum"].append(calculate6(0))
result["sum"].append(calculate6(1))
result["sum"].append(calculate6(None))
for k, v in result.items():
print(k, v)
And here is the result
mean [array([3., 4., 5.]), array([1., 4., 7.]), 4.0]
variance [array([6., 6., 6.]), array([0.66666667, 0.66666667, 0.66666667]), 6.666666666666667]
standard deviation [array([2.44948974, 2.44948974, 2.44948974]), array([0.81649658, 0.81649658, 0.81649658]), 2.581988897471611]
max [array([6, 7, 8]), array([2, 5, 8]), 8]
min [array([0, 1, 2]), array([0, 3, 6]), 0]
sum [array([ 9, 12, 15]), array([ 3, 12, 21]), 36]
I have two questions here:
1- Is there a way that I can combine or minimize the number of functions to one or something like that. Please note that I (have to) use the function.
2- The output is correct (in values), but I am not sure why the word "array" is printed as well. When I check the type of the values inside the dictionary, it shows <class 'list'>, so where is the word "array" coming from?
I tried tolist value and plenty of online suggestions but nothing worked
Any help or suggestion is highly appreciated
You can store your functions inside a dict and then iterate over it:
from pprint import pprint
import numpy as np
def main():
    """Pretty-print per-axis summary statistics of a random 3x3 array.

    A dict maps each statistic name to a callable that takes an axis. Every
    callable is evaluated for axis 0, axis 1 and ``None`` (the whole array),
    and each result is converted with ``tolist()`` before being stored.
    The result dict is built twice on purpose, to show the explicit loop and
    the equivalent dict comprehension side by side.
    """
    arr = np.random.rand(3, 3)
    axes = (0, 1, None)
    # Bound array methods take the axis as their first argument.
    functions = {
        "mean": arr.mean,
        "var": arr.var,
        "std": arr.std,
        "max": arr.max,
        "min": arr.min,
        "sum": arr.sum,
    }
    # Explicit-loop form:
    result = {}
    for funcname in functions:
        func = functions[funcname]
        result[funcname] = [func(axis).tolist() for axis in axes]
    # Alternatively, the same mapping as a single dict comprehension:
    result = {
        funcname: [func(axis).tolist() for axis in axes]
        for funcname, func in functions.items()
    }
    pprint(result)
if __name__ == "__main__":
main()
Prints:
{'max': [[0.33149413492721314, 0.9252576833729358, 0.9616249059176883],
[0.37580580905770067, 0.9616249059176883, 0.9252576833729358],
0.9616249059176883],
'mean': [[0.23391570323037428, 0.4063894010374775, 0.6764668740080081],
[0.20197437573445387, 0.4652236940918113, 0.6495739084495947],
0.43892399275862],
'min': [[0.0958037701384552, 0.13431354800720574, 0.37580580905770067],
[0.0958037701384552, 0.15959697173229104, 0.33149413492721314],
0.0958037701384552],
'std': [[0.10039824223253171, 0.3670404461719236, 0.23941075106262735],
[0.1239187264736742, 0.35412651334119355, 0.24424967197333333],
0.3170854368356986],
'sum': [[0.7017471096911229, 1.2191682031124325, 2.029400622024024],
[0.6059231272033616, 1.395671082275434, 1.948721725348784],
3.95031593482758],
'var': [[0.010079807043382115, 0.13471868912608476, 0.057317507724371324],
[0.015355850770857285, 0.12540558745119054, 0.05965790225908093],
0.10054317425328584]}
As for why there is "array" printed, it is because, e.g., np.mean(arr, axis=1) returns a numpy array.

Plotting a list vs a list of arrays with matplotlib

Let's say I have two lists a and b, whereas one is a list of arrays
a = [1200, 1400, 1600, 1800]
b = [array([ 1.84714754, 4.94204658, 11.61580355, ..., 17.09772144,
17.09537562, 17.09499705]), array([ 3.08541849, 5.11338795, 10.26957508, ..., 16.90633304,
16.90417909, 16.90458781]), array([ 4.61916789, 4.58351918, 4.37590053, ..., -2.76705271,
-2.46715664, -1.94577492]), array([7.11040853, 7.79529924, 8.48873734, ..., 7.78736448, 8.47749987,
9.36040364])]
The shape of both is said to be (4,)
If I now try to plot these via plt.scatter(a, b)
I get an error I can't relate to: ValueError: setting an array element with a sequence.
In the end I want a plot where, for the n-th value in a, the set of values stored in the n-th array of b is plotted.
I'm pretty sure I've done this before, but I can't get this working.
Any ideas? ty
You need to adjust the elements in a to match the elements in b
# Number of points stored in each array of b.
len_b = [len(sub_array) for sub_array in b]
# Repeat every x value once per point in its matching array, so that a and b
# become two flat sequences of equal length.
a = [x_value for x_value, count in zip(a, len_b) for _ in range(count)]
# Join the arrays end to end into one flat list of values. np.concatenate
# also works when the arrays have different lengths, whereas np.ravel on
# such a list does not flatten the inner arrays.
b = np.concatenate(b).tolist()
# check if lengths are same
assert len(a) == len(b)
# if yes, now this should work
plt.scatter(a, b)
I am afraid repetition it is. If all lists in b have the same length, you can use numpy.repeat:
import numpy as np
import matplotlib.pyplot as plt
#fake data
np.random.seed(123)
a = [1200, 1400, 1600, 1800]
b = np.random.randint(1, 100, (4, 11)).tolist()
plt.scatter(np.repeat(a, len(b[0])), b)
plt.show()
If you are not sure and want to be on the safe side, list comprehension it is.
import numpy as np
import matplotlib.pyplot as plt
#fake data
np.random.seed(123)
a = [1200, 1400, 1600, 1800]
b = np.random.randint(1, 100, (4, 11)).tolist()
plt.scatter([[x]*len(b[i]) for i, x in enumerate(a)], b)
plt.show()
The output is the same:
Referring to the suggestion of @sai I tried
import numpy as np
arr0 = np.array([1, 2, 3, 4, 5])
arr1 = np.array([6, 7, 8, 9])
arr2 = np.array([10, 11])
old_b = [arr0, arr1, arr2]
b = np.ravel(old_b).tolist()
print(len(b))
Which will give me length 3 instead of the length 11 I expected. How can I collapse a list of arrays to a single list?
edit:
b = np.concatenate(old_b).ravel().tolist()
will lead to the desired result. Thanks all.

Iterating over for an Array Column with dynamic size in Spark Scala Dataframe

I am familiar with this approach - case in point an example from How to obtain the average of an array-type column in scala-spark over all row entries per entry?
// Number of array positions to average; fixed when the query is built.
val array_size = 3
// One avg(...) column expression per array index 0 .. array_size - 1.
val avgAgg = for (i <- 0 to array_size -1) yield avg($"value".getItem(i))
// Pack the per-index averages into a single array column named "avg_value".
df.select(array(avgAgg: _*).alias("avg_value")).show(false)
However, the 3 is hard-coded in reality.
No matter how hard I try not to use an UDF, I cannot do this type of thing dynamically based on the size of an array column already present in the data frame. E.g:
...
val z = for (i <- 1 to size($"sortedCol") ) yield array (element_at($"sortedCol._2", i), element_at($"sortedCol._3", i) )
...
...
.withColumn("Z", array(z: _*) )
I am looking as to how this can be done by applying to an existing array col which is variable in length. transform, expr? Not sure.
Full code as per request:
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
// Row type of the sample data set.
case class abc(year: Int, month: Int, item: String, quantity: Int)
// Five sample rows spread over the years 2018 and 2019.
val df0 = Seq(abc(2019, 1, "TV", 8),
abc(2019, 7, "AC", 10),
abc(2018, 1, "TV", 2),
abc(2018, 2, "AC", 3),
abc(2019, 2, "CO", 10)).toDS()
val df1 = df0.toDF()
// Gen some data, can be done easier, but not the point.
// Aggregate expression: collects (month, item, quantity) structs into a list.
val itemsList= collect_list(struct("month", "item", "quantity"))
// This works, but only because nn is a fixed Scala Int known up front.
val nn = 3
// One [item, quantity] array expression per position 1 .. nn of sortedCol.
val z = for (i <- 1 to nn) yield array (element_at($"sortedCol.item", i), element_at($"sortedCol.quantity", i) )
// Desired variant: take the loop bound from the array column's own size.
// NOTE(review): presumably rejected because size(...) is a Column expression
// rather than a Scala Int usable as a range bound -- confirm.
//val z = for (i <- 1 to size($"sortedCol") ) yield array (element_at($"sortedCol.item", i), element_at($"sortedCol.quantity", i) )
// Group by year, collect and sort the item structs, then add the size
// column S and the fixed-width column Z built from z.
val df2 = df1.groupBy($"year")
.agg(itemsList as "items")
.withColumn("sortedCol", sort_array($"items", asc = true))
.withColumn("S", size($"sortedCol")) // cannot use this either
.withColumn("Z", array(z: _*) )
.drop("items")
.orderBy($"year".desc)
df2.show(false)
// Col Z is the output I want, but not the null value Array
UPD
In "In apache spark SQL, how to remove the duplicate rows when using collect_list in window function?" I solved this with a very simple UDF, but I was looking for a way without a UDF, and in particular for a way to set the `to` value of the for loop dynamically. The answer proves that certain constructs are not possible - which was the verification being sought.
If I correctly understand your need, you can simply use transform function like this:
// Group by year, collect the item structs and sort each collected array.
val df2 = df1.groupBy($"year")
.agg(itemsList as "items")
.withColumn("sortedCol", sort_array($"items", asc = true))
// SQL expression that maps every struct x in sortedCol to array(x.item, x.quantity),
// so the result follows the length of each row's array without a fixed loop bound.
val transform_expr = "transform(sortedCol, x -> array(x.item, x.quantity))"
// Evaluate the expression into column Z and print untruncated rows.
df2.withColumn("Z", expr(transform_expr)).show(false)
//+----+--------------------------------------+--------------------------------------+-----------------------------+
//|year|items |sortedCol |Z |
//+----+--------------------------------------+--------------------------------------+-----------------------------+
//|2018|[[1, TV, 2], [2, AC, 3]] |[[1, TV, 2], [2, AC, 3]] |[[TV, 2], [AC, 3]] |
//|2019|[[1, TV, 8], [7, AC, 10], [2, CO, 10]]|[[1, TV, 8], [2, CO, 10], [7, AC, 10]]|[[TV, 8], [CO, 10], [AC, 10]]|
//+----+--------------------------------------+--------------------------------------+-----------------------------+

Turn array into array of arrays following structure of another array

I would like to turn an array into an array of arrays following another array of arrays. I'm not sure how to do this, here are the arrays:
orig_array = [[0,1],[4],[3],[],[3,2,6],[]]
my_array = [2,0,1,3,3,4,5]
wanted_array = [[2,0],[1],[3],[],[3,4,5],[]]
I would like to keep the empty arrays.
Thanks
Get the lengths of each element in orig_array, perform cumulative summation along the length values to give us the indices at which my_array needs to be split, and finally use np.split to actually perform the splitting. Thus, the implementation would look something like this -
# Length of every sub-list of orig_array, in order.
lens = list(map(len, orig_array))
# The running totals of those lengths are the split positions in my_array.
# Splitting at the final total leaves one trailing leftover piece, which the
# [:-1] slice drops.
out = np.split(my_array, np.cumsum(lens))[:-1]
Sample run -
In [72]: orig_array = np.array([[0,1],[4],[3],[],[3,2,6],[]])
...: my_array = np.array([2,0,1,3,3,4,5])
...:
In [73]: lens = [len(item) for item in orig_array]
...: out = np.split(my_array,np.cumsum(lens))[:-1]
...:
In [74]: out
Out[74]:
[array([2, 0]),
array([1]),
array([3]),
array([], dtype=int64),
array([3, 4, 5]),
array([], dtype=int64)]
def do(format, values):
    """Rebuild the nested-list shape of ``format`` using items from ``values``.

    Every list in ``format`` (including list subclasses) is reproduced as a
    new list of the same length; every non-list leaf is replaced by the next
    item taken from the front of ``values``.

    Parameters:
        format: a (possibly nested) list giving the wanted structure; a
            non-list value stands for a single slot.
        values: list of replacement values. It is consumed from the front,
            so the caller's list is mutated -- pass a copy to keep it.

    Returns:
        A new nested list shaped like ``format`` and filled from ``values``,
        or a single value when ``format`` is not a list.

    Raises:
        IndexError: if ``values`` runs out before every slot is filled.
    """
    if isinstance(format, list):
        return [do(item, values) for item in format]
    # Leaf slot: take (and remove) the next value from the front.
    return values.pop(0)
print do(orig_array, my_array)
Note: this destroys the array where the values come from.
You could do the following:
import copy
def reflect_array(orig_array, order):
    """Regroup the values of ``order`` to mirror the layout of ``orig_array``.

    The values are consumed front to back, so the first sub-list of the
    result holds the first ``len(orig_array[0])`` values of ``order``, and so
    on. Empty sub-lists are kept as empty lists. Neither argument is
    modified.

    Parameters:
        orig_array: list of lists whose lengths define the wanted grouping.
        order: iterable of values to distribute; surplus values are ignored.

    Returns:
        list: a new list of lists with the same sub-list lengths as
        ``orig_array``, filled with the values of ``order`` in order.

    Raises:
        IndexError: if ``order`` has fewer values than ``orig_array`` has slots.
    """
    # Iterating (rather than popping from the end) keeps the values in their
    # original order and leaves the caller's list untouched.
    values = iter(order)
    try:
        return [[next(values) for _ in part_list] for part_list in orig_array]
    except StopIteration:
        raise IndexError("order has fewer values than orig_array has slots")
Test run:
orig_array = [[0,1],[4],[3],[],[3,2,6],[]]
my_array = [2,0,1,3,3,4,5]
print reflect_array(orig_array, my_array)
# [[2, 0], [1], [3], [], [3, 4, 5], []]
In [858]: my_array = [2,0,1,3,3,4,5]
In [859]: [[my_array.pop(0) for _ in range(len(x))] for x in orig_array]
Out[859]: [[2, 0], [1], [3], [], [3, 4, 5], []]
Use b=my_array[:] if you don't want to change my_array.
This operates on the same principle as @karoly's answer; it is just more direct because it assumes only one level of nesting.

Perform different calculation on each element of an array

I have an array. I need to perform a different calculation on each element. I thought I could do something like the following:
def calc(a, b, c)
arr = [a, b, c]
arr.map { |i| (i[0] * 600), (i[1] * 800), (i[2] * 1000) }
end
calc(5, 8, 15)
but this does not work. How can I perform different calculations on each element of a single array?
Here are some other implementations that might be helpful. By putting the multipliers into an array, we can use zip to connect the element in the input array with the appropriate multiplier value. In addition, that makes it simple to abstract the logic further by removing the multiplier values from the logic that does the multiplication (in multiply_arrays and transform_arrays).
#!/usr/bin/env ruby
VALUES = [1, 1, 1]
MULTIPLIERS = [600, 800, 1000]
# Multiplies each argument by the MULTIPLIERS entry at the same position
# and returns the products as an array.
def transform(*values)
  values.each_with_index.map do |value, position|
    value * MULTIPLIERS[position]
  end
end
# Returns the element-wise products of array1 and array2, pairing elements
# by position.
def multiply_arrays(array1, array2)
  pairs = array1.zip(array2)
  pairs.map do |pair|
    left, right = pair
    left * right
  end
end
# Pairs the elements of array1 and array2 by position and, for every pair,
# calls the public method named method_name on the first element with the
# second element as its argument. Returns the array of results.
def transform_arrays(array1, array2, method_name)
  pairs = array1.zip(array2)
  pairs.map do |pair|
    receiver, argument = pair
    receiver.public_send(method_name, argument)
  end
end
p transform(*VALUES) # [600, 800, 1000]
p multiply_arrays(VALUES, MULTIPLIERS) # [600, 800, 1000]
p transform_arrays(VALUES, MULTIPLIERS, :*) # [600, 800, 1000]
If the calculations need to be substantially different (different operators, values, more complex logic), then I'd consider using an array of lambdas:
# Applies the callable at each position of transforms to the value at the
# same position of values and returns the results as an array.
def transform_with_lambdas(values, transforms)
  pairs = values.zip(transforms)
  pairs.map { |value, transform| transform.call(value) }
end
TRANSFORMS = [
->(x) { x * 600 },
->(x) { x + 100 },
->(x) { x / 3.0 },
]
p transform_with_lambdas(VALUES, TRANSFORMS) # [600, 101, 0.3333333333333333]
Here is a solution which will help you to apply different operations on two different operands:
# Applies a different binary operator to each pair of operands.
#
# first_operand_arr  - receivers, one per operator
# operator_arr       - operator names (String or Symbol), e.g. '+', '-', '*'
# second_operand_arr - arguments, one per operator
#
# Returns an array whose i-th entry is
# first_operand_arr[i] <operator_arr[i]> second_operand_arr[i].
def calc(first_operand_arr, operator_arr, second_operand_arr)
  operator_arr.each_with_index.map do |operator, i|
    # public_send restricts the call to public methods of the operand.
    first_operand_arr[i].public_send(operator, second_operand_arr[i])
  end
end
calc([5, 8, 15], ['+', '-', '*'], [5, 3, 2])
# Multiplies the first argument by 600, the second by 800 and the third by
# 1000, and returns the products as an array.
def calc(*arr)
  multipliers = [600, 800, 1000]
  arr.each_with_index.map { |value, position| value * multipliers[position] }
end
calc(5, 8, 15)
#=> [3000, 6400, 15000]
You could generalize this as follows:
# Each argument is a triple [left, right, operator]; returns an array with
# the result of sending operator to left with right as its argument.
def calc(*arr)
  arr.map do |triple|
    left, right, operator = triple
    left.send(operator, right)
  end
end
calc [5, 6, :*], [2, 3, :+], [10, 8, :-]
#=> [30, 5, 2]
Here is an option using a second array of lambda's that can be arbitrary functions of each entry in your main array.
# Input values, one per operation.
operands = [1.0,2.0,3.0]
# One lambda per operand position; each can be an arbitrary function.
operations = [
->(e) { e * 10} ,
->(e) { e + 10 },
->(e) { e * e }
]
# Look up the lambda by the operand's index and call it with that operand.
results = operands.each_with_index.map { |operand, index| operations[index].call(operand) }
puts results
Edit
I just noticed this is really a variation on Keith Bennett's answer above; I will leave it here, since it differs in how the lambda is retrieved from the array.

Resources