I have the following table:
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()
cols = [ 'a1', 'a2']
data = [([2, 3], [4, 5]),
([1, 3], [2, 4])]
df = spark.createDataFrame(data, cols)
df.show()
# +------+------+
# | a1| a2|
# +------+------+
# |[2, 3]|[4, 5]|
# |[1, 3]|[2, 4]|
# +------+------+
I know how to multiply an array by a scalar. But how do I multiply the members of one array with the corresponding members of another array?
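(For reference, by multiplying by a scalar I mean something along these lines; the column name and the constant 2 are only for illustration.)
# Multiply every element of a1 by the scalar 2 (illustrative only)
df = df.withColumn('a1_doubled', F.expr('transform(a1, x -> x * 2)'))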
Desired result:
# +------+------+-------+
# | a1| a2| res|
# +------+------+-------+
# |[2, 3]|[4, 5]|[8, 15]|
# |[1, 3]|[2, 4]|[2, 12]|
# +------+------+-------+
Similarly to your example, you can access the second array from within the transform function. This assumes that both arrays have the same length:
from pyspark.sql.functions import expr
cols = [ 'a1', 'a2']
data = [([2, 3], [4, 5]),
([1, 3], [2, 4])]
df = spark.createDataFrame(data, cols)
df = df.withColumn("res", expr("transform(a1, (x, i) -> a2[i] * x)"))
# +------+------+-------+
# | a1| a2| res|
# +------+------+-------+
# |[2, 3]|[4, 5]|[8, 15]|
# |[1, 3]|[2, 4]|[2, 12]|
# +------+------+-------+
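Another option, if you'd rather not index into the second array manually: the zip_with SQL function (Spark 2.4+, if I recall correctly; newer PySpark versions also expose it as F.zip_with) pairs elements by position. A quick sketch:
from pyspark.sql.functions import expr
# zip_with pairs up elements of a1 and a2 by position and applies the lambda to each pair
df = df.withColumn("res", expr("zip_with(a1, a2, (x, y) -> x * y)"))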
Assuming you can have arrays with different sizes:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
spark = SparkSession.builder.getOrCreate()
cols = ['a1', 'a2']
data = [([2, 3], [4, 5]),
([1, 3], [2, 4]),
([1, 3], [2, 4, 6])]
df = spark.createDataFrame(data, cols)
df = df.withColumn("res", expr("transform(arrays_zip(a1, a2), x -> coalesce(x.a1 * x.a2, 0))"))
df.show(truncate=False)
# +------+---------+----------+
# |a1 |a2 |res |
# +------+---------+----------+
# |[2, 3]|[4, 5] |[8, 15] |
# |[1, 3]|[2, 4] |[2, 12] |
# |[1, 3]|[2, 4, 6]|[2, 12, 0]|
# +------+---------+----------+
Use a User Defined Function (UDF) to perform the multiplication and then call that function.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType

def multiply(x, y):
    return [x[0] * y[0], x[1] * y[1]]

multiply_cols = udf(multiply, ArrayType(IntegerType()))
df1 = df.withColumn("res", multiply_cols('a1', 'a2'))
df1.show()
+------+------+-------+
| a1| a2| res|
+------+------+-------+
|[2, 3]|[4, 5]|[8, 15]|
|[1, 3]|[2, 4]|[2, 12]|
+------+------+-------+
https://docs.databricks.com/spark/latest/spark-sql/udf-python.html
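If the arrays are not always of length 2, the same udf idea can be generalized with zip; a sketch along those lines (multiply_lists is just an illustrative name):
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType

def multiply_lists(x, y):
    # zip pairs elements by position; extra elements of the longer array are dropped
    return [a * b for a, b in zip(x, y)]

multiply_udf = udf(multiply_lists, ArrayType(IntegerType()))
df1 = df.withColumn("res", multiply_udf('a1', 'a2'))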
Input:
from pyspark.sql import functions as F
df = spark.createDataFrame(
[( 1, 'aa', [None, 9]),
( 1, None, [ 9, 1]),
( 1, 'bb', [ 1, 4]),
( 1, 'cc', [ 4, 5]),
( 2, 'ee', [None, 2]),
( 2, None, [ 2, 8]),
( 2, 'dd', [ 8, 7]),
( 2, None, [ 7, 1])],
['col_id', 'col_val', 'col_arr'])
Desired result - I want to group by col_id and return the last non-null item from col_val:
+------+-------+
|col_id|col_val|
+------+-------+
| 1| cc|
| 2| dd|
+------+-------+
The problem is the order column. It's an array where its last element is repeated as the first element of the following row. In the above example, the order of col_id=2 goes:
[None, 2], [2, 8], [8, 7], [7, 1].
Since the col_val of [7, 1] is null, the value for [8, 7] should be returned, i.e. 'dd'. The ordering always starts with null (None).
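To make the intended ordering concrete, here is a plain-Python sketch of the chain-walking logic for col_id=2 (the names are only for illustration):
# Each row keyed by the first element of its array; start from the row whose key is None
rows = [('ee', (None, 2)), (None, (2, 8)), ('dd', (8, 7)), (None, (7, 1))]
by_first = {arr[0]: (val, arr) for val, arr in rows}
ordered, key = [], None
while key in by_first:
    val, arr = by_first.pop(key)
    ordered.append(val)
    key = arr[1]          # follow the chain: last element -> first element of the next row
print([v for v in ordered if v is not None][-1])  # 'dd'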
I've tried
df = (df
.filter(~F.isnull('col_val'))
.groupBy('col_id')
.agg(F.max_by('col_val', F.col('col_arr')[1]))
)
df.show()
# +------+---------------------------+
# |col_id|max_by(col_val, col_arr[1])|
# +------+---------------------------+
# | 1| aa|
# | 2| dd|
# +------+---------------------------+
It's not successful, as my order column does not follow a simple ascending / descending order.
So, after some decent thinking, I have found a working approach. The steps:
1. collecting modified rows (as structs) for every col_id into lists
2. creating a map for every col_id with the first elements of the inner lists as keys
3. sequential lookup in the maps, "looping" through elements in the array to create ordered lists
4. removing nulls and extracting the last item
from pyspark.sql import functions as F

# Replace nulls in col_arr with a sentinel (-9) so they can be used as map keys
df = df.withColumn('col_arr', F.transform('col_arr', lambda x: F.coalesce(x, F.lit(-9))))

# Step 1: rows as structs keyed by the first array element, collected per col_id below
inner_struct = F.struct('col_val', F.col('col_arr')[1].alias('last'))
c = F.collect_set(F.struct(F.col('col_arr')[0], inner_struct))

df = df.groupBy('col_id').agg(
    F.element_at(F.filter(F.aggregate(
        c,
        F.expr("array(struct(string(null) col_val, -9L last))"),  # seed: start of the chain
        # Steps 2-3: build a map from the collected structs and follow the chain,
        # appending the struct whose key equals the previous element's 'last'
        lambda acc, x: F.array_union(
            acc,
            F.array(F.map_from_entries(c)[F.element_at(acc, -1)['last']])
        )
    # Step 4: drop nulls and take the last remaining col_val
    ), lambda x: x.col_val.isNotNull()), -1).col_val.alias('col_val')
)
df.show()
# +------+-------+
# |col_id|col_val|
# +------+-------+
# | 1| cc|
# | 2| dd|
# +------+-------+
Please tell me what I am doing wrong here. The output is repeated:
import numpy as np
hj = np.array([[2, 3, 0], [6, 5, 7], [8, 9, 7], [1, 1, 1]])
print(hj)
print("2-D")
grr = hj
for x in grr:
    for y in grr:
        print(grr)
I even tried to do
hj = np.arange(0, 10).reshape(5, 2)
I also did
import numpy as np
hj = np.array([[2, 3, 0], [6, 5, 7], [8, 9, 7], [1, 1, 1]])
# hj = np.arange(0, 10).reshape(5, 2)
# print(hj)
print("2-D")
for x in hj:
    for y in hj:
        print(hj)
In [25]: arr = np.array([[1,2],[3,4]])
Let's look in more detail at what the variables are in the loop:
In [26]: for x in arr:
    ...:     print('%r %r'%(x, arr))
    ...:     for y in x:
    ...:         print('%r %r %r'%(y, x, arr))
    ...:
array([1, 2]) array([[1, 2], # a (2,) and (2,2) array
[3, 4]])
1 array([1, 2]) array([[1, 2], # a scalar, a (2,) and (2,2)
[3, 4]])
2 array([1, 2]) array([[1, 2],
[3, 4]])
array([3, 4]) array([[1, 2],
[3, 4]])
3 array([3, 4]) array([[1, 2],
[3, 4]])
4 array([3, 4]) array([[1, 2],
[3, 4]])
You could use a double list comprehension to produce a list of lists:
In [27]: [[y for y in x] for x in arr]
Out[27]: [[1, 2], [3, 4]]
In [28]: arr.tolist() # faster
Out[28]: [[1, 2], [3, 4]]
This loop is wrong:
for x in grr:
    for y in grr:
        print(grr)
You're basically printing grr over and over again. Probably you meant something like this:
for x in grr:
    print(x)
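Or, if the intent was to print every single element, the inner loop should iterate over the current row rather than over grr again:
for x in grr:
    for y in x:   # x is one row of the array; y is a single element
        print(y)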
Spark's Scala API has a typedLit function for adding an Array or Map as a column value.
import org.apache.spark.sql.functions.typedLit
val df1 = Seq((1, 0), (2, 3)).toDF("a", "b")
df1.withColumn("seq", typedLit(Seq(1,2,3)))
.show(truncate=false)
+---+---+---------+
|a |b |seq |
+---+---+---------+
|1 |0 |[1, 2, 3]|
|2 |3 |[1, 2, 3]|
+---+---+---------+
I couldn't find the equivalent in PySpark. How can we create a column in PySpark with Array as a column value?
There isn't an equivalent function in pyspark yet, but you can have an array column as shown below:
from pyspark.sql.functions import array, lit
df = sc.parallelize([[1,2], [3,4]]).toDF(['a', 'b'])
df.withColumn('seq', array([lit(i) for i in [1,2,3]])).show()
Output:
+---+---+---------+
| a| b| seq|
+---+---+---------+
| 1| 2|[1, 2, 3]|
| 3| 4|[1, 2, 3]|
+---+---+---------+
Using expr and array looks the most elegant to me:
df = df.withColumn('seq', F.expr('array(1,2,3)'))
Test results:
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,0), (2,3)], ['a', 'b'])
df = df.withColumn('seq', F.expr('array(1,2,3)'))
df.show()
# +---+---+---------+
# | a| b| seq|
# +---+---+---------+
# | 1| 0|[1, 2, 3]|
# | 2| 3|[1, 2, 3]|
# +---+---+---------+
Use F.expr('sequence(1, 3)') if the array values should form a consecutive sequence.
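For instance, a minimal sketch (sequence is inclusive of both endpoints):
df = df.withColumn('seq', F.expr('sequence(1, 3)'))  # produces [1, 2, 3] in every row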
You can use .cast() directly after the lit() call to type the Column:
import pyspark.sql.functions as sf
from pyspark.sql.types import LongType
df1.withColumn("long", sf.lit(1).cast(LongType()))
The same works for array():
import pyspark.sql.functions as sf
from pyspark.sql.types import LongType, ArrayType
df1.withColumn("pirate", sf.array([sf.lit(x).cast(LongType()) for x in [1, 2, 3]]))
df1.withColumn("pirate", sf.array([sf.lit(x) for x in [1, 2, 3]]).cast(ArrayType(LongType())))
and if you really like text and typing but hate types, you could use:
df1.withColumn("pirate", sf.array(sf.lit("1"), sf.lit("2")).cast("array<int>"))
;)
PS: Also consider using map with sf.lit instead of the list comprehension.
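That map-based variant might look something like this (just a sketch, equivalent to the comprehension above):
df1.withColumn("pirate", sf.array(*map(sf.lit, [1, 2, 3])))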
Let us assume a dataframe df as follows:
df.show()
Output:
+------+----------------+
|letter| list_of_numbers|
+------+----------------+
| A| [3, 1, 2, 3]|
| B| [1, 2, 1, 1]|
+------+----------------+
What I want to do is count the number of occurrences of a specific element in the column list_of_numbers. Something like this:
+------+----------------+----+
|letter| list_of_numbers|ones|
+------+----------------+----+
| A| [3, 1, 2, 3]| 1|
| B| [1, 2, 1, 1]| 3|
+------+----------------+----+
So far I have tried creating a udf and it works perfectly, but I'm wondering if I can do it without defining any udf.
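(For reference, the udf I mean is something along these lines; count_ones is only an illustrative name.)
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

count_ones = udf(lambda arr: arr.count(1), IntegerType())  # counts occurrences of 1
df.withColumn('ones', count_ones('list_of_numbers')).show()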
You can explode the array and filter the exploded values for 1. Then groupBy and count:
from pyspark.sql.functions import col, count, explode
df.select("*", explode("list_of_numbers").alias("exploded"))\
.where(col("exploded") == 1)\
.groupBy("letter", "list_of_numbers")\
.agg(count("exploded").alias("ones"))\
.show()
#+------+---------------+----+
#|letter|list_of_numbers|ones|
#+------+---------------+----+
#| A| [3, 1, 2, 3]| 1|
#| B| [1, 2, 1, 1]| 3|
#+------+---------------+----+
In order to keep all rows, even when the count is 0, you can convert the exploded column into an indicator variable. Then groupBy and sum.
from pyspark.sql.functions import col, count, explode, sum as sum_
df.select("*", explode("list_of_numbers").alias("exploded"))\
.withColumn("exploded", (col("exploded") == 1).cast("int"))\
.groupBy("letter", "list_of_numbers")\
.agg(sum_("exploded").alias("ones"))\
.show()
Note, I have imported pyspark.sql.functions.sum as sum_ so as not to overwrite the builtin sum function.
From pyspark 3+, we can use array transformations.
https://mungingdata.com/spark-3/array-exists-forall-transform-aggregate-zip_with/
https://medium.com/expedia-group-tech/deep-dive-into-apache-spark-array-functions-720b8fbfa729
import pyspark.sql.functions as F
df = spark_session.createDataFrame(
[
['A',[3, 1, 2, 3]],
['B',[1, 2, 1, 1]]
],
['letter','list_of_numbers'])
df1 = df.selectExpr('*','filter(list_of_numbers, x->x=1) as ones_array')
df2 = df1.selectExpr('*', 'size(ones_array) as ones')
df2.show()
+------+---------------+----------+----+
|letter|list_of_numbers|ones_array|ones|
+------+---------------+----------+----+
| A| [3, 1, 2, 3]| [1]| 1|
| B| [1, 2, 1, 1]| [1, 1, 1]| 3|
+------+---------------+----------+----+
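The two selectExpr steps can also be collapsed into a single expression (same idea, written in one pass):
df.selectExpr('*', 'size(filter(list_of_numbers, x -> x = 1)) as ones').show()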
Assuming that the length of the list is constant, one way I can think of is:
from operator import add
from functools import reduce
import pyspark.sql.functions as F
df = sql.createDataFrame(
[
['A',[3, 1, 2, 3]],
['B',[1, 2, 1, 1]]
],
['letter','list_of_numbers'])
expr = reduce(add,[F.when(F.col('list_of_numbers').getItem(x)==1, 1)\
.otherwise(0) for x in range(4)])
df = df.withColumn('ones', expr)
df.show()
+------+---------------+----+
|letter|list_of_numbers|ones|
+------+---------------+----+
| A| [3, 1, 2, 3]| 1|
| B| [1, 2, 1, 1]| 3|
+------+---------------+----+
There was a comment above from Ala Tarighati that the solution did not work for arrays with different lengths. The following will solve that problem:
from operator import add
from functools import reduce
import pyspark.sql.functions as F
df = sql.createDataFrame(
[
['A',[3, 1, 2, 3]],
['B',[1, 2, 1, 1]]
],
['letter','list_of_numbers'])
df_ones = (
    df.withColumn(
        'ones',
        reduce(
            add,
            [
                F.when(
                    F.col("list_of_numbers").getItem(x) == F.lit("1"), 1
                ).otherwise(0)
                # len("drivers") == 7, used as an upper bound on the array length;
                # getItem past the end returns null, which falls through to otherwise(0)
                for x in range(len("drivers"))
            ],
        ),
    )
)
df_ones.show()
+------+---------------+----+
|letter|list_of_numbers|ones|
+------+---------------+----+
| A| [3, 1, 2, 3]| 1|
| B| [1, 2, 1, 1]| 3|
+------+---------------+----+
I have a CSV data file in which some rows may have 500+ columns and others far fewer. I need to transpose it so that each row becomes a column in the output file. The problem is that the rows in the original file do not all have the same number of columns, so when I try the transpose method of Array I get:
`transpose': element size differs (12 should be 5) (IndexError)
Is there an alternative to transpose that works with uneven array lengths?
I would insert nils to fill the holes in your matrix, something like this:
a = [[1, 2, 3], [3, 4]]
# This would throw the error you're talking about
# a.transpose
# Largest row
size = a.max { |r1, r2| r1.size <=> r2.size }.size
# Enlarge matrix inserting nils as needed
a.each { |r| r[size - 1] ||= nil }
# So now a == [[1, 2, 3], [3, 4, nil]]
aa = a.transpose
# aa == [[1, 3], [2, 4], [3, nil]]
# Intitial CSV table data
csv_data = [ [1,2,3,4,5], [10,20,30,40], [100,200] ]
# Finding max length of rows
row_length = csv_data.map(&:length).max
# Inserting nil to the end of each row
csv_data.map do |row|
(row_length - row.length).times { row.insert(-1, nil) }
end
# Let's check
csv_data
# => [[1, 2, 3, 4, 5], [10, 20, 30, 40, nil], [100, 200, nil, nil, nil]]
# Transposing...
transposed_csv_data = csv_data.transpose
# Hooray!
# => [[1, 10, 100], [2, 20, 200], [3, 30, nil], [4, 40, nil], [5, nil, nil]]