Create a numpy.recarray from two lists

Is there an easy way to create a numpy.recarray from two lists? For instance, given the following lists:
list1 = ["a","b","c"]
list2 = [1,2,3,4,5,6,7,8,9,10,11,12]
What I am trying to do is to get the following result:
rec_array = np.rec.array([('a', 1), ('a', 2),('a', 3),('a', 4),
('b', 5), ('b', 6),('b', 7),('b', 8),
('c', 9), ('c', 10),('c', 11),('c', 12)], dtype=[('string','|U5'),('int', '<i4')])
I know how a rec.array works, but I don't really know how to create one from lists. Maybe dicts would make things easier, since they have key/value pairs, but is there a way to do this directly from lists?

In [73]: list1 = ["a","b","c"]
...: list2 = [1,2,3,4,5,6,7,8,9,10,11,12]
...:
In [74]: dt = [('string','|U5'),('int', '<i4')]
A simple pairing of elements:
In [75]: [(i,j) for i, j in zip(list1,list2)]
Out[75]: [('a', 1), ('b', 2), ('c', 3)]
Break list2 into 3 groups of 4:
In [79]: list3 = [list2[i:i+4] for i in range(0,12,4)]
In [80]: list3
Out[80]: [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
A double list comprehension:
In [81]: [(i,j) for i,row in zip(list1,list3) for j in row]
Out[81]:
[('a', 1),
('a', 2),
('a', 3),
('a', 4),
('b', 5),
('b', 6),
('b', 7),
('b', 8),
('c', 9),
('c', 10),
('c', 11),
('c', 12)]
Make a structured array from that:
In [82]: np.array(_, dtype=dt)
Out[82]:
array([('a', 1), ('a', 2), ('a', 3), ('a', 4), ('b', 5), ('b', 6),
('b', 7), ('b', 8), ('c', 9), ('c', 10), ('c', 11), ('c', 12)],
dtype=[('string', '<U5'), ('int', '<i4')])
Or, to make a (3,4) array:
In [86]: [[(i,j) for j in row] for i,row in zip(list1, list3)]
Out[86]:
[[('a', 1), ('a', 2), ('a', 3), ('a', 4)],
[('b', 5), ('b', 6), ('b', 7), ('b', 8)],
[('c', 9), ('c', 10), ('c', 11), ('c', 12)]]
In [87]: np.array(_, dt)
Out[87]:
array([[('a', 1), ('a', 2), ('a', 3), ('a', 4)],
[('b', 5), ('b', 6), ('b', 7), ('b', 8)],
[('c', 9), ('c', 10), ('c', 11), ('c', 12)]],
dtype=[('string', '<U5'), ('int', '<i4')])
In [88]: _.shape
Out[88]: (3, 4)
Or replicate list1 to the same size as list2:
In [97]: np.array([(i,j) for i,j in zip(np.repeat(list1,4),list2)],dt).reshape(3,4)
Out[97]:
array([[('a', 1), ('a', 2), ('a', 3), ('a', 4)],
[('b', 5), ('b', 6), ('b', 7), ('b', 8)],
[('c', 9), ('c', 10), ('c', 11), ('c', 12)]],
dtype=[('string', '<U5'), ('int', '<i4')])
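Note that Out[82] and Out[97] are plain structured arrays rather than numpy.recarray objects. If you specifically want a recarray, as in the question, here is a minimal sketch (assuming the same dt and the same list of pairs built above) showing two ways to get one:
import numpy as np

dt = [('string', '|U5'), ('int', '<i4')]
pairs = [('a', 1), ('a', 2), ('a', 3), ('a', 4),
         ('b', 5), ('b', 6), ('b', 7), ('b', 8),
         ('c', 9), ('c', 10), ('c', 11), ('c', 12)]

arr = np.array(pairs, dtype=dt)       # structured array, as in Out[82]
rec = arr.view(np.recarray)           # recarray view: fields via rec.string, rec.int
rec2 = np.rec.array(pairs, dtype=dt)  # or build the recarray directly
Both give attribute access to the fields; the underlying data is the same either way.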

In addition to @hpaulj's methods, you could also allocate and then fill the array, like so:
>>> dtype = [('string','|U5'),('int', '<i4')]
>>> list1 = ["a","b","c"]
>>> list2 = [1,2,3,4,5,6,7,8,9,10,11,12]
>>>
>>> result = np.recarray((12,), dtype=dtype)
>>> result['string'].reshape(3, 4).T[...] = list1
>>> result['int'] = list2
>>> result
rec.array([('a', 1), ('a', 2), ('a', 3), ('a', 4), ('b', 5),
('b', 6), ('b', 7), ('b', 8), ('c', 9), ('c', 10),
('c', 11), ('c', 12)],
dtype=[('string', '<U5'), ('int', '<i4')])
The (small) advantage here is that one can use broadcasting on list1.
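For completeness, here is another route as a sketch (not from either answer, assuming the same grouping of list2 into blocks of four and the same field names): build each column as a full-length array first and let np.rec.fromarrays assemble the recarray.
import numpy as np

list1 = ["a", "b", "c"]
list2 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

# Repeat each label to cover its block of four values, then combine the columns.
labels = np.repeat(list1, len(list2) // len(list1))   # ['a' 'a' 'a' 'a' 'b' ... 'c']
rec = np.rec.fromarrays([labels, list2],
                        dtype=[('string', '|U5'), ('int', '<i4')])
This keeps the grouping logic in np.repeat and avoids writing the pairing comprehension by hand.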

Related

Window-based averages based on ranges within another table

I have a table that has positions like so:
create or replace table data (
pos int not null,
val float not null,
constraint data_pk primary key (pos)
);
And, a ranges type table like so:
create or replace table ranges (
label varchar(32) not null,
left int not null,
right int not null,
constraint ranges_pk primary key (label)
);
with ranges like
('alpha', 11, 13),
('bravo', 11, 14),
('charlie', 11, 15),
('echo', 12, 18),
('delta', 12, 19),
('foxtrot', 13, 20)
For each label, I need to look up every possible 3-position subrange within the "data" table, take each of those 3-position subrange averages, and then average them...
I couldn't think of a good way to describe what I'm after, so I thought I'd show what I'd expect for 'charlie':
The results for charlie in the select should be:
('charlie', 40.111), -- avg(avg(data[pos=11], data[pos=12], data[pos=13]), avg(data[pos=12], data[pos=13], data[pos=14]), avg(data[pos=13], data[pos=14], data[pos=15]))
-- -> avg(avg(31, 37, 41), avg(37, 41, 43), avg(41, 43, 47))
-- -> avg(36.333, 40.333, 43.667) -> 40.111
(for data like)
insert into data (pos, val) values
(1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
(9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
(17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
(25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
(33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
(41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
(49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251);
Is there a way to do this within Snowflake SQL? Or must I resort to Python to do this? If it helps, I made a gist with more data.
Thanks!
Is there a way to do this within Snowflake SQL? Or must I resort to python to do this?
The SQL language is expressive enough to handle such a case.
The key point here is to use a windowed average with a window size of 3 and then average the moving averages:
WITH cte AS (
SELECT r.label, r.left, r.right, d.val,
AVG(d.val) OVER(PARTITION BY r.label ORDER BY d.pos ROWS
BETWEEN 2 PRECEDING AND CURRENT ROW) AS r
FROM ranges r
JOIN data d
ON d.pos BETWEEN r.left AND r.right
QUALIFY ROW_NUMBER() OVER(PARTITION BY r.label ORDER BY d.pos) > 2
)
SELECT label, AVG(r) AS output
FROM cte
GROUP BY label
ORDER BY label;
Output:
An intermediate step to illustrate:
WITH cte AS (
SELECT r.label, r.left, r.right, d.val,
AVG(d.val) OVER(PARTITION BY r.label ORDER BY d.pos
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS r
FROM ranges r
JOIN data d
ON d.pos BETWEEN r.left AND r.right
QUALIFY ROW_NUMBER() OVER(PARTITION BY r.label ORDER BY d.pos) > 2
)
SELECT *
FROM cte
ORDER BY label, r;
Output:
Here is a step-by-step answer:
WITH data(pos, val) AS (
SELECT * FROM VALUES
(1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
(9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
(17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
(25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
(33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
(41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
(49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251)
), codes(name,s_val, e_val) AS (
SELECT * FROM VALUES
('alpha', 11, 13),
('bravo', 11, 14),
('charlie', 11, 15),
('echo', 12, 18),
('delta', 12, 19),
('foxtrot', 13, 20)
), ranges as (
SELECT row_number() over (order by null)-1 as seq
FROM table(generator(rowcount => 200))
), boost_codes AS (
select c.name
,c.s_val + r.seq + 0 as b1
,c.s_val + r.seq + 2 as b3
from codes as c
join ranges as r
ON r.seq <= (e_val - s_val - 2)
), almost_there AS (
select
bc.name
,avg(d.val) as partial
from boost_codes as bc
join data as d ON d.pos between bc.b1 and bc.b3
GROUP BY 1, bc.b1
)
SELECT name
,round(avg(partial),3) as output
FROM almost_there
GROUP BY 1
ORDER BY 1;
which gives:
NAME        OUTPUT
-------     ------
alpha       36.333
bravo       38.333
charlie     40.111
delta       50.778
echo        48.467
foxtrot     55.111

Most efficient way of finding duplicates in SQL Server

The fiddle:
CREATE TABLE person
([first_name] varchar(10), [surname] varchar(10), [date_of_birth] date, [person_id] int);
INSERT INTO person
([first_name], [surname], [date_of_birth] ,[person_id])
VALUES
('Alice', 'AA', '1/1/1990', 1),
('Bob' , 'BB', '1/1/1990', 3),
('Carol', 'CC', '1/1/1990', 4),
('Kate' , 'KK', '1/1/1990', 7);
CREATE TABLE person_membership
([person_id] int, [status_flag] varchar(1), [membership_id] int);
INSERT INTO person_membership
([person_id], [status_flag], [membership_id])
VALUES
(1, 'A', 10),
(1, 'A', 20),
(3, 'A', 30),
(4, 'A', 40),
(7, 'A', 60),
(7, 'T', 70);
CREATE TABLE memship
([membership_id] int, [memship_status] varchar(1));
INSERT INTO memship
([membership_id], [memship_status])
VALUES
(10, 'A'),
(20, 'A'),
(30, 'A'),
(40, 'A'),
(50, 'T'),
(60, 'A'),
(70, 'A');
The query:
WITH t AS
(SELECT first_name, surname, date_of_birth, p.person_id, m.membership_id
FROM person p
INNER JOIN person_membership pm ON p.person_id=pm.person_id
INNER JOIN memship m ON pm.membership_id = m.membership_id
WHERE pm.status_flag='A' and m.memship_status='A')
SELECT t.first_name, t.surname, t.date_of_birth, t.person_id, t1.membership_id
FROM t
INNER JOIN t t1 ON t.person_id=t1.person_id
GROUP BY t.first_name, t.surname, t.date_of_birth, t.person_id, t1.membership_id
HAVING count(*) > 1
The problem:
Find and display only those records marked as active that have multiple membership IDs assigned to one person ID.
The expected outcome:
The question:
My query works fine and gives me the expected outcome, but the execution plan looks rather convoluted. What are the better, more elegant, expert-recommended ways of doing it?
It seems like you don't need that big GROUP BY at all; you could use a window function inside the CTE instead:
WITH Counts AS(
SELECT p.first_name,
p.surname,
p.date_of_birth,
p.person_id,
m.membership_id,
COUNT(*) OVER (PARTITION BY p.person_id) AS PersonMemCount
FROM person p
INNER JOIN person_membership pm ON p.person_id=pm.person_id
INNER JOIN memship m ON pm.membership_id = m.membership_id
WHERE pm.status_flag='A'
AND m.memship_status='A')
SELECT C.first_name,
C.surname,
C.date_of_birth,
C.person_id,
C.membership_id
FROM Counts C
WHERE C.PersonMemCount > 1;

How to retrieve only the records where stat changes?

I want to retrieve only the records where stat changes, using the following sample data:
create table x
(
id int,
date datetime,
stat int
)
insert into x
values (1, '2017-01-01', 100), (1, '2017-01-03', 100), (1, '2017-01-05', 100),
(1, '2017-01-07', 150), (1, '2017-01-09', 150), (1, '2017-02-01', 150),
(1, '2017-02-02', 100), (1, '2017-02-12', 100), (1, '2017-02-15', 100),
(1, '2017-02-17', 150), (1, '2017-03-09', 150), (1, '2017-03-11', 150),
(2, '2017-01-01', 100), (2, '2017-01-03', 100), (2, '2017-01-05', 100),
(2, '2017-01-07', 150), (2, '2017-01-09', 150), (2, '2017-02-01', 150),
(2, '2017-02-02', 100), (2, '2017-02-12', 100), (2, '2017-02-15', 100),
(2, '2017-02-17', 150), (2, '2017-03-09', 150), (2, '2017-03-11', 150)
I tried to use something like this:
with a as
(
select
id, date,
ROW_NUMBER() over (partition by date order by id) as rowNum
from
x
), b as
(
select
id, date,
ROW_NUMBER() over (partition by id, stat order by date) as rowNum
from
x
)
select min(b.date)
from a
join b on b.id = a.id
having max(a.date) > max(b.date)
What you are looking for is a gaps-and-islands scenario where you only have islands. Here, the start of an island is defined by a change in the stat value within an id, evaluating the dataset in date order.
The lag window function is used below to compare values across rows and decide whether each row should be included in the output.
select b.id
, b.stat
, b.date
from (
select a.id
, a.date
, a.stat
, case lag(a.stat,1,NULL) over (partition by a.id order by a.date asc) when a.stat then 0 else 1 end as include_flag
from x as a
) as b
where b.include_flag = 1

How to join tables with REPEATED RECORDS

I'm struggling with joining tables when using REPEATED RECORD fields in the ON clause. The error I get is:
No matching signature for operator = for argument types: ARRAY<STRUCT<experiment INT64>>, INT64. Supported signature: ANY = ANY at [6:5]
My REPEATED RECORD is called ab_test and it has 4 fields inside (experiment, group, name, state).
My Query:
SELECT be.type, be.group, be.user.id, be.uid,
ARRAY(SELECT STRUCT(ab_test.experiment as experiment , ab_test.group as group, ab_test.name as name, ab_test.state, uid_allocation_timestamp) FROM UNNEST(ab_test) AS ab_test) as ab_test
FROM fiverr-bigquery.dwh.bi_events be
JOIN staging_tables.ab_tests_uid_allocation_history uid_alloc
ON be.uid = uid_alloc.uid
AND ***ARRAY(SELECT STRUCT(ab_test.experiment) FROM UNNEST(ab_test) AS ab_test ) = uid_alloc.test_id***
WHERE be._PARTITIONTIME = '2017-04-24 00:00:00'
AND DATE(created_at) = DATE('2017-04-24')
AND ARRAY(SELECT STRUCT(ab_test.experiment) FROM UNNEST(ab_test) AS ab_test ) IS NOT NULL
AND type = 'order.success'
I also tried replacing the second ON clause with:
CAST((SELECT experiment FROM UNNEST(ab_test) as experiment ) AS INT64) = uid_alloc.test_id
But with no luck. The error I get: Invalid cast from STRUCT<experiment INT64, group INT64, name STRING, ...> to INT64 at [40:10].
Any ideas?
I also tried replacing ... But with no luck ... Any ideas ?
Below is an attempt to mimic your use case, at least the part of it that is responsible for the error you see.
If you run the query below (BigQuery Standard SQL), you will get exactly the same error as in your case:
#standardSQL
WITH data AS (
SELECT 1 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(911, 2, 'a'), (2, 2, 'b'), (3, 2, 'c')] AS ab_test UNION ALL
SELECT 2 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(11, 3, 'a'), (12, 3, 'b'), (13, 3, 'c')] AS ab_test UNION ALL
SELECT 3 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(21, 4, 'a'), (911, 4, 'b'), (23, 4, 'c')] AS ab_test
)
SELECT id
FROM data
WHERE CAST((SELECT experiment FROM UNNEST(ab_test) AS experiment ) AS INT64) = 911
The error will be:
Error: Invalid cast from STRUCT<experiment INT64, grp INT64, name STRING> to INT64 at [12:12]
To resolve this, use the approach below:
#standardSQL
WITH data AS (
SELECT 1 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(911, 2, 'a'), (2, 2, 'b'), (3, 2, 'c')] AS ab_test UNION ALL
SELECT 2 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(11, 3, 'a'), (12, 3, 'b'), (13, 3, 'c')] AS ab_test UNION ALL
SELECT 3 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(21, 4, 'a'), (911, 4, 'b'), (23, 4, 'c')] AS ab_test
)
SELECT id
FROM data
WHERE (SELECT COUNT(1)
FROM UNNEST(ab_test) AS ab_test
WHERE ab_test.experiment = 911
) > 0
No errors now, and the output will be:
id
1
3
because those rows have elements of ab_test with experiment = 911
Finally, below is an example with test values coming from a joined table, as in your question:
#standardSQL
WITH data AS (
SELECT 1 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(911, 2, 'a'), (2, 2, 'b'), (3, 2, 'c')] AS ab_test UNION ALL
SELECT 2 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(11, 3, 'a'), (12, 3, 'b'), (13, 3, 'c')] AS ab_test UNION ALL
SELECT 3 AS id, [ STRUCT<experiment INT64, grp INT64, name STRING>
(21, 4, 'a'), (911, 4, 'b'), (23, 4, 'c')] AS ab_test
),
tests AS (
SELECT 911 AS test_id UNION ALL
SELECT 912 AS test_id
)
SELECT data.id
FROM data
CROSS JOIN tests
WHERE (SELECT COUNT(1)
FROM UNNEST(ab_test) AS ab_test
WHERE ab_test.experiment = tests.test_id
) > 0
Hopefully you can apply the above to your specific case.
When joining two tables, if the column is not qualified with the table name, it will return a STRUCT data type.
To solve this, can you try:
select table.column

SQL SELECT TOP <dynamic> BASED ON CATEGORY/PERCENTAGE

Here is the sample data that I have:
Out of the 11 rows, I need to SELECT the TOP 60% of the rows from CAT-1, 30% from CAT-2, and 10% from CAT-3. Can someone please help me with building the SQL? The target is a SQL Server 2014 DB.
I'm adding this as a new answer because my first answer is something completely different. User "Les H" brought me to this:
--Credits to @Les H
SELECT *
INTO #Test
FROM (VALUES
(1, 'A', 'CAT-1', 60),
(2, 'B', 'CAT-1', 60),
(3, 'C', 'CAT-1', 60),
(4, 'D', 'CAT-1', 60),
(5, 'E', 'CAT-1', 60),
(6, 'F', 'CAT-2', 30),
(7, 'G', 'CAT-2', 30),
(8, 'H', 'CAT-2', 30),
(9, 'I', 'CAT-3', 10),
(10, 'J', 'CAT-3', 10),
(11, 'K', 'CAT-1', 60)
) A (RowID, Customer, Category, Percentage)
SELECT Percentages.*
FROM (SELECT DISTINCT Category,Percentage FROM #Test) AS c
CROSS APPLY(SELECT TOP (c.Percentage) PERCENT * FROM #Test WHERE #Test.Category=c.Category ORDER BY #Test.RowID) AS Percentages;
DROP TABLE #Test;
The result:
1 A CAT-1 60
2 B CAT-1 60
3 C CAT-1 60
4 D CAT-1 60
6 F CAT-2 30
9 I CAT-3 10
I haven't tested the query, but you should be able to use UNION ALL:
SELECT TOP(60) PERCENT *
FROM Table1
WHERE Category = 'CAT-1'
UNION ALL
SELECT TOP(30) PERCENT *
FROM Table1
WHERE Category = 'CAT-2'
UNION ALL
SELECT TOP(10) PERCENT *
FROM Table1
WHERE Category = 'CAT-3'
Obviously you will have to define some ORDER BY criteria or the top 60% will be an arbitrary result.
Whilst you can do
DECLARE @N INT = 20
SELECT TOP (@N) PERCENT * FROM BLAH
I couldn't grok a way of setting @N for each group in your data (CROSS APPLY, anyone?).
So here's a solution using two CTEs. It's probably far from optimal :)
Test Data
SELECT *
INTO #Test
FROM (VALUES
(1, 'A', 'CAT-1', 60),
(2, 'B', 'CAT-1', 60),
(3, 'C', 'CAT-1', 60),
(4, 'D', 'CAT-1', 60),
(5, 'E', 'CAT-1', 60),
(6, 'F', 'CAT-2', 30),
(7, 'G', 'CAT-2', 30),
(8, 'H', 'CAT-2', 30),
(9, 'I', 'CAT-3', 10),
(10, 'J', 'CAT-3', 10),
(11, 'K', 'CAT-1', 60)
) A (RowID, Customer, Category, Percentage)
Solution
Here I'm ranking and counting each group in the first CTE, then setting the 'percentage bracket range' in the second CTE (this is to catch, for example, a top 10% query that only has two rows, where the brackets would be 50% and 100%).
;WITH Ranked AS (
SELECT *,
RANK() OVER (PARTITION BY Category ORDER BY RowId) * 100 RANK,
COUNT(*) OVER (PARTITION BY Category ) COUNT
FROM #Test),
Grouped AS (
SELECT *,
COALESCE(LAG(RANK) OVER (PARTITION BY Category order BY Rank) / COUNT, 0) BracketStart,
RANK / COUNT BracketEnd
FROM Ranked
)
SELECT
G.RowID
,G.Customer
,G.Category
FROM Grouped G
WHERE G.BracketEnd <= G.Percentage OR G.Percentage BETWEEN G.BracketStart AND G.BracketEnd
ORDER BY G.Category
RowID Customer Category
----------- -------- --------
1 A CAT-1
2 B CAT-1
3 C CAT-1
4 D CAT-1
6 F CAT-2
9 I CAT-3
This is an approach with dynamic SQL. First I create separate tables for customers and categories. Then a SQL command is generated. Check it out:
CREATE TABLE #Cat(CatID INT IDENTITY PRIMARY KEY,Category VARCHAR(100),Percentage INT);
INSERT INTO #Cat(Category,Percentage) VALUES('CAT-1',60),('CAT-2',30),('CAT-3',10);
CREATE TABLE #Cust(CustID INT IDENTITY PRIMARY KEY
,Customer VARCHAR(100)
,CatID INT FOREIGN KEY REFERENCES #Cat(CatID));
INSERT INTO #Cust(Customer,CatID) VALUES
('A',1),('B',1),('C',1),('D',1),('E',1),('F',2),('G',2),('H',2),('I',3),('J',3),('K',1);
DECLARE @cmd VARCHAR(MAX)=
(
SELECT STUFF
(
(
SELECT 'UNION ALL SELECT TOP(' + CAST(c.Percentage AS VARCHAR(10)) + ') PERCENT * FROM #Cust WHERE CatID=' + CAST(c.CatID AS VARCHAR(10)) + ' '
FROM #Cat AS c
FOR XML PATH('')
),1,10,''
)
);
SET @cmd='SELECT tbl.CustID,tbl.Customer,tbl.CatID,c.Category,c.Percentage FROM(' + @cmd + ') AS tbl INNER JOIN #Cat AS c ON c.CatID=tbl.CatID';
--This is the generated query
SELECT @cmd;
--And this is its execution
EXEC (@cmd);
DROP TABLE #Cust;
DROP TABLE #Cat;
The result:
1 A 1 CAT-1 60
2 B 1 CAT-1 60
3 C 1 CAT-1 60
4 D 1 CAT-1 60
6 F 2 CAT-2 30
9 I 3 CAT-3 10
