Window-based averages based on ranges within another table (snowflake-cloud-data-platform)

I have a table that has positions like so:
create or replace table data (
pos int not null,
val float not null,
constraint data_pk primary key (pos)
);
And a ranges-type table like so:
create or replace table ranges (
label varchar(32) not null,
left int not null,
right int not null,
constraint ranges_pk primary key (label)
);
with ranges like
('alpha', 11, 13),
('bravo', 11, 14),
('charlie', 11, 15),
('echo', 12, 18),
('delta', 12, 19),
('foxtrot', 13, 20)
For each label, I need to look up every possible 3-wide subrange (three consecutive positions) within the "data" table, take the average of each subrange, and then average those averages...
I couldn't think of a good way to describe what I'm after, so I thought I'd show what I'd expect for 'charlie':
The results for charlie in the select should be:
('charlie', 40.111), -- avg(avg(data[pos=11], data[pos=12], data[pos=13]), avg(data[pos=12], data[pos=13], data[pos=14]), avg(data[pos=13], data[pos=14], data[pos=15]))
-- -> avg(avg(31, 37, 41), avg(37, 41, 43), avg(41, 43, 47))
-- -> avg(36.333, 40.333, 43.667) -> 40.111
given data like this:
insert into data (pos, val) values
(1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
(9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
(17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
(25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
(33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
(41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
(49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251);
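(As a hand check, charlie's expected value can be computed directly from those positions, since 11..15 hold 31, 37, 41, 43, 47:)
-- hand check for charlie: average of the three 3-wide window averages
select ((31 + 37 + 41) / 3.0
      + (37 + 41 + 43) / 3.0
      + (41 + 43 + 47) / 3.0) / 3.0 as charlie_expected;  -- -> 40.111...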
Is there a way to do this within Snowflake SQL, or must I resort to Python? If it helps, I made a gist with more data.
Thanks!

Is there a way to do this within Snowflake SQL? Or must I resort to python to do this?
SQL is expressive enough to handle this case.
The key is to compute a windowed average with a window size of 3, and then average those moving averages:
WITH cte AS (
SELECT r.label, r.left, r.right, d.val,
AVG(d.val) OVER(PARTITION BY r.label ORDER BY d.pos ROWS
BETWEEN 2 PRECEDING AND CURRENT ROW) AS r
FROM ranges r
JOIN data d
ON d.pos BETWEEN r.left AND r.right
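-- the QUALIFY below drops the first two rows per label, whose 3-row windows would extend below r.left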
QUALIFY ROW_NUMBER() OVER(PARTITION BY r.label ORDER BY d.pos) > 2
)
SELECT label, AVG(r) AS output
FROM cte
GROUP BY label
ORDER BY label;
Output: one averaged value per label (e.g. charlie -> 40.111...; the full set matches the rounded table in the step-by-step answer below).
Intermediate step to illustrate:
WITH cte AS (
SELECT r.label, r.left, r.right, d.val,
AVG(d.val) OVER(PARTITION BY r.label ORDER BY d.pos
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS r
FROM ranges r
JOIN data d
ON d.pos BETWEEN r.left AND r.right
QUALIFY ROW_NUMBER() OVER(PARTITION BY r.label ORDER BY d.pos) > 2
)
SELECT *
FROM cte
ORDER BY label, r;
Output: the individual moving averages per label; for charlie these are the three window averages 36.333, 40.333 and 43.667.

Here is a step-by-step answer:
WITH data(pos, val) AS (
SELECT * FROM VALUES
(1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
(9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
(17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
(25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
(33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
(41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
(49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251)
), codes(name,s_val, e_val) AS (
SELECT * FROM VALUES
('alpha', 11, 13),
('bravo', 11, 14),
('charlie', 11, 15),
('echo', 12, 18),
('delta', 12, 19),
('foxtrot', 13, 20)
), ranges as (
SELECT row_number() over (order by null)-1 as seq
FROM table(generator(rowcount => 200))
), boost_codes AS (
select c.name
,c.s_val + r.seq + 0 as b1
,c.s_val + r.seq + 2 as b3
from codes as c
join ranges as r
ON r.seq <= (e_val - s_val - 2)
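-- seq enumerates the window-start offsets 0 .. (e_val - s_val - 2), i.e. one row per 3-wide subrange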
), almost_there AS (
select
bc.name
,avg(d.val) as partial
from boost_codes as bc
join data as d ON d.pos between bc.b1 and bc.b3
GROUP BY 1, bc.b1
)
SELECT name
,round(avg(partial),3) as output
FROM almost_there
GROUP BY 1
ORDER BY 1;
which gives:
NAME    | OUTPUT
--------+-------
alpha   | 36.333
bravo   | 38.333
charlie | 40.111
delta   | 50.778
echo    | 48.467
foxtrot | 55.111
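A side note: the generator/row_number idiom used in the ranges CTE above is handy on its own and easy to try in isolation:
-- produces seq = 0, 1, ..., 9 (a zero-based offset sequence)
select row_number() over (order by null) - 1 as seq
from table(generator(rowcount => 10));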

Related

Cumulative Formula That Resets If Meets Criteria (SQL Server)

Hello and Happy New Year!
I have a question regarding cumulative formulas that "reset" if the "days between last fd" is over 30. Example below.
First is a test temp table with limited info. I don't need help with the F-SDays or DaysBetweenLastFD columns; it's only the cumulative days calculation. I want the output to look like #results.
CREATE TABLE #test (ID int, startdate date, finishdate date)
INSERT INTO #test(ID, startdate, finishdate) VALUES
(123, '2019-12-30', '2019-12-31'),
(123, '2019-11-15', '2019-12-10'),
(123, '2019-09-12', '2019-10-10'),
(123, '2019-09-02', '2019-09-09'),
(123, '2019-08-30', '2019-09-01'),
(789, '2019-11-30', '2019-12-31'),
(789, '2019-11-15', '2019-11-17'),
(789, '2019-09-12', '2019-10-10'),
(789, '2019-09-02', '2019-09-04'),
(789, '2019-08-30', '2019-09-01')
select *
from #test
CREATE TABLE #results (ID int, startdate date, finishdate date, [F-SDays] int, DaysBetweenLastFD int, cumulativeDays int)
INSERT INTO #results(ID, startdate, finishdate, [F-SDays], DaysBetweenLastFD, cumulativeDays) VALUES
(123, '2019-12-30', '2019-12-31', 1, 20, 26),
(123, '2019-11-15', '2019-12-10', 25, 36, 25),
(123, '2019-09-12', '2019-10-10', 28, 3, 37),
(123, '2019-09-02', '2019-09-09', 7, 1, 9),
(123, '2019-08-30', '2019-09-01', 2, 0, 2),
(789, '2019-11-30', '2019-12-31', 31, 13, 33),
(789, '2019-11-15', '2019-11-17', 2, 36, 2),
(789, '2019-09-12', '2019-10-10', 28, 8, 32),
(789, '2019-09-02', '2019-09-04', 2, 1, 4),
(789, '2019-08-30', '2019-09-01', 2, 0, 2)
select *
from #results
if the "days between last fd" is over 30 then a newgroupflag is set. Summing the newgroupflags (till the current row) will identify the current group/ordinal of each row. Summing the datediffs per ID & groupordinal will give the reset running total:
select *, sum(datediff(day, startdate, finishdate)) over(partition by Id, groupordinal order by startdate) as runningtotal
from
(
select *, sum(newgroupflag) over (partition by id order by startdate) as groupordinal
from
(
select *,
case when lag(finishdate) over(partition by id order by startdate) < dateadd(day, -30, startdate) then 1 else 0 end as newgroupflag
from #test
) as gfl
) as src
order by ID, startdate;
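As a quick sanity check (assuming the #test and #results tables from the question are populated as above), joining the computed running total back to the expected rows should return nothing:
select r.ID, r.startdate, r.cumulativeDays, calc.runningtotal
from #results r
join (
    select *, sum(datediff(day, startdate, finishdate)) over(partition by Id, groupordinal order by startdate) as runningtotal
    from
    (
        select *, sum(newgroupflag) over (partition by id order by startdate) as groupordinal
        from
        (
            select *,
            case when lag(finishdate) over(partition by id order by startdate) < dateadd(day, -30, startdate) then 1 else 0 end as newgroupflag
            from #test
        ) as gfl
    ) as src
) as calc on calc.ID = r.ID and calc.startdate = r.startdate
where r.cumulativeDays <> calc.runningtotal;  -- no rows means the calculation matches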

UPDATE using Rank(), Row_Number excluding duplicate values

I've a dataset similar to the one below.
I need to update the base lookup table based on the values provided in the updated_CustomerId column. The base table is the same as the dataset, but it does not have the updated_CustomerId column.
The challenge here is that the base table has a unique constraint on the combination of the three columns below:
Current_CustomerID
Order_ID
OrderCategory
DESIRED OUTPUT:
After the update, either one of the old customer IDs (17360410, PK 8, or 21044488, PK 9) can be reassigned to the Updated_CustomerId.
PrimaryKey 2 will not be updated, as that would lead to a unique constraint violation; it will then be deleted, along with whichever of PrimaryKeys 8 or 9 was not re-assigned to the new id.
After everything is updated on the base table, I then delete from the base table all records where Current_CustomerID was not re-assigned to the updated_CustomerId (if different).
IF OBJECT_ID('tempdb..#DataSet') IS NOT NULL
DROP TABLE #DataSet
IF OBJECT_ID('tempdb..#BaseTable') IS NOT NULL
DROP TABLE #BaseTable
CREATE TABLE #DataSet
(
PrimaryKey INT NOT NULL CONSTRAINT [PK_dataset_ID] PRIMARY KEY,
Current_CustomerID INT NOT NULL,
Order_ID INT NOT NULL,
OrderCategory VARCHAR(50) NOT NULL,
Updated_CustomerId INT NOT NULL
)
INSERT INTO #DataSet (PrimaryKey, Current_CustomerID, Order_ID, OrderCategory, updated_CustomerId)
VALUES
(1, 17395001, 4451784, 'Kitchen', 25693110),
(2, 25693110, 4451784, 'Kitchen', 25693110),
(3, 25693110, 2083059, 'Kitchen', 25693110),
(4, 25693110, 2163679, 'Kitchen', 25693110),
(5, 25693110, 2171466, 'Kitchen', 25693110),
(6, 25693110, 2163679, 'Bathroom', 25693110),
(7, 25693110, 2171466, 'Bathroom', 25693110),
(8, 17360410, 3377931, 'Furniture', 16303984),
(9, 21044488, 3377931, 'Furniture', 16303984),
(10, 1534323, 2641714, 'Furniture', 16303984),
(11, 16303984, 2641726, 'Furniture', 16303984),
(12, 16303984, 2641793, 'Furniture', 16303984),
(13, 16303984, 2641816, 'Furniture', 16303984),
(14, 16303345, 2641816, 'Garden', 16301239),
(15, 12345678, 1239065, 'Medicine', 1075432)
CREATE TABLE #BaseTable
(
PrimaryKey INT NOT NULL CONSTRAINT [PK_baseTable_ID] PRIMARY KEY,
CustomerID INT NOT NULL,
Order_ID INT NOT NULL,
OrderCategory VARCHAR(50) NOT NULL
)
CREATE UNIQUE NONCLUSTERED INDEX [IDX_LookUp] ON #BaseTable
(
CustomerID ASC,
Order_ID ASC,
OrderCategory ASC
) ON [PRIMARY]
INSERT INTO #BaseTable (PrimaryKey, CustomerID, Order_ID, OrderCategory)
VALUES
(1, 17395001, 4451784, 'Kitchen'),
(2, 25693110, 4451784, 'Kitchen'),
(3, 25693110, 2083059, 'Kitchen'),
(4, 25693110, 2163679, 'Kitchen'),
(5, 25693110, 2171466, 'Kitchen'),
(6, 25693110, 2163679, 'Bathroom'),
(7, 25693110, 2171466, 'Bathroom'),
(8, 17360410, 3377931, 'Furniture'),
(9, 21044488, 3377931, 'Furniture'),
(10, 1534323, 2641714, 'Furniture'),
(11, 16303984, 2641726, 'Furniture'),
(12, 16303984, 2641793, 'Furniture'),
(13, 16303984, 2641816, 'Furniture'),
(14, 16303345, 2641816, 'Garden'),
(15, 12345678, 1239065, 'Medicine')
-- select * from #BaseTable
-- select * from #DataSet
;with CTE AS (
select a.*
,rank() over (partition by a.updated_CustomerId, a.Order_ID, a.OrderCategory order by a.Current_CustomerID) as flag
from #DataSet a
)
update b
set CustomerID = a.Updated_CustomerId
from #BaseTable b
inner join CTE a on b.PrimaryKey = a.PrimaryKey
where flag <> 2
This fails with:
Msg 2601, Level 14, State 1, Line 82
Cannot insert duplicate key row in object 'dbo.#BaseTable' with unique index 'IDX_LookUp'. The duplicate key value is (25693110, 4451784, Kitchen).
The statement has been terminated.
I think you just want to get a row_number for the #DataSet table, and then delete the rows where there is more than one row for the unique key:
-- ...
DELETE bt
FROM #BaseTable bt
INNER JOIN (
SELECT a.PrimaryKey,
a.Updated_CustomerId,
a.Order_ID,
a.OrderCategory,
row = ROW_NUMBER() OVER (PARTITION BY a.Updated_CustomerId, a.Order_ID, a.OrderCategory ORDER BY a.Current_CustomerID)
FROM #BaseTable b
INNER JOIN #DataSet a
ON b.PrimaryKey = a.PrimaryKey
) x
ON bt.PrimaryKey = x.PrimaryKey
AND x.row > 1
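With the duplicates removed, the straightforward update no longer trips IDX_LookUp. A sketch of the follow-up step, assuming the DELETE above has already run:
-- safe once the extra rows per (Updated_CustomerId, Order_ID, OrderCategory)
-- key have been deleted by the statement above
UPDATE b
SET CustomerID = a.Updated_CustomerId
FROM #BaseTable b
INNER JOIN #DataSet a
    ON b.PrimaryKey = a.PrimaryKey
WHERE b.CustomerID <> a.Updated_CustomerId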

Most efficient way of finding duplicates SQL Server

The fiddle:
CREATE TABLE person
([first_name] varchar(10), [surname] varchar(10), [date_of_birth] date, [person_id] int);
INSERT INTO person
([first_name], [surname], [date_of_birth] ,[person_id])
VALUES
('Alice', 'AA', '1/1/1990', 1),
('Bob' , 'BB', '1/1/1990', 3),
('Carol', 'CC', '1/1/1990', 4),
('Kate' , 'KK', '1/1/1990', 7);
CREATE TABLE person_membership
([person_id] int, [status_flag] varchar(1), [membership_id] int);
INSERT INTO person_membership
([person_id], [status_flag], [membership_id])
VALUES
(1, 'A', 10),
(1, 'A', 20),
(3, 'A', 30),
(4, 'A', 40),
(7, 'A', 60),
(7, 'T', 70);
CREATE TABLE memship
([membership_id] int, [memship_status] varchar(1));
INSERT INTO memship
([membership_id], [memship_status])
VALUES
(10, 'A'),
(20, 'A'),
(30, 'A'),
(40, 'A'),
(50, 'T'),
(60, 'A'),
(70, 'A');
The query:
WITH t AS
(SELECT first_name, surname, date_of_birth, p.person_id, m.membership_id
FROM person p
INNER JOIN person_membership pm ON p.person_id=pm.person_id
INNER JOIN memship m ON pm.membership_id = m.membership_id
WHERE pm.status_flag='A' and m.memship_status='A')
SELECT t.first_name, t.surname, t.date_of_birth, t.person_id, t1.membership_id
FROM t
INNER JOIN t t1 ON t.person_id=t1.person_id
GROUP BY t.first_name, t.surname, t.date_of_birth, t.person_id, t1.membership_id
HAVING count(*) > 1
The problem:
Find and display only those records marked as active and with multiple membership IDs assigned to one person id.
The expected outcome: only Alice (person_id 1), who has two active memberships (10 and 20), so one row per membership_id.
The question:
My query works fine and gives me the expected outcome but the execution plan looks rather convoluted. What are the better, more elegant, expert-recommended ways of doing it?
It seems like you don't need that big GROUP BY at all; you could use a windowed function inside the CTE instead, counting each person's active memberships without the self-join:
WITH Counts AS(
SELECT p.first_name,
p.surname,
p.date_of_birth,
p.person_id,
m.membership_id,
COUNT(*) OVER (PARTITION BY p.person_id) AS PersonMemCount
FROM person p
INNER JOIN person_membership pm ON p.person_id=pm.person_id
INNER JOIN memship m ON pm.membership_id = m.membership_id
WHERE pm.status_flag='A'
AND m.memship_status='A')
SELECT C.first_name,
C.surname,
C.date_of_birth,
C.person_id,
C.membership_id
FROM Counts C
WHERE C.PersonMemCount > 1;

Create a numpy.recarray from two lists (python)

Is there an easy way to create a numpy.recarray from two lists? For instance, given the following lists:
list1 = ["a","b","c"]
list2 = [1,2,3,4,5,6,7,8,9,10,11,12]
What I am trying to do is to get the following result:
rec_array = np.rec.array([('a', 1), ('a', 2),('a', 3),('a', 4),
('b', 5), ('b', 6),('b', 7),('b', 8),
('c', 9), ('c', 10),('c', 11),('c', 12)], dtype = [('string','|U5'),('int', '<i4')])
I know how a rec.array works, but I don't really know how to create one from lists. Maybe dicts could make things easier, given their key/value structure, but is there a way to do this from lists?
In [73]: list1 = ["a","b","c"]
...: list2 = [1,2,3,4,5,6,7,8,9,10,11,12]
...:
In [74]: dt = [('string','|U5'),('int', '<i4')]
A simple pairing of elements:
In [75]: [(i,j) for i, j in zip(list1,list2)]
Out[75]: [('a', 1), ('b', 2), ('c', 3)]
break list2 into 3 groups:
In [79]: list3 = [list2[i:i+4] for i in range(0,12,4)]
In [80]: list3
Out[80]: [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
double list comprehension:
In [81]: [(i,j) for i,row in zip(list1,list3) for j in row]
Out[81]:
[('a', 1),
('a', 2),
('a', 3),
('a', 4),
('b', 5),
('b', 6),
('b', 7),
('b', 8),
('c', 9),
('c', 10),
('c', 11),
('c', 12)]
make a structured array from that:
In [82]: np.array(_, dtype=dt)
Out[82]:
array([('a', 1), ('a', 2), ('a', 3), ('a', 4), ('b', 5), ('b', 6),
('b', 7), ('b', 8), ('c', 9), ('c', 10), ('c', 11), ('c', 12)],
dtype=[('string', '<U5'), ('int', '<i4')])
OR to make a (3,4) array:
In [86]: [[(i,j) for j in row] for i,row in zip(list1, list3)]
Out[86]:
[[('a', 1), ('a', 2), ('a', 3), ('a', 4)],
[('b', 5), ('b', 6), ('b', 7), ('b', 8)],
[('c', 9), ('c', 10), ('c', 11), ('c', 12)]]
In [87]: np.array(_, dt)
Out[87]:
array([[('a', 1), ('a', 2), ('a', 3), ('a', 4)],
[('b', 5), ('b', 6), ('b', 7), ('b', 8)],
[('c', 9), ('c', 10), ('c', 11), ('c', 12)]],
dtype=[('string', '<U5'), ('int', '<i4')])
In [88]: _.shape
Out[88]: (3, 4)
Or replicate list1 to the same size as list2:
In [97]: np.array([(i,j) for i,j in zip(np.repeat(list1,4),list2)],dt).reshape(3,4)
Out[97]:
array([[('a', 1), ('a', 2), ('a', 3), ('a', 4)],
[('b', 5), ('b', 6), ('b', 7), ('b', 8)],
[('c', 9), ('c', 10), ('c', 11), ('c', 12)]],
dtype=[('string', '<U5'), ('int', '<i4')])
In addition to @hpaulj's methods, you could also allocate and then fill the array like so:
>>> import numpy as np
>>> dtype = [('string','|U5'),('int', '<i4')]
>>> list1 = ["a","b","c"]
>>> list2 = [1,2,3,4,5,6,7,8,9,10,11,12]
>>>
>>> result = np.recarray((12,), dtype=dtype)
>>> result['string'].reshape(3, 4).T[...] = list1
>>> result['int'] = list2
>>> result
rec.array([('a', 1), ('a', 2), ('a', 3), ('a', 4), ('b', 5),
('b', 6), ('b', 7), ('b', 8), ('c', 9), ('c', 10),
('c', 11), ('c', 12)],
dtype=[('string', '<U5'), ('int', '<i4')])
The (small) advantage here is that one can use broadcasting on list1.

How to retrieve only the records where stat changes?

I want to get the following output: the first row per id, plus each row where stat changes from the previous row. I'm using the following sample data:
create table x
(
id int,
date datetime,
stat int
)
insert into x
values (1, '2017-01-01', 100), (1, '2017-01-03', 100), (1, '2017-01-05', 100),
(1, '2017-01-07', 150), (1, '2017-01-09', 150), (1, '2017-02-01', 150),
(1, '2017-02-02', 100), (1, '2017-02-12', 100), (1, '2017-02-15', 100),
(1, '2017-02-17', 150), (1, '2017-03-09', 150), (1, '2017-03-11', 150),
(2, '2017-01-01', 100), (2, '2017-01-03', 100), (2, '2017-01-05', 100),
(2, '2017-01-07', 150), (2, '2017-01-09', 150), (2, '2017-02-01', 150),
(2, '2017-02-02', 100), (2, '2017-02-12', 100), (2, '2017-02-15', 100),
(2, '2017-02-17', 150), (2, '2017-03-09', 150), (2, '2017-03-11', 150)
I tried to use something like this
with a as
(
select
id, date,
ROW_NUMBER() over (partition by date order by id) as rowNum
from
x
), b as
(
select
id, date,
ROW_NUMBER() over (partition by id, stat order by date) as rowNum
from
x
)
select min(b.date)
from a
join b on b.id = a.id
having max(a.date) > max(b.date)
What you are looking for is a gaps-and-islands scenario, where you only have islands. In this scenario, what defines the start of an island is a change in the stat value within an id, while evaluating the dataset in date order.
The lag window function is used below to compare values across rows and decide whether each row needs to be included in the output.
select b.id
, b.stat
, b.date
from (
select a.id
, a.date
, a.stat
, case lag(a.stat,1,NULL) over (partition by a.id order by a.date asc) when a.stat then 0 else 1 end as include_flag
from x as a
) as b
where b.include_flag = 1
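For the sample data, this should return the first row per id plus each change, i.e. (dates shown without the time part):

id | stat | date
---+------+------------
1  | 100  | 2017-01-01
1  | 150  | 2017-01-07
1  | 100  | 2017-02-02
1  | 150  | 2017-02-17
2  | 100  | 2017-01-01
2  | 150  | 2017-01-07
2  | 100  | 2017-02-02
2  | 150  | 2017-02-17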
