Related
I have a table that has positions like so:
-- One value per integer position; pos is the key the ranges are matched against.
CREATE OR REPLACE TABLE data (
    pos INT   NOT NULL,
    val FLOAT NOT NULL,
    CONSTRAINT data_pk PRIMARY KEY (pos)
);
And, a ranges type table like so:
-- Labelled position ranges; left and right are both inclusive bounds on data.pos.
CREATE OR REPLACE TABLE ranges (
    label VARCHAR(32) NOT NULL,
    left  INT         NOT NULL,
    right INT         NOT NULL,
    CONSTRAINT ranges_pk PRIMARY KEY (label)
);
with ranges like
('alpha', 11, 13),
('bravo', 11, 14),
('charlie', 11, 15),
('echo', 12, 18),
('delta', 12, 19),
('foxtrot', 13, 20)
For each label, I need to look up every possible run of 3 consecutive positions (a "3-subrange") inside that label's range in the "data" table, take the average of each of those subranges, and then average those averages.
I couldn't think of a good way to describe what I'm after, so I thought I'd show what I'd expect for 'charlie':
The results for charlie in the select should be:
('charlie', 40.111), -- avg(avg(data[pos=11], data[pos=12], data[pos=13]), avg(data[pos=12], data[pos=13], data[pos=14]), avg(data[pos=13], data[pos=14], data[pos=15]))
-- -> avg(avg(31, 37, 41), avg(37, 41, 43), avg(41, 43, 47))
-- -> avg(36.333, 40.333, 43.667) -> 40.111
(for data like)
-- Sample data: positions 1..54, where val at position n is the n-th prime number.
insert into data (pos, val) values
(1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
(9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
(17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
(25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
(33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
(41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
(49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251);
Is there a way to do this within Snowflake SQL? Or must I resort to python to do this? If it helps I made a gist with more data..
Thanks!
Is there a way to do this within Snowflake SQL? Or must I resort to python to do this?
SQL language is expressive enough to handle such case.
Key point here is to use windowed average with windows size of 3 and then average moving averages:
-- Per label: the average of every 3-row moving average inside the label's range.
WITH moving_avgs AS (
    -- One row per (label, pos) inside the range. The frame averages the current
    -- row with the two rows before it, in pos order.
    SELECT
        rng.label,
        AVG(d.val) OVER (
            PARTITION BY rng.label
            ORDER BY d.pos
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS window_avg
    FROM ranges AS rng
    INNER JOIN data AS d
        ON d.pos BETWEEN rng.left AND rng.right
    -- The first two rows of each label have fewer than 3 rows in their frame,
    -- so they are not full windows and are dropped.
    QUALIFY ROW_NUMBER() OVER (PARTITION BY rng.label ORDER BY d.pos) > 2
)
SELECT
    label,
    AVG(window_avg) AS output
FROM moving_avgs
GROUP BY label
ORDER BY label;
Output:
Intermediate step to illustrate:
-- Intermediate view: the individual 3-row moving averages (column r) before
-- they are averaged per label.
WITH cte AS (
    SELECT
        rng.label,
        rng.left,
        rng.right,
        d.val,
        AVG(d.val) OVER (
            PARTITION BY rng.label
            ORDER BY d.pos
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS r
    FROM ranges AS rng
    INNER JOIN data AS d
        ON d.pos BETWEEN rng.left AND rng.right
    -- Keep only rows whose frame holds a full 3 rows.
    QUALIFY ROW_NUMBER() OVER (PARTITION BY rng.label ORDER BY d.pos) > 2
)
SELECT *
FROM cte
ORDER BY
    label,
    r;
Output:
Here is a step by step answer:
-- Self-contained version: inline sample data, then enumerate every 3-wide
-- window per code explicitly instead of using a window frame.
WITH data (pos, val) AS (
    SELECT * FROM VALUES
        (1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
        (9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
        (17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
        (25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
        (33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
        (41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
        (49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251)
),
codes (name, s_val, e_val) AS (
    SELECT * FROM VALUES
        ('alpha', 11, 13),
        ('bravo', 11, 14),
        ('charlie', 11, 15),
        ('echo', 12, 18),
        ('delta', 12, 19),
        ('foxtrot', 13, 20)
),
offsets AS (
    -- Zero-based offsets 0..199. A code spanning more than 202 positions
    -- would need more than 200 windows and would lose the excess ones.
    SELECT ROW_NUMBER() OVER (ORDER BY NULL) - 1 AS seq
    FROM TABLE(GENERATOR(ROWCOUNT => 200))
),
windows AS (
    -- One row per window [b1, b3] of 3 positions that fits inside [s_val, e_val].
    SELECT
        c.name,
        c.s_val + o.seq + 0 AS b1,
        c.s_val + o.seq + 2 AS b3
    FROM codes AS c
    INNER JOIN offsets AS o
        ON o.seq <= (c.e_val - c.s_val - 2)
),
window_avgs AS (
    -- Average of the data values falling inside each window.
    SELECT
        w.name,
        AVG(d.val) AS partial
    FROM windows AS w
    INNER JOIN data AS d
        ON d.pos BETWEEN w.b1 AND w.b3
    GROUP BY
        w.name,
        w.b1
)
SELECT
    name,
    ROUND(AVG(partial), 3) AS output
FROM window_avgs
GROUP BY name
ORDER BY name;
which gives:
NAME
OUTPUT
alpha
36.333
bravo
38.333
charlie
40.111
delta
50.778
echo
48.467
foxtrot
55.111
The fiddle:
-- Fiddle schema (T-SQL): people, person-to-membership links, and memberships.
CREATE TABLE person (
    [first_name]    varchar(10),
    [surname]       varchar(10),
    [date_of_birth] date,
    [person_id]     int
);

INSERT INTO person ([first_name], [surname], [date_of_birth], [person_id])
VALUES
    ('Alice', 'AA', '1/1/1990', 1),
    ('Bob',   'BB', '1/1/1990', 3),
    ('Carol', 'CC', '1/1/1990', 4),
    ('Kate',  'KK', '1/1/1990', 7);

-- Link rows; the queries below treat status_flag = 'A' as an active link.
CREATE TABLE person_membership (
    [person_id]     int,
    [status_flag]   varchar(1),
    [membership_id] int
);

INSERT INTO person_membership ([person_id], [status_flag], [membership_id])
VALUES
    (1, 'A', 10),
    (1, 'A', 20),
    (3, 'A', 30),
    (4, 'A', 40),
    (7, 'A', 60),
    (7, 'T', 70);

-- Memberships; the queries below treat memship_status = 'A' as active.
CREATE TABLE memship (
    [membership_id]  int,
    [memship_status] varchar(1)
);

INSERT INTO memship ([membership_id], [memship_status])
VALUES
    (10, 'A'),
    (20, 'A'),
    (30, 'A'),
    (40, 'A'),
    (50, 'T'),
    (60, 'A'),
    (70, 'A');
The query:
-- Active (person, membership) pairs, self-joined on person so that COUNT(*)
-- per output row equals the number of active memberships that person holds.
WITH active_pairs AS (
    SELECT
        first_name,
        surname,
        date_of_birth,
        p.person_id,
        m.membership_id
    FROM person AS p
    INNER JOIN person_membership AS pm
        ON p.person_id = pm.person_id
    INNER JOIN memship AS m
        ON pm.membership_id = m.membership_id
    WHERE pm.status_flag = 'A'
      AND m.memship_status = 'A'
)
SELECT
    lhs.first_name,
    lhs.surname,
    lhs.date_of_birth,
    lhs.person_id,
    rhs.membership_id
FROM active_pairs AS lhs
INNER JOIN active_pairs AS rhs
    ON lhs.person_id = rhs.person_id
GROUP BY
    lhs.first_name,
    lhs.surname,
    lhs.date_of_birth,
    lhs.person_id,
    rhs.membership_id
HAVING COUNT(*) > 1;
The problem:
Find and display only those records that are marked as active and where more than one membership ID is assigned to the same person ID.
The expected outcome:
The question:
My query works fine and gives me the expected outcome but the execution plan looks rather convoluted. What are the better, more elegant, expert-recommended ways of doing it?
Seems like you don't need that big GROUP BY at all, you could use a windowed function inside the CTE instead:
-- Same result without the self-join: count active memberships per person with
-- a window function, then keep people who have more than one.
WITH membership_counts AS (
    SELECT
        p.first_name,
        p.surname,
        p.date_of_birth,
        p.person_id,
        m.membership_id,
        COUNT(*) OVER (PARTITION BY p.person_id) AS active_membership_count
    FROM person AS p
    INNER JOIN person_membership AS pm
        ON p.person_id = pm.person_id
    INNER JOIN memship AS m
        ON pm.membership_id = m.membership_id
    WHERE pm.status_flag = 'A'
      AND m.memship_status = 'A'
)
SELECT
    mc.first_name,
    mc.surname,
    mc.date_of_birth,
    mc.person_id,
    mc.membership_id
FROM membership_counts AS mc
WHERE mc.active_membership_count > 1;
I have a table called DimWorkerCode. It has a column called WorkerCode. This is our business key here. Changes that can happen to a WorkerCode is UnitCode and WindowsID as shown in figure below.
I want to ignore WindowsID, and just select WorkerCode, Unitcode and StartDate which will be minimum StartDate and EndDate which will be maximum EndDate.
I tried this query:
-- One row per (WorkerCode, UnitCode): earliest StartDate and latest EndDate,
-- with a NULL EndDate replaced by the far-future date 9999/12/31 before MAX.
SELECT
    dwc.WorkerCode,
    dwc.UnitCode,
    MIN(dwc.StartDate) AS StartDate,
    MAX(ISNULL(dwc.EndDate, '9999/12/31')) AS EndDate
FROM dbo.DimWorkerCode AS dwc
GROUP BY
    dwc.WorkerCode,
    dwc.UnitCode;
and got this result set:
But I am expecting result something like this.
How can I do it in T-SQL? Help please.
The issue is that you are trying to pull some data without grouping on it (the enddate that goes with min(startdate)).
I'm not certain this is the best solution, but it should work. Using Row_Number(), we're listing the records by min(startdate) and max(enddate) without grouping them; then you pull the records at the beginning of both listings.
-- For each (WorkerCode, UnitCode) pair, return the row with the earliest
-- StartDate and the row with the latest EndDate (these may be the same row).
SELECT
    ranked.WorkerCode,
    ranked.UnitCode,
    ranked.StartDate,
    ranked.EndDate
FROM (
    SELECT
        WorkerCode,
        UnitCode,
        StartDate,
        EndDate,  -- the comma here was missing, which made the statement a syntax error
        ROW_NUMBER() OVER (
            PARTITION BY WorkerCode, UnitCode
            ORDER BY StartDate
        ) AS MinStartDateRow,
        -- A NULL EndDate means the row is still open, so it must rank as the
        -- latest end. SQL Server sorts NULLs last under DESC, hence the
        -- explicit NULL-first key.
        ROW_NUMBER() OVER (
            PARTITION BY WorkerCode, UnitCode
            ORDER BY
                CASE WHEN EndDate IS NULL THEN 0 ELSE 1 END,
                EndDate DESC
        ) AS MaxEndDateRow
    FROM dbo.DimWorkerCode
) AS ranked
WHERE ranked.MinStartDateRow = 1
   OR ranked.MaxEndDateRow = 1;
If I understand your question correctly and you want to get min and max dates, next approach may help. The important part here is to define groups (each new group begins when WorkerCode or Unitcode are changed).
Table:
-- Sample dimension table: one row per validity period of a worker code.
-- A NULL EndDate marks the period that is still open.
CREATE TABLE DimWorkerCode (
    ID         int,
    WorkerCode varchar(4),
    UnitCode   varchar(4),
    WindowID   int,
    StartDate  date,
    EndDate    date
);

INSERT INTO DimWorkerCode (ID, WorkerCode, UnitCode, WindowID, StartDate, EndDate)
VALUES
    (1,  'AA01', 'AA00', 2,  '2007-01-01', '2008-01-01'),
    (2,  'AA01', 'AA00', 5,  '2008-01-01', '2008-01-01'),
    (3,  'AA01', 'AA00', 3,  '2009-01-01', '2010-01-01'),
    (4,  'AA01', 'XYZ0', 9,  '2010-01-01', '2011-01-01'),
    (5,  'AA01', 'XYZ0', 12, '2011-01-01', '2012-01-01'),
    (6,  'AA01', 'AA00', 13, '2012-01-01', '2013-01-01'),
    (7,  'AA01', 'AA00', 24, '2013-01-01', '2014-01-01'),
    (8,  'AA01', 'AA00', 17, '2014-01-01', '2015-01-01'),
    (9,  'AA01', 'AA00', 18, '2015-01-01', '2016-01-01'),
    (10, 'AA01', 'AA00', 22, '2016-01-01', NULL);
Statement:
-- Collapse consecutive rows (in ID order) that share WorkerCode and UnitCode
-- into one row spanning the earliest StartDate to the latest EndDate.
;WITH change_flags AS (
    -- Change = 1 on the first row and whenever either code differs from the
    -- previous row; LAG yields NULL on the first row, so it falls to ELSE.
    SELECT
        ID,
        WorkerCode,
        UnitCode,
        StartDate,
        EndDate,
        CASE
            WHEN UnitCode = LAG(UnitCode) OVER (ORDER BY ID)
             AND WorkerCode = LAG(WorkerCode) OVER (ORDER BY ID)
                THEN 0
            ELSE 1
        END AS Change
    FROM DimWorkerCode
),
islands AS (
    -- The running total of change flags gives each consecutive run its own GroupID.
    SELECT
        WorkerCode,
        UnitCode,
        StartDate,
        EndDate,
        SUM(Change) OVER (ORDER BY ID) AS GroupID
    FROM change_flags
)
SELECT
    MAX(WorkerCode) AS WorkerCode,
    MAX(UnitCode) AS UnitCode,
    MIN(StartDate) AS StartDate,
    MAX(ISNULL(EndDate, '9999/12/31')) AS EndDate
FROM islands
GROUP BY GroupID;
Output:
WorkerCode UnitCode StartDate EndDate
AA01 AA00 01/01/2007 00:00:00 01/01/2010 00:00:00
AA01 XYZ0 01/01/2010 00:00:00 01/01/2012 00:00:00
AA01 AA00 01/01/2012 00:00:00 31/12/9999 00:00:00
I took the idea from Zhorov's answer and modified it to meet my exact requirement.
-- Sample table and rows used by the query that follows.
-- The last row has a NULL EndDate, i.e. its period is still open.
CREATE TABLE DimWorkerCode (
    ID         int,
    WorkerCode varchar(4),
    UnitCode   varchar(4),
    WindowID   int,
    StartDate  date,
    EndDate    date
);

INSERT INTO DimWorkerCode (ID, WorkerCode, UnitCode, WindowID, StartDate, EndDate)
VALUES
    (1,  'AA01', 'AA00', 2,  '2007-01-01', '2008-01-01'),
    (2,  'AA01', 'AA00', 5,  '2008-01-01', '2008-01-01'),
    (3,  'AA01', 'AA00', 3,  '2009-01-01', '2010-01-01'),
    (4,  'AA01', 'XYZ0', 9,  '2010-01-01', '2011-01-01'),
    (5,  'AA01', 'XYZ0', 12, '2011-01-01', '2012-01-01'),
    (6,  'AA01', 'AA00', 13, '2012-01-01', '2013-01-01'),
    (7,  'AA01', 'AA00', 24, '2013-01-01', '2014-01-01'),
    (8,  'AA01', 'AA00', 17, '2014-01-01', '2015-01-01'),
    (9,  'AA01', 'AA00', 18, '2015-01-01', '2016-01-01'),
    (10, 'AA01', 'AA00', 22, '2016-01-01', NULL);
Here it goes
-- Islands of consecutive rows (in ID order) with the same WorkerCode and
-- UnitCode. An open-ended EndDate is carried through as 9999-12-31 so MAX can
-- pick it, then turned back into NULL in the final result.
WITH with_previous AS (
    -- LAG defaults to the row's own value, so the first row is never flagged
    -- as a change.
    SELECT
        ID,
        WorkerCode,
        LAG(WorkerCode, 1, WorkerCode) OVER (ORDER BY ID) AS PrevWorkerCode,
        UnitCode,
        LAG(UnitCode, 1, UnitCode) OVER (ORDER BY ID) AS PrevUnitCode,
        StartDate,
        ISNULL(EndDate, '9999/12/31') AS EndDate
    FROM DimWorkerCode
),
flagged AS (
    SELECT
        ID,
        WorkerCode,
        UnitCode,
        StartDate,
        EndDate,
        CASE
            WHEN WorkerCode = PrevWorkerCode AND UnitCode = PrevUnitCode THEN 0
            ELSE 1
        END AS Changed
    FROM with_previous
),
grouped AS (
    -- The running total of change flags numbers the islands.
    SELECT
        WorkerCode,
        UnitCode,
        StartDate,
        EndDate,
        SUM(Changed) OVER (ORDER BY ID) AS GroupID
    FROM flagged
),
spans AS (
    SELECT
        MAX(WorkerCode) AS WorkerCode,
        MAX(UnitCode) AS UnitCode,
        MIN(StartDate) AS StartDate,
        MAX(EndDate) AS EndDate
    FROM grouped
    GROUP BY GroupID
)
SELECT
    WorkerCode,
    UnitCode,
    StartDate,
    CASE WHEN EndDate = '9999-12-31' THEN NULL ELSE EndDate END AS EndDate
FROM spans
Output:
Using MSSQL: Just for clarification
Customer Table
CustomerNumber Name
===================
1 David
2 Thomas
3 Mangold
4 Issac
------------------------------------------------------------
CustomerAddress Table
CustomerNumber State EffectiveDate
==================================
1 AL 01/01/2017
1 VA 06/01/2017
1 GA 02/01/2018
1 FL 10/01/2018
2 TX 01/01/2017
3 MA
4 IL 04/01/2015
SalesOrder Table
CUSTOMER ORDERNo OrderDate
========================
1 1000 03/01/2017
2 1001 10/10/2017
1 1002 11/01/2017
3 1003 12/01/2017
4 1004 01/01/2018
1 1005 02/01/2018
1 1006 01/01/2019
I need to fetch all the orders with the customer detail and the customer address on the order date.
-- Asker's incomplete attempt: list every order with the customer's name and state.
-- T4 computes MAX(EFFECTIVEDATE) per customer; the commented-out WHERE is the
-- per-order date filter the asker wants, which this derived table cannot apply
-- because it has no access to T1.
-- NOTE(review): MAX(EFFECTIVEDATE) has no alias although the ON clause reads
-- T4.EFFECTIVEDATE, and the column names used here (CUSTOMER on CUSTOMERADDRESS,
-- T2.NAME) do not match the fiddle schema shown later — confirm against the real tables.
SELECT T1.ORDERNo, T1.ORDERDATE, T1.CUSTOMER, T2.NAME, T3.STATE
FROM SALESORDER T1, CUSTOMER T2, CUSTOMERADDRESS T3
RIGHT JOIN(
SELECT CUSTOMER, MAX(EFFECTIVEDATE) FROM CUSTOMERADDRESS
--WHERE EFFECTIVEDATE <= T1.ORDERDATE
GROUP BY CUSTOMER)T4
ON T3.CUSTOMER = T4.CUSTOMER AND T3.EFFECTIVEDATE=T4.EFFECTIVEDATE
WHERE T1.CUSTOMER = T2.CUSTOMERNUMBER
AND T1.CUSTOMER = T3.CUSTOMERNUMBER
I want to know how to compare the first table with the third table inside the join; see the commented-out line --WHERE EFFECTIVEDATE <= T1.ORDERDATE
If I uncomment that line, the query fails, because the derived table in the join cannot reference a table outside it.
The expected output is:
CUSTOMER ORDERNo OrderDate CustomerName State
=============================================
1 1000 03/01/2017 David AL
2 1001 10/10/2017 Thomas TX
1 1002 11/01/2017 David VA
3 1003 12/01/2017 Mangold MA
4 1004 01/01/2018 Issac IL
1 1005 02/01/2018 David GA
1 1006 01/01/2019 David FL
The tables in sql fiddle http://sqlfiddle.com/#!18/9eecb:
-- SQL Server fiddle schema for the customer / address-history / order example.
-- Identifiers are written unquoted: single quotes produce string literals, not
-- column names. Date values are quoted: an unquoted 01/01/2017 is integer division.
CREATE TABLE Customer (
    CustomerNumber int,
    CustomerName   varchar(30)
);

INSERT INTO Customer (CustomerNumber, CustomerName)
VALUES
    (1, 'David'),
    (2, 'Thomas'),
    (3, 'Mangold'),
    (4, 'Issac');

-- Address history: the state a customer lives in from EffectiveDate onwards.
CREATE TABLE CustomerAddress (
    CustomerNumber int,
    State          varchar(2),
    EffectiveDate  date
);

INSERT INTO CustomerAddress (CustomerNumber, State, EffectiveDate)
VALUES
    (1, 'AL', '01/01/2017'),
    (1, 'VA', '06/01/2017'),
    (1, 'GA', '02/01/2018'),
    (1, 'FL', '10/01/2018'),
    (2, 'TX', '01/01/2017'),
    (3, 'MA', NULL),  -- no effective date recorded for this address
    (4, 'IL', '04/01/2015');

CREATE TABLE SalesOrder (
    CUSTOMER  int,
    ORDERNO   int,
    OrderDate date
);

INSERT INTO SalesOrder (CUSTOMER, ORDERNO, OrderDate)
VALUES
    (1, 1000, '03/01/2017'),
    (2, 1001, '10/10/2017'),
    (1, 1002, '11/01/2017'),
    (3, 1003, '12/01/2017'),
    (4, 1004, '01/01/2018'),
    (1, 1005, '02/01/2018'),
    (1, 1006, '01/01/2019');
-- NOTE(review): second attempt at the CustomerAddress and SalesOrder tables, this
-- time with backtick-quoted identifiers (MySQL-style quoting) — confirm the intended engine.
-- Problems visible in the text as written:
--   * 'State', 'ORDERNO', 'EffectiveDate' and 'OrderDate' are single-quoted, which
--     makes them string literals rather than column names;
--   * the date values are unquoted, so 01/01/2017 is parsed as arithmetic;
--   * the row (3, 'MA',) ends with a dangling comma and no third value;
--   * `CUSTOMER ` in the SalesOrder insert has a trailing space inside the
--     backticks, so it does not match the declared `CUSTOMER` column.
CREATE TABLE CustomerAddress
(`CustomerNumber` int, 'State' varchar(2), `EffectiveDate` date)
;
INSERT INTO CustomerAddress
(`CustomerNumber`, `State`, 'EffectiveDate')
VALUES
(1, 'AL', 01/01/2017),
(1, 'VA', 06/01/2017),
(1, 'GA', 02/01/2018),
(1, 'FL', 10/01/2018),
(2, 'TX', 01/01/2017),
(3, 'MA',),
(4, 'IL', 04/01/2015)
;
CREATE TABLE SalesOrder
(`CUSTOMER` int, 'ORDERNO' int, `OrderDate` Date)
;
INSERT INTO SalesOrder
(`CUSTOMER `, `ORDERNO`, 'OrderDate')
VALUES
(1, 1000, 03/01/2017),
(2, 1001, 10/10/2017),
(1, 1002, 11/01/2017),
(3, 1003, 12/01/2017),
(4, 1004, 01/01/2018),
(1, 1005, 02/01/2018),
(1, 1006, 01/01/2019)
;
'sql server version'
-- Working SQL Server schema and sample rows for the customer/order example.
CREATE TABLE Customer (
    CustomerNumber int,
    CustomerName   varchar(30)
);

INSERT INTO Customer (CustomerNumber, CustomerName)
VALUES
    (1, 'David'),
    (2, 'Thomas'),
    (3, 'Mangold'),
    (4, 'Issac');

-- Address history: one row per state a customer has lived in.
CREATE TABLE CustomerAddress (
    CustomerNumber int,
    State          varchar(2),
    EffectiveDate  date
);

INSERT INTO CustomerAddress (CustomerNumber, State, EffectiveDate)
VALUES
    (1, 'AL', '01/01/2017'),
    (1, 'VA', '06/01/2017'),
    (1, 'GA', '02/01/2018'),
    (1, 'FL', '10/01/2018'),
    (2, 'TX', '01/01/2017'),
    (4, 'IL', '04/01/2015');

-- Customer 3 has an address without an effective date, so EffectiveDate is
-- left out of the column list and stays NULL.
INSERT INTO CustomerAddress (CustomerNumber, State)
VALUES
    (3, 'MA');

CREATE TABLE SalesOrder (
    CUSTOMER  int,
    ORDERNO   int,
    OrderDate Date
);

INSERT INTO SalesOrder (CUSTOMER, ORDERNO, OrderDate)
VALUES
    (1, 1000, '03/01/2017'),
    (2, 1001, '10/10/2017'),
    (1, 1002, '11/01/2017'),
    (3, 1003, '12/01/2017'),
    (4, 1004, '01/01/2018'),
    (1, 1005, '02/01/2018'),
    (1, 1006, '01/01/2019');
The problem: Need to Pick all the Sales Orders and their customer Name, and the Customer Address. The important and tricky part is the customer address changes based on the date of the sales order.
--MODIFIED VERSION OF THE INCOMPLETE QUERY
-- Runs against the SQL Server schema, but T4 picks each customer's greatest
-- EffectiveDate with no regard to the order date, so every order is matched
-- to the customer's most recent address rather than the address in effect on
-- OrderDate. The two commented-out lines are the per-order filters the asker
-- wants but cannot express inside this derived table.
-- NOTE(review): for a customer whose only EffectiveDate is NULL, the equality
-- on EffectiveDate4 cannot match, so that customer's orders look likely to
-- drop out of the result — verify against the sample data.
SELECT T1.ORDERNo, T1.ORDERDATE, T1.CUSTOMER, T2.CustomerName, T3.STATE
FROM CUSTOMER T2, SALESORDER T1 INNER JOIN CUSTOMERADDRESS T3 ON T1.CUSTOMER = T3.CUSTOMERNUMBER
RIGHT JOIN(
SELECT CustomerNumber, MAX(EFFECTIVEDATE) as EffectiveDate4 FROM CUSTOMERADDRESS
--WHERE EFFECTIVEDATE < T1.ORDERDATE
GROUP BY CustomerNumber
--HAVING EFFECTIVEDATE < T1.ORDERDATE
) T4
ON T3.CustomerNumber = T4.CustomerNumber AND T3.EFFECTIVEDATE=T4.EffectiveDate4
WHERE T1.CUSTOMER = T2.CUSTOMERNUMBER
OUTER APPLY should solve your problem, because the applied subquery can reference columns of the order row it is evaluated for. Adapt the query to your needs:
-- Every sales order with the customer's name and the state the customer lived
-- in on the order date.
SELECT
    so.ORDERNO,
    so.OrderDate,
    so.CUSTOMER,
    c.CustomerName,
    addr.State
FROM SalesOrder AS so
INNER JOIN Customer AS c
    ON c.CustomerNumber = so.CUSTOMER
-- OUTER APPLY evaluates the subquery once per order, so it can filter on that
-- order's date. OUTER (rather than CROSS) keeps orders with no matching address.
OUTER APPLY (
    SELECT TOP (1)
        ca.State
    FROM CustomerAddress AS ca
    WHERE ca.CustomerNumber = so.CUSTOMER
      -- An address with no EffectiveDate is treated as always in effect.
      AND (ca.EffectiveDate IS NULL OR ca.EffectiveDate <= so.OrderDate)
    -- Latest dated address first. NULL sorts last under DESC in SQL Server, so
    -- an undated address is used only when no dated one qualifies.
    ORDER BY ca.EffectiveDate DESC
) AS addr
ORDER BY so.ORDERNO;
I want to get same output:
using the following sample data
-- Sample history: stat observed for an id at a given date.
CREATE TABLE x (
    id   int,
    date datetime,
    stat int
);

INSERT INTO x (id, date, stat)
VALUES
    (1, '2017-01-01', 100), (1, '2017-01-03', 100), (1, '2017-01-05', 100),
    (1, '2017-01-07', 150), (1, '2017-01-09', 150), (1, '2017-02-01', 150),
    (1, '2017-02-02', 100), (1, '2017-02-12', 100), (1, '2017-02-15', 100),
    (1, '2017-02-17', 150), (1, '2017-03-09', 150), (1, '2017-03-11', 150),
    (2, '2017-01-01', 100), (2, '2017-01-03', 100), (2, '2017-01-05', 100),
    (2, '2017-01-07', 150), (2, '2017-01-09', 150), (2, '2017-02-01', 150),
    (2, '2017-02-02', 100), (2, '2017-02-12', 100), (2, '2017-02-15', 100),
    (2, '2017-02-17', 150), (2, '2017-03-09', 150), (2, '2017-03-11', 150);
I tried to use something like this
-- Asker's attempt: number the rows two different ways, join the two
-- numberings on id, and compare the overall maximum dates.
WITH numbered_by_date AS (
    SELECT
        id,
        date,
        ROW_NUMBER() OVER (PARTITION BY date ORDER BY id) AS rowNum
    FROM x
),
numbered_by_id_stat AS (
    SELECT
        id,
        date,
        ROW_NUMBER() OVER (PARTITION BY id, stat ORDER BY date) AS rowNum
    FROM x
)
SELECT MIN(s.date)
FROM numbered_by_date AS d
INNER JOIN numbered_by_id_stat AS s
    ON s.id = d.id
HAVING MAX(d.date) > MAX(s.date)
What you are looking for is a gaps-and-islands scenario, where you only have islands. In this scenario what defines the start of an island is a change in the stat value within a id, while evaluating the dataset in date order.
The lag window function is used below to compare values across rows, and see if you need to include it in the output.
-- Start of each island: rows whose stat differs from the previous row of the
-- same id (in date order), plus the first row of every id.
WITH flagged AS (
    SELECT
        src.id,
        src.date,
        src.stat,
        -- LAG returns NULL on the first row of an id; the comparison is then
        -- not true, so that row falls through to ELSE and is included.
        CASE
            WHEN LAG(src.stat, 1, NULL) OVER (PARTITION BY src.id ORDER BY src.date ASC) = src.stat THEN 0
            ELSE 1
        END AS include_flag
    FROM x AS src
)
SELECT
    flagged.id,
    flagged.stat,
    flagged.date
FROM flagged
WHERE flagged.include_flag = 1