Related
I have a table that has positions like so:
-- Sample measurements: one value per integer position (pos is the primary key).
CREATE OR REPLACE TABLE data (
    pos INT   NOT NULL,
    val FLOAT NOT NULL,
    CONSTRAINT data_pk PRIMARY KEY (pos)
);
And, a ranges type table like so:
-- Named position ranges; each label owns one pair of bounds stored in left/right.
CREATE OR REPLACE TABLE ranges (
    label VARCHAR(32) NOT NULL,
    left  INT         NOT NULL,
    right INT         NOT NULL,
    CONSTRAINT ranges_pk PRIMARY KEY (label)
);
with ranges like
-- Sample rows for the ranges table, listed in (label, left, right) column order.
('alpha', 11, 13),
('bravo', 11, 14),
('charlie', 11, 15),
('echo', 12, 18),
('delta', 12, 19),
('foxtrot', 13, 20)
For each label, I need to look up every possible run of 3 consecutive positions (a 3-wide subrange) inside that label's range in the "data" table, take the average of each such subrange, and then average those averages...
I couldn't think of a good way to describe what I'm after, so I thought I'd show what I'd expect for 'charlie':
The results for charlie in the select should be:
('charlie', 40.111), -- avg(avg(data[pos=11], data[pos=12], data[pos=13]), avg(data[pos=12], data[pos=13], data[pos=14]), avg(data[pos=13], data[pos=14], data[pos=15]))
-- -> avg(avg(31, 37, 41), avg(37, 41, 43), avg(41, 43, 47))
-- -> avg(36.333, 40.333, 43.667) -> 40.111
(for data like)
-- Seed rows for data: positions 1 through 54, one value each.
INSERT INTO data (pos, val)
VALUES
    (1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
    (9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
    (17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
    (25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
    (33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
    (41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
    (49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251);
Is there a way to do this within Snowflake SQL? Or must I resort to python to do this? If it helps I made a gist with more data..
Thanks!
Is there a way to do this within Snowflake SQL? Or must I resort to python to do this?
SQL language is expressive enough to handle such case.
Key point here is to use windowed average with windows size of 3 and then average moving averages:
-- Per label: average of every complete 3-row moving average of data.val
-- taken over the positions that fall inside the label's [left, right] range.
WITH moving_avg AS (
    SELECT
        rg.label,
        -- mean of the current row and the two rows before it, in pos order
        AVG(dt.val) OVER (
            PARTITION BY rg.label
            ORDER BY dt.pos
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS window_avg
    FROM ranges AS rg
    INNER JOIN data AS dt
        ON dt.pos BETWEEN rg.left AND rg.right
    -- the first two rows of each label do not have a full 3-row window yet
    QUALIFY ROW_NUMBER() OVER (PARTITION BY rg.label ORDER BY dt.pos) > 2
)
SELECT
    label,
    AVG(window_avg) AS output
FROM moving_avg
GROUP BY label
ORDER BY label;
Output:
Intermediate step to illustrate:
-- Intermediate view: one row per complete 3-row window, before the per-label averaging.
WITH window_rows AS (
    SELECT
        rg.label,
        rg.left,
        rg.right,
        dt.val,
        -- r = mean of the current row and the two rows before it, in pos order
        AVG(dt.val) OVER (
            PARTITION BY rg.label
            ORDER BY dt.pos
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS r
    FROM ranges AS rg
    INNER JOIN data AS dt
        ON dt.pos BETWEEN rg.left AND rg.right
    -- keep only rows that already have two predecessors inside the label
    QUALIFY ROW_NUMBER() OVER (PARTITION BY rg.label ORDER BY dt.pos) > 2
)
SELECT *
FROM window_rows
ORDER BY label, r;
Output:
Here is a step by step answer:
-- Self-contained walk-through: builds the sample data inline, enumerates every
-- 3-wide window inside each code's range, averages each window, then averages
-- the window averages per code.
WITH data (pos, val) AS (
    SELECT * FROM VALUES
        (1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
        (9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
        (17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
        (25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
        (33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
        (41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
        (49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251)
),
codes (name, s_val, e_val) AS (
    SELECT * FROM VALUES
        ('alpha', 11, 13),
        ('bravo', 11, 14),
        ('charlie', 11, 15),
        ('echo', 12, 18),
        ('delta', 12, 19),
        ('foxtrot', 13, 20)
),
-- offsets 0..199; the 200-row generator caps the number of windows per code at 200
offsets AS (
    SELECT ROW_NUMBER() OVER (ORDER BY NULL) - 1 AS seq
    FROM TABLE(GENERATOR(ROWCOUNT => 200))
),
-- one row per window: b1 is its first position, b3 its last
window_bounds AS (
    SELECT
        c.name,
        c.s_val + o.seq + 0 AS b1,
        c.s_val + o.seq + 2 AS b3
    FROM codes AS c
    INNER JOIN offsets AS o
        ON o.seq <= (e_val - s_val - 2)
),
-- average of the data values covered by each window
window_avgs AS (
    SELECT
        wb.name,
        AVG(d.val) AS partial
    FROM window_bounds AS wb
    INNER JOIN data AS d
        ON d.pos BETWEEN wb.b1 AND wb.b3
    GROUP BY wb.name, wb.b1
)
SELECT
    name,
    ROUND(AVG(partial), 3) AS output
FROM window_avgs
GROUP BY name
ORDER BY name;
which gives:
| NAME    | OUTPUT |
|---------|--------|
| alpha   | 36.333 |
| bravo   | 38.333 |
| charlie | 40.111 |
| delta   | 50.778 |
| echo    | 48.467 |
| foxtrot | 55.111 |
I am working on a large job: finding, for each member, the longest period of FULL consecutive months of enrollment within a year (a 12-month period). My approach works when a member has 2 periods, but I got stuck at the end while testing members with 3 or more periods. I hope the data and picture below provide all the information needed for an easy start. Thanks to all. The table below is the final work table I get at the end of my process. The code below produces partially correct results. My overall task is to find the MAX period for each member, so some fields are there just to make the work easier.
/*
Setup for the sample work table:

DROP TABLE IF EXISTS #t;
CREATE TABLE #t ( Cust VARCHAR(10), mm INT, mm_prev INT, rn INT)
INSERT #t values
(123456, 1, NULL, 1), (123456, 2, 1, 2),
(123456, 4, 2, 3), (123456, 5, 4, 4), (123456, 6, 5, 5),
(123456, 8, 6, 6), (123456, 9, 8, 7), (123456, 10, 9, 8), (123456, 11, 10, 9), (123456, 12, 11, 10),
(777 , 1, NULL, 1),(777 , 2, 1, 2)
SELECT * from #t
*/
-- Splits each customer's months into two buckets only: rows where mm equals rn
-- ('Grp A') and everything else ('Grp B'), then reports the span and row count.
SELECT
    Cust,
    MIN(mm)  AS mmStart,
    MAX(mm)  AS mmEnd,
    CASE WHEN mm = rn THEN 'Grp A' ELSE 'Grp B' END AS Grp,
    COUNT(*) AS mm_count
FROM #t
-- Filters that were tried and left disabled:
--   mm - ISNULL(mm_prev,0) = 1   -- check for consecutive months, but we drop mm=6 --> start of new period
--   mm = rn                      -- this brings only the first group by mm
GROUP BY
    Cust,
    CASE WHEN mm = rn THEN 'Grp A' ELSE 'Grp B' END
ORDER BY
    Cust,
    Grp
In case somebody prefers to work with the initial raw data, I am posting it here too; it contains some gaps and islands:
-- Raw enrollment periods per customer; rn numbers each customer's periods.
CREATE TABLE #tr (
    Cust      VARCHAR(10),
    ENR_START DATE,
    enr_END   DATE,
    rn        INT
); -- SELECT * FROM #t

INSERT INTO #tr (Cust, ENR_START, enr_END, rn)
VALUES
    ('123456', '2018-12-01', '2019-3-1',   1),
    ('123456', '2019-3-28',  '2019-6-30',  2), -- 6 month with 2 periods, island
    ('123456', '2019-7-26',  '2019-8-20',  3),
    ('123456', '2019-8-15',  '2019-12-31', 4),
    ('777',    '2018-11-4',  '2019-3-3',   1)

SELECT *
FROM #tr
Screenshot is here:
looks to me, you wanted this. Not really sure what is the purpose of the case statement in your query
-- Islands of consecutive months: mm - rn is constant while mm and rn advance
-- together, so it serves as the island key within a customer.
WITH numbered AS (
    SELECT
        *,
        mm - rn AS grp
    FROM #t
)
SELECT
    Cust,
    MIN(mm)  AS mmStart,
    MAX(mm)  AS mmEnd,
    grp,
    COUNT(*) AS mm_count
FROM numbered
GROUP BY
    Cust,
    grp
ORDER BY
    Cust,
    mmStart
I have a table called DimWorkerCode. It has a column called WorkerCode. This is our business key here. Changes that can happen to a WorkerCode is UnitCode and WindowsID as shown in figure below.
I want to ignore WindowsID, and just select WorkerCode, Unitcode and StartDate which will be minimum StartDate and EndDate which will be maximum EndDate.
I tried this query:
-- One row per (WorkerCode, UnitCode) with its earliest StartDate and latest
-- EndDate; a NULL EndDate is counted as 9999/12/31.
SELECT
    WorkerCode
    , UnitCode
    , MIN(StartDate) AS StartDate
    , MAX(ISNULL(EndDate, '9999/12/31')) AS EndDate
FROM dbo.DimWorkerCode
GROUP BY
    WorkerCode
    , UnitCode
and got this result set:
But I am expecting result something like this.
How can I do it in T-SQL? Help please.
The issue is that you are trying to pull some data without grouping on it (the enddate for min(startdate)).
I'm not certain this is the best solution, but it should work. Using Row_Number(), we're listing the records by min(startdate) and max(enddate) without grouping them; then you pull the records at the beginning of both listings.
-- For every (WorkerCode, UnitCode) pair, return the row holding the earliest
-- StartDate and the row holding the latest EndDate (possibly the same row).
select
    WorkerCode,
    UnitCode,
    StartDate,
    EndDate
from
(
    select
        WorkerCode,
        UnitCode,
        StartDate,
        EndDate,  -- this comma was missing, which made the statement unparsable
        row_number() over (partition by WorkerCode, UnitCode order by StartDate) as MinStartDateRow,
        -- SQL Server sorts NULL as the lowest value, so a plain "EndDate desc"
        -- would rank an open-ended (NULL) EndDate last; rank NULL first instead
        row_number() over (
            partition by WorkerCode, UnitCode
            order by case when EndDate is null then 0 else 1 end, EndDate desc
        ) as MaxEndDateRow
    from
        dbo.DimWorkerCode
) x
where
    MinStartDateRow = 1
    or MaxEndDateRow = 1
If I understand your question correctly and you want to get min and max dates, next approach may help. The important part here is to define groups (each new group begins when WorkerCode or Unitcode are changed).
Table:
-- Sample dimension table; a NULL EndDate marks the row that has no end yet.
CREATE TABLE DimWorkerCode (
    ID         INT,
    WorkerCode VARCHAR(4),
    UnitCode   VARCHAR(4),
    WindowID   INT,
    StartDate  DATE,
    EndDate    DATE
)

INSERT INTO DimWorkerCode (ID, WorkerCode, UnitCode, WindowID, StartDate, EndDate)
VALUES
    (1,  'AA01', 'AA00', 2,  '2007-01-01', '2008-01-01'),
    (2,  'AA01', 'AA00', 5,  '2008-01-01', '2008-01-01'),
    (3,  'AA01', 'AA00', 3,  '2009-01-01', '2010-01-01'),
    (4,  'AA01', 'XYZ0', 9,  '2010-01-01', '2011-01-01'),
    (5,  'AA01', 'XYZ0', 12, '2011-01-01', '2012-01-01'),
    (6,  'AA01', 'AA00', 13, '2012-01-01', '2013-01-01'),
    (7,  'AA01', 'AA00', 24, '2013-01-01', '2014-01-01'),
    (8,  'AA01', 'AA00', 17, '2014-01-01', '2015-01-01'),
    (9,  'AA01', 'AA00', 18, '2015-01-01', '2016-01-01'),
    (10, 'AA01', 'AA00', 22, '2016-01-01', NULL)
Statement:
-- Gaps-and-islands over ID order: a new group starts whenever WorkerCode or
-- UnitCode differs from the previous row; each group yields one date span.
;WITH PrevCTE AS (
    -- previous row's codes (NULL on the first row)
    SELECT
        *,
        LAG(UnitCode)   OVER (ORDER BY ID) AS PrevUnitCode,
        LAG(WorkerCode) OVER (ORDER BY ID) AS PrevWorkerCode
    FROM DimWorkerCode
), ChangeCTE AS (
    -- 1 marks the first row of a group, 0 a continuation
    SELECT
        *,
        CASE
            WHEN (UnitCode = PrevUnitCode) AND (WorkerCode = PrevWorkerCode) THEN 0
            ELSE 1
        END AS Change
    FROM PrevCTE
), GroupCTE AS (
    -- running total of the start markers gives every row its group number
    SELECT
        *,
        SUM(Change) OVER (ORDER BY ID) AS GroupID
    FROM ChangeCTE
)
SELECT
    MAX(WorkerCode) AS WorkerCode,
    MAX(UnitCode) AS UnitCode,
    MIN(StartDate) AS StartDate,
    MAX(ISNULL(EndDate, '9999/12/31')) AS EndDate
FROM GroupCTE
GROUP BY GroupID
Output:
WorkerCode UnitCode StartDate EndDate
AA01 AA00 01/01/2007 00:00:00 01/01/2010 00:00:00
AA01 XYZ0 01/01/2010 00:00:00 01/01/2012 00:00:00
AA01 AA00 01/01/2012 00:00:00 31/12/9999 00:00:00
I took the idea from Zhorov and modified it to meet my exact requirement.
-- Sample dimension table; a NULL EndDate marks the row that has no end yet.
CREATE TABLE DimWorkerCode (
    ID         INT,
    WorkerCode VARCHAR(4),
    UnitCode   VARCHAR(4),
    WindowID   INT,
    StartDate  DATE,
    EndDate    DATE
);

INSERT INTO DimWorkerCode (ID, WorkerCode, UnitCode, WindowID, StartDate, EndDate)
VALUES
    (1,  'AA01', 'AA00', 2,  '2007-01-01', '2008-01-01'),
    (2,  'AA01', 'AA00', 5,  '2008-01-01', '2008-01-01'),
    (3,  'AA01', 'AA00', 3,  '2009-01-01', '2010-01-01'),
    (4,  'AA01', 'XYZ0', 9,  '2010-01-01', '2011-01-01'),
    (5,  'AA01', 'XYZ0', 12, '2011-01-01', '2012-01-01'),
    (6,  'AA01', 'AA00', 13, '2012-01-01', '2013-01-01'),
    (7,  'AA01', 'AA00', 24, '2013-01-01', '2014-01-01'),
    (8,  'AA01', 'AA00', 17, '2014-01-01', '2015-01-01'),
    (9,  'AA01', 'AA00', 18, '2015-01-01', '2016-01-01'),
    (10, 'AA01', 'AA00', 22, '2016-01-01', NULL)
Here it goes
-- Same island grouping over ID order, but the open-ended EndDate is turned
-- back into NULL in the final result.
WITH CTE AS (
    -- pair each row with the previous row's codes; the first row is paired with itself
    SELECT
        ID,
        WorkerCode,
        LAG(WorkerCode, 1, WorkerCode) OVER (ORDER BY ID) AS PrevWorkerCode,
        UnitCode,
        LAG(UnitCode, 1, UnitCode) OVER (ORDER BY ID) AS PrevUnitCode,
        StartDate,
        ISNULL(EndDate, '9999/12/31') AS EndDate  -- sentinel so MAX() sees open rows
    FROM DimWorkerCode
),
ChangedCTE AS (
    -- 1 where either code differs from the previous row, else 0
    SELECT
        *,
        CASE
            WHEN WorkerCode = PrevWorkerCode AND UnitCode = PrevUnitCode THEN 0
            ELSE 1
        END AS Changed
    FROM CTE
),
GroupedCTE AS (
    -- running total of the change flags numbers the groups
    SELECT
        *,
        SUM(Changed) OVER (ORDER BY ID) AS GroupID
    FROM ChangedCTE
),
MinMaxCTE AS (
    SELECT
        MAX(WorkerCode) AS WorkerCode,
        MAX(UnitCode)   AS UnitCode,
        MIN(StartDate)  AS StartDate,
        MAX(EndDate)    AS EndDate
    FROM GroupedCTE
    GROUP BY GroupID
)
SELECT
    WorkerCode,
    UnitCode,
    StartDate,
    -- map the sentinel back to NULL
    CASE WHEN EndDate = '9999-12-31' THEN NULL ELSE EndDate END AS EndDate
FROM MinMaxCTE
Output:
I want to get same output:
using the following sample data
-- Sample readings: stat values per id and date; both ids carry the same rows.
CREATE TABLE x (
    id   INT,
    date DATETIME,
    stat INT
)

INSERT INTO x
VALUES
    (1, '2017-01-01', 100), (1, '2017-01-03', 100), (1, '2017-01-05', 100),
    (1, '2017-01-07', 150), (1, '2017-01-09', 150), (1, '2017-02-01', 150),
    (1, '2017-02-02', 100), (1, '2017-02-12', 100), (1, '2017-02-15', 100),
    (1, '2017-02-17', 150), (1, '2017-03-09', 150), (1, '2017-03-11', 150),
    (2, '2017-01-01', 100), (2, '2017-01-03', 100), (2, '2017-01-05', 100),
    (2, '2017-01-07', 150), (2, '2017-01-09', 150), (2, '2017-02-01', 150),
    (2, '2017-02-02', 100), (2, '2017-02-12', 100), (2, '2017-02-15', 100),
    (2, '2017-02-17', 150), (2, '2017-03-09', 150), (2, '2017-03-11', 150)
I tried to use something like this
-- Asker's attempt: numbers the rows two different ways, joins the two sets on
-- id, and returns the smallest b.date if the largest a.date exceeds the
-- largest b.date (HAVING without GROUP BY treats the whole join as one group).
WITH a AS (
    SELECT
        id,
        date,
        ROW_NUMBER() OVER (PARTITION BY date ORDER BY id) AS rowNum
    FROM x
),
b AS (
    SELECT
        id,
        date,
        ROW_NUMBER() OVER (PARTITION BY id, stat ORDER BY date) AS rowNum
    FROM x
)
SELECT MIN(b.date)
FROM a
INNER JOIN b
    ON b.id = a.id
HAVING MAX(a.date) > MAX(b.date)
What you are looking for is a gaps-and-islands scenario where you only have islands. In this scenario, the start of an island is defined by a change in the stat value within an id, when the dataset is evaluated in date order.
The lag window function is used below to compare values across rows, and see if you need to include it in the output.
-- Return the first row of every island: rows whose stat differs from the
-- previous row's stat for the same id, taken in date order.
WITH flagged AS (
    SELECT
        src.id,
        src.date,
        src.stat,
        -- the first row of an id has no predecessor (LAG gives NULL), so it is flagged 1
        CASE
            WHEN LAG(src.stat, 1, NULL) OVER (PARTITION BY src.id ORDER BY src.date ASC) = src.stat THEN 0
            ELSE 1
        END AS include_flag
    FROM x AS src
)
SELECT
    f.id,
    f.stat,
    f.date
FROM flagged AS f
WHERE f.include_flag = 1
On this SQL Server 2008 database I have a table of attendance, students come into school every day and check in, the table looks something like this:
SchoolID | StudentID | Date
There will be a record for every day for every student on this table. What I want to find out is, given a start date, an end date, and a number of days (gap), find any student that has not checked into school for that number of days. So for example, I need to know which students missed 3 days straight during the month of December, and spit out the list of StudentIDs.
How can I accomplish something like that?
You can generate every date from the start date to the end date,
then outer join those dates with your table; whenever a student has no row for a date, count it as 1, and then sum up these values.
To generate the date range you can use the function below:
-- dbo.DateRange: returns one row per date from @StartDate up to @EndDate,
-- stepping by day, week or month.
--   @Increment  'd' = day, 'w' = week, 'm' = month; for any other value both
--               CASE expressions yield NULL, so only @StartDate is returned
--   @StartDate  first date of the range (always returned)
--   @EndDate    upper bound; the recursion stops once the next step would pass it
-- Returns a table with a single DATETIME column, IndividualDate.
-- Fix: T-SQL parameters and table variables use the @ prefix; the # prefix in
-- the previous text is not valid for them.
CREATE FUNCTION [dbo].[DateRange]
(
    @Increment CHAR(1),
    @StartDate DATETIME,
    @EndDate DATETIME
)
RETURNS
    @SelectedRange TABLE
    (IndividualDate DATETIME)
AS
BEGIN
    ;WITH cteRange (DateRange) AS (
        -- anchor: the start date itself
        SELECT @StartDate
        UNION ALL
        -- recursive step: advance by one unit of the requested increment
        SELECT
            CASE
                WHEN @Increment = 'd' THEN DATEADD(dd, 1, DateRange)
                WHEN @Increment = 'w' THEN DATEADD(ww, 1, DateRange)
                WHEN @Increment = 'm' THEN DATEADD(mm, 1, DateRange)
            END
        FROM cteRange
        -- continue only while one more step still lands on or before @EndDate
        WHERE DateRange <=
            CASE
                WHEN @Increment = 'd' THEN DATEADD(dd, -1, @EndDate)
                WHEN @Increment = 'w' THEN DATEADD(ww, -1, @EndDate)
                WHEN @Increment = 'm' THEN DATEADD(mm, -1, @EndDate)
            END)
    INSERT INTO @SelectedRange (IndividualDate)
    SELECT DateRange
    FROM cteRange
    -- raise the recursion limit so ranges of up to 3660 steps are allowed
    OPTION (MAXRECURSION 3660);
    RETURN
END
GO
then
-- Total number of days in the window on which each student has no attendance row.
-- Fixes: the outer query now reads from the derived table x (alias s is not
-- visible outside it), the NULL test names the attendance table explicitly
-- (studentid exists on both joined tables), and the date literals use the
-- language-independent yyyymmdd form (1 Nov 2014 to 30 Nov 2014).
-- NOTE(review): this counts absent days in total, not consecutive ones, and
-- "> 3" means four or more; confirm that matches the requirement.
select
    sum(x.isAbsent) as absentDays,
    x.studentid
from
(
    select
        -- no matching attendance row for this student and date => absent
        case when a.studentid is null then 1 else 0 end as isAbsent,
        d.IndividualDate,
        s.studentid
    from dbo.DateRange('d', '20141101', '20141130') d
    cross join tblstudent s
    left outer join yourtable a
        on a.Date = d.IndividualDate
        and a.studentid = s.studentid
) x
group by x.studentid
having sum(x.isAbsent) > 3
Just look at this. I think you will be able to figure out your own answer from it. This solution takes care of weekend days and holidays:
SQL Fiddle
MS SQL Server 2008 Schema Setup:
-- Check-in log: one row per student per attended day.
CREATE TABLE attendance (
    [SchoolID]  INT,
    [StudentID] INT,
    [Date]      DATETIME
);

INSERT INTO attendance ([SchoolID], [StudentID], [Date])
VALUES
    -- student 1
    (1, 1, '2014-12-01 00:00:00'),
    (1, 1, '2014-12-02 00:00:00'),
    (1, 1, '2014-12-03 00:00:00'),
    (1, 1, '2014-12-04 00:00:00'),
    (1, 1, '2014-12-05 00:00:00'),
    (1, 1, '2014-12-08 00:00:00'),
    (1, 1, '2014-12-09 00:00:00'),
    (1, 1, '2014-12-10 00:00:00'),
    (1, 1, '2014-12-11 00:00:00'),
    (1, 1, '2014-12-12 00:00:00'),
    (1, 1, '2014-12-15 00:00:00'),
    (1, 1, '2014-12-16 00:00:00'),
    (1, 1, '2014-12-17 00:00:00'),
    (1, 1, '2014-12-18 00:00:00'),
    (1, 1, '2014-12-19 00:00:00'),
    -- student 2 (no rows for 3-5 December)
    (1, 2, '2014-12-01 00:00:00'),
    (1, 2, '2014-12-02 00:00:00'),
    (1, 2, '2014-12-08 00:00:00'),
    (1, 2, '2014-12-09 00:00:00'),
    (1, 2, '2014-12-10 00:00:00'),
    (1, 2, '2014-12-11 00:00:00'),
    (1, 2, '2014-12-12 00:00:00'),
    (1, 2, '2014-12-15 00:00:00'),
    (1, 2, '2014-12-16 00:00:00'),
    (1, 2, '2014-12-17 00:00:00'),
    (1, 2, '2014-12-18 00:00:00'),
    (1, 2, '2014-12-19 00:00:00');
-- Dates on which no attendance is expected.
CREATE TABLE holidays (
    [Date] DATETIME
);

INSERT INTO holidays ([Date])
VALUES
    ('2014-12-22 00:00:00'),
    ('2014-12-23 00:00:00'),
    ('2014-12-24 00:00:00'),
    ('2014-12-25 00:00:00'),
    ('2014-12-26 00:00:00'),
    ('2014-12-29 00:00:00'),
    ('2014-12-30 00:00:00'),
    ('2014-12-31 00:00:00');
-- Student lookup table for the attendance sample.
CREATE TABLE students (
    [StudentID] INT,
    [Name]      VARCHAR(5)
);

INSERT INTO students ([StudentID], [Name])
VALUES
    (1, 'John'),
    (2, 'Peter');
Query 1:
-- Names of students whose attended days in [@start, @end] fall short of the
-- number of expected school days by 3 or more.
-- Fix: T-SQL local variables use the @ prefix; #start / #end are not valid.
DECLARE @start DATE, @end DATE
SELECT @start = '20141201', @end = '20141231'
-- tdate: the numbers 1..N, one per calendar day in the window
;WITH tdate AS
(
    SELECT TOP (DATEDIFF(DAY, @start, @end) + 1)
        n = ROW_NUMBER() OVER (ORDER BY [object_id])
    FROM sys.all_objects
)
SELECT DISTINCT Name
FROM students s
INNER JOIN attendance a ON s.StudentID = a.StudentID
INNER JOIN tdate ON DATEADD(DAY, n-1, @start) = a.Date
GROUP BY NAME
HAVING
    -- expected school days: calendar days that are neither holidays nor
    -- weekday numbers 1 and 7
    -- NOTE(review): 1 and 7 are Sunday/Saturday only under the default
    -- DATEFIRST setting; confirm the session setting
    (SELECT count(*)
     FROM tdate
     LEFT OUTER JOIN holidays h ON DATEADD(DAY, n-1, @start) = h.Date
     WHERE h.date is null
       AND DATEPART(dw,DATEADD(DAY, n-1, @start)) not in (1,7))
    -- minus the days this student actually attended
    - COUNT(*) >= 3
Results:
| NAME |
|-------|
| Peter |
UPDATE
-- Lists every expected school day on which a student has no attendance row.
-- Fixes: variables use the @ prefix (# is not valid for variables), and the
-- statement now carries its own variable declarations and tdate CTE so that it
-- runs as a standalone batch (a CTE is only visible to the statement it prefixes).
DECLARE @start DATE, @end DATE
SELECT @start = '20141201', @end = '20141231'
-- tdate: the numbers 1..N, one per calendar day in the window
;WITH tdate AS
(
    SELECT TOP (DATEDIFF(DAY, @start, @end) + 1)
        n = ROW_NUMBER() OVER (ORDER BY [object_id])
    FROM sys.all_objects
)
SELECT s.StudentID, d.Date
FROM students s
-- d: calendar days that are neither holidays nor weekday numbers 1 and 7
-- NOTE(review): 1 and 7 are Sunday/Saturday only under the default DATEFIRST setting
INNER JOIN (
    SELECT DATEADD(DAY, n-1, @start) as Date
    FROM tdate
    LEFT OUTER JOIN holidays h ON DATEADD(DAY, n-1, @start) = h.Date
    WHERE h.date is null
      AND DATEPART(dw,DATEADD(DAY, n-1, @start)) not in (1,7)) d ON 1 = 1
-- keep only the student/day pairs without a matching attendance row
LEFT OUTER JOIN attendance a ON s.StudentID = a.StudentID AND d.Date = a.Date
WHERE a.StudentID IS NULL
ORDER BY s.StudentID, d.Date
Results:
| STUDENTID | DATE |
|-----------|------------|
| 2 | 2014-12-03 |
| 2 | 2014-12-04 |
| 2 | 2014-12-05 |