How to retrieve only the records where stat changes? - sql-server

I want to get only the rows where stat changes compared with the previous row (for each id, in date order),
using the following sample data:
-- Sample data: ids 1 and 2 carry the same dated series of stat readings,
-- alternating between runs of 100 and runs of 150 (three rows per run).
CREATE TABLE x
(
    id   INT,
    date DATETIME,
    stat INT
);

INSERT INTO x (id, date, stat)
VALUES
    (1, '2017-01-01', 100), (1, '2017-01-03', 100), (1, '2017-01-05', 100),
    (1, '2017-01-07', 150), (1, '2017-01-09', 150), (1, '2017-02-01', 150),
    (1, '2017-02-02', 100), (1, '2017-02-12', 100), (1, '2017-02-15', 100),
    (1, '2017-02-17', 150), (1, '2017-03-09', 150), (1, '2017-03-11', 150),
    (2, '2017-01-01', 100), (2, '2017-01-03', 100), (2, '2017-01-05', 100),
    (2, '2017-01-07', 150), (2, '2017-01-09', 150), (2, '2017-02-01', 150),
    (2, '2017-02-02', 100), (2, '2017-02-12', 100), (2, '2017-02-15', 100),
    (2, '2017-02-17', 150), (2, '2017-03-09', 150), (2, '2017-03-11', 150);
I tried to use something like this
-- Asker's attempt (does not produce the wanted rows).
-- a: numbers the rows of x within each date, ordered by id. rowNum is never used afterwards.
with a as
(
select
id, date,
ROW_NUMBER() over (partition by date order by id) as rowNum
from
x
), b as
-- b: numbers the rows of x within each (id, stat) pair, ordered by date. All rows with the
-- same id and stat share one partition even when another stat value sits between them in
-- date order, and stat itself is not selected.
(
select
id, date,
ROW_NUMBER() over (partition by id, stat order by date) as rowNum
from
x
)
-- The join is on id only, so every row of a is paired with every row of b for that id.
-- HAVING without GROUP BY treats the whole join result as a single group; because a and b
-- both read the same rows of x, max(a.date) and max(b.date) are the same value, so the
-- strict comparison is never true and no row is returned.
select min(b.date)
from a
join b on b.id = a.id
having max(a.date) > max(b.date)

What you are looking for is a gaps-and-islands scenario, where you only have islands. In this scenario, what defines the start of an island is a change in the stat value within an id, while evaluating the dataset in date order.
The lag window function is used below to compare values across rows, and see if you need to include it in the output.
-- Return the first row of every run of equal stat values, per id, in date order.
WITH flagged AS (
    SELECT
        src.id,
        src.stat,
        src.date,
        -- LAG yields NULL on the first row of each id, so that row can never equal
        -- its "previous" value and is always flagged as a change.
        CASE
            WHEN LAG(src.stat, 1, NULL) OVER (PARTITION BY src.id ORDER BY src.date ASC) = src.stat THEN 0
            ELSE 1
        END AS is_change
    FROM x AS src
)
SELECT
    flagged.id,
    flagged.stat,
    flagged.date
FROM flagged
WHERE flagged.is_change = 1

Related

Why does the TOP function not function as I expect in SQL?

I thought I knew how the TOP function works, but with this code below I'm not sure.
Please can someone tell me why it returns fruit:b ordinal:9, instead of the expected fruit:b ordinal:8?
-- Repro from the question: keep only the rows where fruit differs from the "previous" row,
-- then take the one with the highest ordinal.
;WITH CTE
AS (
-- rn is assigned with a constant ORDER BY expression, so the numbering is not tied to
-- ordinal (or to any column) and may differ between executions or plans.
SELECT fruit, ordinal, row_number() OVER (
ORDER BY (
SELECT 1
)
) AS rn
FROM (
VALUES (1, 'a'), (2, 'b'), (3, 'b'), (4, 'c'), (5, 'c'), (6, 'a'), (7, 'a'), (8, 'b'), (9, 'b')
) fruits(ordinal, fruit)
), CTE2
AS (
-- Keep the row numbered 1, plus every row whose fruit differs from the fruit of the row
-- numbered rn - 1. Which rows qualify therefore depends on how rn happened to be assigned.
SELECT fruit, ordinal
FROM cte AS cteouter
WHERE rn = 1
OR fruit != (
SELECT fruit
FROM cte AS cteinner
WHERE cteinner.rn = cteouter.rn - 1
)
)
--SELECT * FROM CTE2
SELECT TOP 1 *
FROM cte2
ORDER BY ordinal DESC
row_number() OVER (ORDER BY (SELECT 1)) (as well as over (order by 1) or over (order by 1/0)) does not guarantee stable, reproducible numbering of the incoming rows. Quite the opposite: it effectively switches off the order by clause and leaves the numbering arbitrary.
When you run the top query with TOP 1, you get one execution plan, and when without TOP, you get another. These plans happen to randomly result in a different ordering of the rows in CTE, which in turn changes which rows are returned from CTE2.

Window based averages based on ranges within another table

I have a table that has positions like so:
-- Values indexed by position; pos is declared as the primary key.
create or replace table data (
pos int not null,
val float not null,
constraint data_pk primary key (pos)
);
And, a ranges type table like so:
-- Named ranges of positions in data, given as a left and a right bound;
-- label is declared as the primary key.
create or replace table ranges (
label varchar(32) not null,
left int not null,
right int not null,
constraint ranges_pk primary key (label)
);
with ranges like
('alpha', 11, 13),
('bravo', 11, 14),
('charlie', 11, 15),
('echo', 12, 18),
('delta', 12, 19),
('foxtrot', 13, 20)
For each label, I need to look up every possible subrange of 3 consecutive positions within the "data" table, take the average of each of these subranges, and then average those averages...
I couldn't think of a good way to describe what I'm after, so I thought I'd show what I'd expect for 'charlie':
The results for charlie in the select should be:
('charlie', 40.111), -- avg(avg(data[pos=11], data[pos=12], data[pos=13]), avg(data[pos=12], data[pos=13], data[pos=14]), avg(data[pos=13], data[pos=14], data[pos=15]))
-- -> avg(avg(31, 37, 41), avg(37, 41, 43), avg(41, 43, 47))
-- -> avg(36.333, 40.333, 43.667) -> 40.111
(for data like)
-- Sample rows for pos 1 through 54 (the values are consecutive primes).
insert into data (pos, val) values
(1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
(9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
(17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
(25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
(33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
(41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
(49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251);
Is there a way to do this within Snowflake SQL? Or must I resort to python to do this? If it helps I made a gist with more data..
Thanks!
Is there a way to do this within Snowflake SQL? Or must I resort to python to do this?
SQL language is expressive enough to handle such case.
Key point here is to use windowed average with windows size of 3 and then average moving averages:
-- Per label: average of every 3-row moving average of val inside the label's range.
WITH window_avgs AS (
    SELECT
        rg.label,
        -- Mean of the current row and the two rows before it (by pos) within the label.
        AVG(dt.val) OVER (
            PARTITION BY rg.label
            ORDER BY dt.pos
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ) AS window_avg
    FROM ranges AS rg
    INNER JOIN data AS dt
        ON dt.pos BETWEEN rg.left AND rg.right
    -- Drop the first two rows of each label: their windows hold fewer than 3 values.
    QUALIFY ROW_NUMBER() OVER (PARTITION BY rg.label ORDER BY dt.pos) > 2
)
SELECT
    label,
    AVG(window_avg) AS output
FROM window_avgs
GROUP BY label
ORDER BY label;
Output:
Intermediate step to illustrate:
-- Returns the CTE rows directly so that each 3-row moving average (column r)
-- can be inspected per label before it is averaged.
WITH cte AS (
SELECT r.label, r.left, r.right, d.val,
AVG(d.val) OVER(PARTITION BY r.label ORDER BY d.pos
ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS r
FROM ranges r
JOIN data d
ON d.pos BETWEEN r.left AND r.right
-- Keep only the third and later rows of each label; earlier rows have windows of fewer than 3 values.
QUALIFY ROW_NUMBER() OVER(PARTITION BY r.label ORDER BY d.pos) > 2
)
SELECT *
FROM cte
ORDER BY label, r;
Output:
Here is a step by step answer:
-- Self-contained walk-through: inline the sample data and the named ranges, expand each
-- range into its 3-wide windows, average every window, then average the windows per name.
WITH data (pos, val) AS (
    SELECT * FROM VALUES
        (1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
        (9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
        (17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
        (25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
        (33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
        (41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
        (49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251)
), codes (name, s_val, e_val) AS (
    SELECT * FROM VALUES
        ('alpha', 11, 13),
        ('bravo', 11, 14),
        ('charlie', 11, 15),
        ('echo', 12, 18),
        ('delta', 12, 19),
        ('foxtrot', 13, 20)
), offsets AS (
    -- Zero-based sequence 0..199, used as window start offsets.
    SELECT ROW_NUMBER() OVER (ORDER BY NULL) - 1 AS seq
    FROM TABLE(GENERATOR(ROWCOUNT => 200))
), windows AS (
    -- One row per 3-wide window that fits entirely inside [s_val, e_val].
    SELECT
        c.name,
        c.s_val + o.seq + 0 AS win_start,
        c.s_val + o.seq + 2 AS win_end
    FROM codes AS c
    INNER JOIN offsets AS o
        ON o.seq <= (c.e_val - c.s_val - 2)
), window_avgs AS (
    -- Average of val over each window.
    SELECT
        w.name,
        AVG(d.val) AS partial
    FROM windows AS w
    INNER JOIN data AS d
        ON d.pos BETWEEN w.win_start AND w.win_end
    GROUP BY w.name, w.win_start
)
SELECT
    name,
    ROUND(AVG(partial), 3) AS output
FROM window_avgs
GROUP BY name
ORDER BY name;
which gives:
NAME
OUTPUT
alpha
36.333
bravo
38.333
charlie
40.111
delta
50.778
echo
48.467
foxtrot
55.111

UPDATE using Rank(), Row_Number excluding duplicate values

I've a dataset similar to the one below.
I need to update the base lookup table based on the values provided in the updated_CustomerId column. The base table is the same as the dataset, but it does not have the updated_CustomerId column.
The challenge here that the base table has a unique constraint based on combination of three columns below:
Current_CustomerID
Order_ID
OrderCategory
DESIRED OUTPUT:
After the update either one of Old_customerIds (17360410 - Pk 8, 21044488 - Pk = 9) can be reassigned to the Update_CustomerID
PrimaryKey 2 will not be updated, as that would lead to a unique constraint violation, but it will then be deleted along with one of the PrimaryKeys from above (either 8 or 9), depending on which one was updated (re-assigned to the new id)
After everything is updated on the base table I then delete from the base table all records where Current_CustomerID was not re-assigned to the updated_CustomerId (if different)
-- Repro setup: drop the temp tables if they are left over from an earlier run.
IF OBJECT_ID('tempdb..#DataSet') IS NOT NULL
DROP TABLE #DataSet
IF OBJECT_ID('tempdb..#BaseTable') IS NOT NULL
DROP TABLE #BaseTable
-- #DataSet: the base rows plus the customer id each row should be re-assigned to.
CREATE TABLE #DataSet
(
PrimaryKey INT NOT NULL CONSTRAINT [PK_dataset_ID] PRIMARY KEY,
Current_CustomerID INT NOT NULL,
Order_ID INT NOT NULL,
OrderCategory VARCHAR(50) NOT NULL,
Updated_CustomerId INT NOT NULL
)
INSERT INTO #DataSet (PrimaryKey, Current_CustomerID, Order_ID, OrderCategory, updated_CustomerId)
VALUES
(1, 17395001, 4451784, 'Kitchen', 25693110),
(2, 25693110, 4451784, 'Kitchen', 25693110),
(3, 25693110, 2083059, 'Kitchen', 25693110),
(4, 25693110, 2163679, 'Kitchen', 25693110),
(5, 25693110, 2171466, 'Kitchen', 25693110),
(6, 25693110, 2163679, 'Bathroom', 25693110),
(7, 25693110, 2171466, 'Bathroom', 25693110),
(8, 17360410, 3377931, 'Furniture', 16303984),
(9, 21044488, 3377931, 'Furniture', 16303984),
(10, 1534323, 2641714, 'Furniture', 16303984),
(11, 16303984, 2641726, 'Furniture', 16303984),
(12, 16303984, 2641793, 'Furniture', 16303984),
(13, 16303984, 2641816, 'Furniture', 16303984),
(14, 16303345, 2641816, 'Garden', 16301239),
(15, 12345678, 1239065, 'Medicine', 1075432)
-- #BaseTable: the lookup table to be updated; same rows as #DataSet without Updated_CustomerId.
CREATE TABLE #BaseTable
(
PrimaryKey INT NOT NULL CONSTRAINT [PK_baseTable_ID] PRIMARY KEY,
CustomerID INT NOT NULL,
Order_ID INT NOT NULL,
OrderCategory VARCHAR(50) NOT NULL,
)
-- Unique index on (CustomerID, Order_ID, OrderCategory): re-assigning a row to a customer id
-- that already has a row for the same order and category is rejected.
CREATE UNIQUE NONCLUSTERED INDEX [IDX_LookUp] ON #BaseTable
(
CustomerID ASC,
Order_ID ASC,
OrderCategory ASC
) ON [PRIMARY]
INSERT INTO #BaseTable (PrimaryKey, CustomerID, Order_ID, OrderCategory)
VALUES
(1, 17395001, 4451784, 'Kitchen'),
(2, 25693110, 4451784, 'Kitchen'),
(3, 25693110, 2083059, 'Kitchen'),
(4, 25693110, 2163679, 'Kitchen'),
(5, 25693110, 2171466, 'Kitchen'),
(6, 25693110, 2163679, 'Bathroom'),
(7, 25693110, 2171466, 'Bathroom'),
(8, 17360410, 3377931, 'Furniture'),
(9, 21044488, 3377931, 'Furniture'),
(10, 1534323, 2641714, 'Furniture'),
(11, 16303984, 2641726, 'Furniture'),
(12, 16303984, 2641793, 'Furniture'),
(13, 16303984, 2641816, 'Furniture'),
(14, 16303345, 2641816, 'Garden'),
(15, 12345678, 1239065, 'Medicine')
-- select * from #BaseTable
-- select * from #DataSet
-- Asker's update attempt. The CTE is defined once here: two consecutive
-- "WITH CTE AS (...)" clauses cannot be parsed as one statement.
-- flag ranks the rows that would share the same
-- (Updated_CustomerId, Order_ID, OrderCategory) key after the update.
;WITH CTE AS (
    SELECT a.*
         , RANK() OVER (PARTITION BY a.Updated_CustomerId, a.Order_ID, a.OrderCategory
                        ORDER BY a.Current_CustomerID) AS flag
    FROM #DataSet a
)
-- Rows ranked 2 are skipped, but they stay in #BaseTable with their current key. Where the
-- rank-1 row is re-assigned to the key the rank-2 row already holds (PrimaryKey 1 vs 2 in the
-- sample data), the unique index IDX_LookUp rejects the update with the error quoted below.
-- Those colliding rows have to be deleted before this update can succeed.
UPDATE b
SET CustomerID = a.Updated_CustomerId
FROM #BaseTable b
INNER JOIN CTE a ON b.PrimaryKey = a.PrimaryKey
WHERE a.flag <> 2
Msg 2601, Level 14, State 1, Line 82
Cannot insert duplicate key row in object 'dbo.#BaseTable' with unique index 'IDX_LookUp'. The duplicate key value is (25693110, 4451784, Kitchen).
The statement has been terminated.
I think you just want to get a row_number for the #DataSet rows, and then delete the rows where there is more than one per unique key:
//...
-- Delete every #BaseTable row except the first one (lowest Current_CustomerID) within each
-- (Updated_CustomerId, Order_ID, OrderCategory) group, i.e. the rows that would collide on
-- the unique key once the customer ids are re-assigned.
;WITH ranked AS (
    SELECT
        ds.PrimaryKey,
        ROW_NUMBER() OVER (
            PARTITION BY ds.Updated_CustomerId, ds.Order_ID, ds.OrderCategory
            ORDER BY ds.Current_CustomerID
        ) AS rn
    FROM #BaseTable AS base
    INNER JOIN #DataSet AS ds
        ON base.PrimaryKey = ds.PrimaryKey
)
DELETE bt
FROM #BaseTable AS bt
INNER JOIN ranked
    ON bt.PrimaryKey = ranked.PrimaryKey
WHERE ranked.rn > 1

Most efficient way of finding duplicates SQL Server

The fiddle:
-- Fiddle setup: people, their membership links, and the memberships themselves.
CREATE TABLE person
([first_name] varchar(10), [surname] varchar(10), [date_of_birth] date, [person_id] int);
INSERT INTO person
([first_name], [surname], [date_of_birth] ,[person_id])
VALUES
('Alice', 'AA', '1/1/1990', 1),
('Bob' , 'BB', '1/1/1990', 3),
('Carol', 'CC', '1/1/1990', 4),
('Kate' , 'KK', '1/1/1990', 7);
-- Link rows: person 1 has two 'A' links; person 7 has one 'A' link and one 'T' link.
CREATE TABLE person_membership
([person_id] int, [status_flag] varchar(1), [membership_id] int);
INSERT INTO person_membership
([person_id], [status_flag], [membership_id])
VALUES
(1, 'A', 10),
(1, 'A', 20),
(3, 'A', 30),
(4, 'A', 40),
(7, 'A', 60),
(7, 'T', 70);
-- Memberships with their own status; membership 50 is not linked to any person above.
CREATE TABLE memship
([membership_id] int, [memship_status] varchar(1));
INSERT INTO memship
([membership_id], [memship_status])
VALUES
(10, 'A'),
(20, 'A'),
(30, 'A'),
(40, 'A'),
(50, 'T'),
(60, 'A'),
(70, 'A');
The query:
-- t: one row per person and membership where both the person_membership link and the
-- membership itself have status 'A'.
WITH t AS
(SELECT first_name, surname, date_of_birth, p.person_id, m.membership_id
FROM person p
INNER JOIN person_membership pm ON p.person_id=pm.person_id
INNER JOIN memship m ON pm.membership_id = m.membership_id
WHERE pm.status_flag='A' and m.memship_status='A')
-- Self-join t on person_id and group by person plus t1.membership_id: each group then holds
-- one joined row per qualifying membership of that person, so count(*) > 1 keeps only the
-- people who have more than one.
SELECT t.first_name, t.surname, t.date_of_birth, t.person_id, t1.membership_id
FROM t
INNER JOIN t t1 ON t.person_id=t1.person_id
GROUP BY t.first_name, t.surname, t.date_of_birth, t.person_id, t1.membership_id
HAVING count(*) > 1
The problem:
Find and display only those records marked as active and with multiple membership IDs assigned to one person id.
The expected outcome:
The question:
My query works fine and gives me the expected outcome but the execution plan looks rather convoluted. What are the better, more elegant, expert-recommended ways of doing it?
Seems like you don't need that big GROUP BY at all, you could use a windowed function inside the CTE instead:
-- People with more than one active membership; one output row per membership.
SELECT
    active.first_name,
    active.surname,
    active.date_of_birth,
    active.person_id,
    active.membership_id
FROM (
    SELECT
        p.first_name,
        p.surname,
        p.date_of_birth,
        p.person_id,
        m.membership_id,
        -- Number of qualifying rows for this person, repeated on each of those rows.
        COUNT(*) OVER (PARTITION BY p.person_id) AS active_membership_count
    FROM person AS p
    INNER JOIN person_membership AS pm
        ON p.person_id = pm.person_id
    INNER JOIN memship AS m
        ON pm.membership_id = m.membership_id
    WHERE pm.status_flag = 'A'
      AND m.memship_status = 'A'
) AS active
WHERE active.active_membership_count > 1;

Finding a date gap (missing date range) within a SQL table

On this SQL Server 2008 database I have a table of attendance, students come into school every day and check in, the table looks something like this:
SchoolID | StudentID | Date
There will be a record for every day for every student on this table. What I want to find out is, given a start date, an end date, and a number of days (gap), find any student that has not checked into school for that number of days. So for example, I need to know which students missed 3 days straight during the month of December, and spit out the list of StudentIDs.
How can I accomplish something like that?
You can generate every date from the start date to the end date,
then outer join those dates with your table; where a student has no row for a date, count it as 1 (absent), and then sum those values per student.
To generate the date range you can use the function below:
-- Returns one row per date from @StartDate up to @EndDate, stepping by one
-- day ('d'), week ('w') or month ('m').
--   @Increment : 'd', 'w' or 'm'; any other value stops the recursion after the first
--                row, so only @StartDate is returned.
--   @StartDate : first date returned (it is always returned, even when later than @EndDate).
--   @EndDate   : no returned date is later than this.
-- T-SQL parameters and table variables must start with '@'; names starting with '#' denote
-- temporary objects and are not valid here.
CREATE FUNCTION [dbo].[DateRange]
(
    @Increment CHAR(1),
    @StartDate DATETIME,
    @EndDate DATETIME
)
RETURNS
@SelectedRange TABLE
(IndividualDate DATETIME)
AS
BEGIN
    ;WITH cteRange (DateRange) AS (
        SELECT @StartDate
        UNION ALL
        SELECT
            CASE
                WHEN @Increment = 'd' THEN DATEADD(dd, 1, DateRange)
                WHEN @Increment = 'w' THEN DATEADD(ww, 1, DateRange)
                WHEN @Increment = 'm' THEN DATEADD(mm, 1, DateRange)
            END
        FROM cteRange
        -- Recurse only while one more step still lands on or before @EndDate.
        WHERE DateRange <=
            CASE
                WHEN @Increment = 'd' THEN DATEADD(dd, -1, @EndDate)
                WHEN @Increment = 'w' THEN DATEADD(ww, -1, @EndDate)
                WHEN @Increment = 'm' THEN DATEADD(mm, -1, @EndDate)
            END)
    INSERT INTO @SelectedRange (IndividualDate)
    SELECT DateRange
    FROM cteRange
    -- Raises the default recursion limit of 100; a range needing more than 3660 steps
    -- still fails with a recursion error.
    OPTION (MAXRECURSION 3660);
    RETURN
END
GO
then
-- Count, per student, the dates in the range that have no attendance row, and keep the
-- students with more than 3 such dates.
-- NOTE(review): this counts absences anywhere in the range, not consecutive ones, and the
-- threshold is strict (> 3); adjust both if "N days in a row" is what is required.
SELECT
    SUM(x.isAbsent) AS absentDays,
    x.studentid
FROM (
    SELECT
        -- No matching attendance row for this student on this date => absent.
        CASE WHEN a.studentid IS NULL THEN 1 ELSE 0 END AS isAbsent,
        d.IndividualDate,
        s.studentid
    -- 'YYYYMMDD' literals are read the same way under every language/DATEFORMAT setting.
    FROM dbo.DateRange('d', '20141101', '20141130') AS d
    CROSS JOIN tblstudent AS s
    LEFT JOIN yourtable AS a
        ON a.Date = d.IndividualDate
        AND a.studentid = s.studentid
) AS x
-- Only the derived table's alias (x) is visible here; the inner alias s is out of scope.
GROUP BY x.studentid
HAVING SUM(x.isAbsent) > 3
Just look at this. I think you will be able to figure out your own answer from it. This solution takes care of weekend days and holidays:
SQL Fiddle
MS SQL Server 2008 Schema Setup:
-- Fiddle setup. attendance: one row per student per day checked in. In the sample rows,
-- student 1 has every weekday from 2014-12-01 to 2014-12-19; student 2 has the same days
-- except 2014-12-03, 2014-12-04 and 2014-12-05.
CREATE TABLE attendance
([SchoolID] int, [StudentID] int, [Date] datetime)
;
INSERT INTO attendance
([SchoolID], [StudentID], [Date])
VALUES
(1, 1, '2014-12-01 00:00:00'),
(1, 1, '2014-12-02 00:00:00'),
(1, 1, '2014-12-03 00:00:00'),
(1, 1, '2014-12-04 00:00:00'),
(1, 1, '2014-12-05 00:00:00'),
(1, 1, '2014-12-08 00:00:00'),
(1, 1, '2014-12-09 00:00:00'),
(1, 1, '2014-12-10 00:00:00'),
(1, 1, '2014-12-11 00:00:00'),
(1, 1, '2014-12-12 00:00:00'),
(1, 1, '2014-12-15 00:00:00'),
(1, 1, '2014-12-16 00:00:00'),
(1, 1, '2014-12-17 00:00:00'),
(1, 1, '2014-12-18 00:00:00'),
(1, 1, '2014-12-19 00:00:00'),
(1, 2, '2014-12-01 00:00:00'),
(1, 2, '2014-12-02 00:00:00'),
(1, 2, '2014-12-08 00:00:00'),
(1, 2, '2014-12-09 00:00:00'),
(1, 2, '2014-12-10 00:00:00'),
(1, 2, '2014-12-11 00:00:00'),
(1, 2, '2014-12-12 00:00:00'),
(1, 2, '2014-12-15 00:00:00'),
(1, 2, '2014-12-16 00:00:00'),
(1, 2, '2014-12-17 00:00:00'),
(1, 2, '2014-12-18 00:00:00'),
(1, 2, '2014-12-19 00:00:00')
;
-- holidays: dates on which no attendance is expected.
CREATE TABLE holidays
([Date] datetime)
;
INSERT INTO holidays
([Date])
VALUES
('2014-12-22 00:00:00'),
('2014-12-23 00:00:00'),
('2014-12-24 00:00:00'),
('2014-12-25 00:00:00'),
('2014-12-26 00:00:00'),
('2014-12-29 00:00:00'),
('2014-12-30 00:00:00'),
('2014-12-31 00:00:00')
;
-- students: the students referenced by attendance.
CREATE TABLE students
([StudentID] int, [Name] varchar(5))
;
INSERT INTO students
([StudentID], [Name])
VALUES
(1, 'John'),
(2, 'Peter')
;
Query 1:
-- Names of students whose number of attended days in [@start, @end] falls at least 3 short
-- of the number of school days (weekdays that are not holidays) in that range.
-- Local variables must be written with '@'; '#' prefixes denote temporary objects.
DECLARE @start DATE, @end DATE
SELECT @start = '20141201', @end = '20141231'
;WITH tdate AS
(
    -- One row per calendar day in the range, numbered n = 1, 2, ...;
    -- sys.all_objects only serves as a row source.
    SELECT TOP (DATEDIFF(DAY, @start, @end) + 1)
        n = ROW_NUMBER() OVER (ORDER BY [object_id])
    FROM sys.all_objects
)
-- GROUP BY already yields one row per name, so no DISTINCT is needed.
SELECT s.Name
FROM students s
INNER JOIN attendance a ON s.StudentID = a.StudentID
INNER JOIN tdate ON DATEADD(DAY, n-1, @start) = a.Date
GROUP BY s.Name
HAVING
    -- School days in the range: not a holiday and not a weekend day.
    -- NOTE(review): weekday numbers 1 and 7 mean Sunday and Saturday only under the
    -- default DATEFIRST setting (7) — confirm for the target server.
    (SELECT COUNT(*)
     FROM tdate
     LEFT OUTER JOIN holidays h ON DATEADD(DAY, n-1, @start) = h.Date
     WHERE h.Date IS NULL
     AND DATEPART(dw, DATEADD(DAY, n-1, @start)) NOT IN (1,7))
    - COUNT(*) >= 3
Results:
| NAME |
|-------|
| Peter |
UPDATE
-- Lists, per student, every school day (a weekday that is not a holiday) in [@start, @end]
-- with no attendance row. Written to run as its own batch: it declares the variables and
-- the tdate day list it needs. Local variables must be written with '@', not '#'.
DECLARE @start DATE, @end DATE
SELECT @start = '20141201', @end = '20141231'
;WITH tdate AS
(
    -- One row per calendar day in the range, numbered n = 1, 2, ...
    SELECT TOP (DATEDIFF(DAY, @start, @end) + 1)
        n = ROW_NUMBER() OVER (ORDER BY [object_id])
    FROM sys.all_objects
)
SELECT s.StudentID, d.Date
FROM students s
-- ON 1 = 1 pairs every student with every school day.
INNER JOIN (
    SELECT DATEADD(DAY, n-1, @start) AS Date
    FROM tdate
    LEFT OUTER JOIN holidays h ON DATEADD(DAY, n-1, @start) = h.Date
    WHERE h.Date IS NULL
    -- NOTE(review): 1 and 7 mean Sunday and Saturday only under the default DATEFIRST (7).
    AND DATEPART(dw, DATEADD(DAY, n-1, @start)) NOT IN (1,7)) d ON 1 = 1
LEFT OUTER JOIN attendance a ON s.StudentID = a.StudentID AND d.Date = a.Date
-- Keep only the (student, day) pairs without a matching attendance row.
WHERE a.StudentID IS NULL
ORDER BY s.StudentID, d.Date
Results:
| STUDENTID | DATE |
|-----------|------------|
| 2 | 2014-12-03 |
| 2 | 2014-12-04 |
| 2 | 2014-12-05 |

Resources