Snowflake Windows Analytical Function to set grouping set - snowflake-cloud-data-platform

I have the following data set in a data lake that acts as the source for a dimension, and I want to migrate the historical data into that dimension.
For e.g.: image
Primarykey Checksum DateFrom Dateto ActiveFlag
1 11 01:00 03:00 False
1 22 03:00 05:00 False
1 22 05:00 07:00 False
1 11 07:00 09:00 False
1 11 09:00 12/31/999 TRUE
Please note that the data lake table has multiple columns which are not part of the dimension, so when we recalculate the checksum, consecutive rows can show the same value and differ only in datefrom and dateto.
-- Asker's attempt, kept logically as posted: the windows are partitioned by
-- (Primary_key, checksum) only, so two separate runs that share a checksum
-- fall into the same partition and collapse into a single output row.
-- Only the function name is corrected (ROW_NUMBER, not "rownumber").
WITH base AS (
    SELECT
        Primary_key,
        checksum,
        FIRST_VALUE(datefrom) OVER (PARTITION BY Primary_key, checksum ORDER BY datefrom) AS Datefrom,
        LAST_VALUE(dateto) OVER (PARTITION BY Primary_key, checksum ORDER BY datefrom) AS Dateto,
        ROW_NUMBER() OVER (PARTITION BY Primary_key, checksum ORDER BY datefrom) AS latest_record
    FROM Datalake.user
)
-- latest_record = 1 keeps the earliest row of every partition.
SELECT * FROM base WHERE latest_record = 1
Data shown as
Primarykey Checksum DateFrom Dateto
1 11 01:00 12/31/999
1 22 03:00 07:00
But the expected output is
Primarykey Checksum DateFrom Dateto
1 11 01:00 03:00
1 22 03:00 07:00
1 11 07:00 12/31/999
I tried multiple ways of doing this in a single query. Any good suggestions?

The reason you are getting only two rows is that your partitions use two columns, Primarykey and checksum, and those have only two distinct combinations. The extra row you want in the expected output has the same Primarykey and checksum (1, 11) as the first row of your expected output.
The thing I see in your data that would get your result would be if you included ActiveFlag into your partitions.
-- Collapse rows per (primary_key, checksum, active_flag) into one row that
-- carries the earliest datefrom and the latest dateto of the partition.
WITH base AS (
    SELECT
        primary_key,
        checksum,
        FIRST_VALUE(datefrom) OVER (
            PARTITION BY primary_key, checksum, active_flag
            ORDER BY datefrom
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        ) AS datefrom,
        -- The frame is spelled out so LAST_VALUE always looks at the whole
        -- partition instead of depending on the engine's default frame.
        LAST_VALUE(dateto) OVER (
            PARTITION BY primary_key, checksum, active_flag
            ORDER BY datefrom
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        ) AS dateto,
        -- ROW_NUMBER (with the underscore) is the actual function name.
        ROW_NUMBER() OVER (
            PARTITION BY primary_key, checksum, active_flag
            ORDER BY datefrom
        ) AS latest_record
    FROM Datalake.user
)
-- Keep one representative row (the earliest) per partition.
SELECT * FROM base WHERE latest_record = 1

Try this code. Should work both in Snowflake and Oracle:
The idea is to start a new group whenever the checksum changes, with rows ordered by datefrom.
**SNOWFLAKE**:
-- Gaps-and-islands: merge consecutive rows (ordered by datefrom) that share
-- a checksum into one row per run, separately for every Primarykey.
WITH changes AS (
    -- Flag the first row of every run. LAG is partitioned by Primarykey so
    -- rows of different keys are never compared with each other. The first
    -- row of a key has no predecessor (LAG is NULL) and falls into ELSE 1.
    SELECT
        Primarykey,
        checksum,
        datefrom,
        dateto,
        CASE
            WHEN checksum = LAG(checksum) OVER (PARTITION BY Primarykey ORDER BY datefrom) THEN 0
            ELSE 1
        END AS is_run_start
    FROM Datalake.user
),
runs AS (
    -- The running count of run starts is a deterministic run id per key.
    SELECT
        Primarykey,
        checksum,
        datefrom,
        dateto,
        SUM(is_run_start) OVER (
            PARTITION BY Primarykey
            ORDER BY datefrom
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS checksum_group
    FROM changes
),
base AS (
    SELECT
        Primarykey,
        checksum,
        FIRST_VALUE(datefrom) OVER (
            PARTITION BY Primarykey, checksum, checksum_group
            ORDER BY datefrom
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        ) AS Datefrom,
        LAST_VALUE(dateto) OVER (
            PARTITION BY Primarykey, checksum, checksum_group
            ORDER BY datefrom
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        ) AS Dateto,
        ROW_NUMBER() OVER (
            PARTITION BY Primarykey, checksum, checksum_group
            ORDER BY datefrom
        ) AS latest_record
    FROM runs
)
-- One row per run: the earliest row, carrying the run's full date range.
SELECT * FROM base WHERE latest_record = 1
**Oracle**:
-- Gaps-and-islands: merge consecutive rows (ordered by datefrom) that share
-- a checksum into one row per run, separately for every Primarykey.
WITH changes AS (
    -- Flag the first row of every run. LAG is partitioned by Primarykey so
    -- rows of different keys are never compared with each other. The first
    -- row of a key has no predecessor (LAG is NULL) and falls into ELSE 1.
    SELECT
        Primarykey,
        checksum,
        datefrom,
        dateto,
        CASE
            WHEN checksum = LAG(checksum) OVER (PARTITION BY Primarykey ORDER BY datefrom) THEN 0
            ELSE 1
        END AS is_run_start
    FROM Datalake.user
),
runs AS (
    -- The running count of run starts is a deterministic run id per key.
    SELECT
        Primarykey,
        checksum,
        datefrom,
        dateto,
        SUM(is_run_start) OVER (
            PARTITION BY Primarykey
            ORDER BY datefrom
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS checksum_group
    FROM changes
),
base AS (
    SELECT
        Primarykey,
        checksum,
        FIRST_VALUE(datefrom) OVER (
            PARTITION BY Primarykey, checksum, checksum_group
            ORDER BY datefrom
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        ) AS Datefrom,
        -- With ORDER BY and no explicit frame, the window ends at the current
        -- row, so LAST_VALUE would return the row's own dateto. The explicit
        -- frame makes it return the run's final dateto instead.
        LAST_VALUE(dateto) OVER (
            PARTITION BY Primarykey, checksum, checksum_group
            ORDER BY datefrom
            ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
        ) AS Dateto,
        ROW_NUMBER() OVER (
            PARTITION BY Primarykey, checksum, checksum_group
            ORDER BY datefrom
        ) AS latest_record
    FROM runs
)
-- One row per run: the earliest row, carrying the run's full date range.
SELECT * FROM base WHERE latest_record = 1

I tweaked the query so that it works on the entire data set.
Because the primary key was missing, it was failing on the entire data set.
Modified working query
enter image description here

Related

SQL query to get start and end date from a result set

I am working on one of requirement the raw data is in following format
Requirement - Startdate should be the date when status changed to 1 and enddate should be the 1st date after the record status changed from 1 to any other number.
Customer
Status
Date
A123
0
7/2/2021
A123
0
7/15/2021
A123
0
7/22/2021
A123
1
8/18/2021
A123
1
9/8/2021
A123
0
12/1/2021
A123
0
1/21/2022
A123
1
3/6/2022
A123
1
3/7/2022
A123
0
3/15/2022
B123
1
1/1/2022
B123
0
1/6/2022
C123
1
1/2/2022
C123
2
1/8/2022
C123
0
1/9/2022
expected output
Customer
StartDate
EndDate
A123
8/18/2021
12/1/2021
A123
9/8/2021
12/1/2021
A123
3/6/2022
3/15/2022
A123
3/7/2022
3/15/2022
B123
1/1/2022
1/6/2022
C123
1/2/2022
1/8/2022
Query I tried to get the output is below, I am getting the output for Customer B123 and C123, but not for A123 as expected.
Query Explanation - In 1st part of query I am taking all the records with status = 1 and in next part taking only those records where status is not equal to 1, and joining these 2 datasets based on Customer and row number generated.
-- Asker's attempt (reported as not working for customer A123).
-- Subquery A lists the Status = 1 rows, subquery B the Status != 1 rows that
-- fall after the overall first Status = 1 date; the two are paired on
-- Customer and a RANK()-based row number.
SELECT A.[Customer],A.StartDate,B.EndDate
from
(
-- NOTE: RANK() is partitioned by Status only, not by Customer, so the
-- numbering runs across all customers together.
SELECT [Customer],MIN(Date) AS STARTDATE,[Status],RANK() OVER (PARTITION BY [STATUS] ORDER BY Date ASC) AS ROWNUM
FROM table1
WHERE [STATUS] = 1
GROUP BY Customer,Date,[Status]
) A
LEFT JOIN
(
-- NOTE: here every distinct non-1 Status value gets its own rank sequence,
-- again shared across customers.
SELECT [Customer],MIN(Date) AS ENDDATE,[Status],RANK() OVER (PARTITION BY [STATUS] ORDER BY Date ASC) AS ROWNUM
FROM table1
WHERE [STATUS] != 1
-- The cut-off is the earliest Status = 1 date of the whole table, not of
-- the current customer.
AND Date>(
SELECT MIN(Date) AS STARTDATE
FROM table1
WHERE [STATUS] = 1
)
GROUP BY Customer,Date,[Status]
) B
ON
(
A.[Customer] = B.[Customer]
-- Pairs the n-th start with the n-th end purely by rank position.
AND A.RowNum = B.RowNum
)
ORDER BY A.Startdate
First you list the rows where Status = 1 and then use CROSS APPLY to get the corresponding minimum Date where the Status is not equal to 1
-- For every Status = 1 row, look up the earliest later date of the same
-- customer whose Status is not 1. The applied subquery is a scalar aggregate,
-- so it always yields exactly one row (NULL when no later row exists).
SELECT
    cur.[Customer],
    cur.[Date] AS StartDate,
    nxt.[Date] AS EndDate
FROM Table1 AS cur
CROSS APPLY
(
    SELECT MIN(later.[Date]) AS [Date]
    FROM Table1 AS later
    WHERE later.[Customer] = cur.[Customer]
      AND later.[Date] > cur.[Date]
      AND later.[Status] <> 1
) AS nxt
WHERE cur.[Status] = 1
ORDER BY cur.[Customer], cur.[Date]
Here is a more efficient way to do this without a self-join.
-- Start/end dates of "status = 1" runs per customer, without a self-join.
WITH cte01only AS
(
    -- Reduce Status to a two-valued flag: 1 when Status = 1, otherwise 0.
    SELECT *, CASE Status WHEN 1 THEN 1 ELSE 0 END AS Status1
    FROM table1
),
cteDifference AS
(
    -- Difference of two row numbers: constant within a contiguous run of
    -- equal Status1 values for a customer. It identifies a run only together
    -- with Status1 and is NOT ordered chronologically across runs.
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY Customer ORDER BY Date, Status1)
      - ROW_NUMBER() OVER (PARTITION BY Customer, Status1 ORDER BY Date) AS StatusGroup
    FROM cte01only
),
cteGroup AS
(
    -- One row per run, carrying the run's first date.
    SELECT Customer, StatusGroup, Status1, MIN(Date) AS StartDate
    FROM cteDifference
    GROUP BY Customer, StatusGroup, Status1
),
cteNextDate AS
(
    -- The next run's StartDate is this run's EndDate. Runs must be ordered
    -- by StartDate; ordering by StatusGroup would pair runs out of sequence.
    SELECT Customer, StatusGroup, Status1, StartDate,
        LEAD(StartDate, 1, NULL) OVER (PARTITION BY Customer ORDER BY StartDate) AS EndDate
    FROM cteGroup
)
SELECT Customer, StartDate, EndDate
FROM cteNextDate
WHERE Status1 = 1
ORDER BY Customer, StartDate
The key trick here is the second CTE, which uses the difference of two ROW_NUMBER() functions to tag the customer records (with the StatusGroup column) into separate partitions by contiguous runs of records whose status is 1 or not 1. After that, they can be grouped according to that tag to get the start dates, and the LEAD() function is then used to get the following group's StartDate as the current grouping's EndDate.
(There may be a more compact way to express this, but I like to layout each stage as a separate CTE.)

Case when first instance of unique ID

I'm in Snowflake and am trying to mark the first occurrence of a unique ID in a column. I've been playing around with first_value but am not really getting anywhere.
So my data looks something like this:
ID Date
123 1/2019
123 2/2019
123 3/2019
234 2/2019
234 3/2019
And ideally I want something like this:
ID Date First?
123 1/2019 1
123 2/2019 0
123 3/2019 0
234 2/2019 1
234 3/2019 0
How do I accomplish this?
You want ROW_NUMBER:
-- Flag the earliest Date of every ID with 1 and all other rows with 0.
SELECT
    ID,
    Date,
    CASE
        WHEN ROW_NUMBER() OVER (PARTITION BY ID ORDER BY Date) = 1 THEN 1
        ELSE 0
    END AS First
FROM
    schema.table
ORDER BY ID, Date
;
This checks whether the current row is the first date for the ID, and if it is, assigns a value of 1 (otherwise 0).
LAG can also be used to solve this..
-- A row is the first of its id when no earlier row exists in the partition,
-- i.e. when LAG returns NULL. The flag is a boolean.
SELECT
    id,
    date,
    (LAG(id) OVER (PARTITION BY id ORDER BY date)) IS NULL AS first
FROM table_name;
Which can be also done with FIRST_VALUE like
-- A row is the first of its id when its date equals the earliest date in
-- the id's partition. FIRST_VALUE must therefore read the date column,
-- not id. The flag is a boolean.
SELECT
    id,
    date,
    FIRST_VALUE(date) OVER (PARTITION BY id ORDER BY date) = date AS first
FROM table_name;
If your intention is to retrieve the first occurrence of a unique ID in a column then the row_number() or the dense_rank() function can help.
-- Keep exactly one row per ID: the one with the earliest date.
SELECT *
FROM (
    SELECT
        ID,
        Date,
        ROW_NUMBER() OVER (PARTITION BY ID ORDER BY date) AS row_number
    FROM table1
) AS cte
WHERE row_number = 1;
-- Same idea with DENSE_RANK: keeps every row that shares the earliest date
-- of its ID, so ties on date return more than one row per ID.
-- Reads table1, the same table as the ROW_NUMBER variant.
WITH cte AS
(
    SELECT
        ID,
        Date,
        DENSE_RANK() OVER (PARTITION BY ID ORDER BY date) AS rank
    FROM table1
)
SELECT * FROM cte WHERE rank = 1;

T-SQL : how to select row with multiple conditions

I have below data set in SQL Server and I need to select the data with conditions in order below:
First, check whether date_end is 1/1/2099; if so, among the rows with the same employee_id, select the row that has the smallest gap in days and whose skill_group is not SWAT. In this case that is row 2.
Second, for rows that do not have a date_end of 1/1/2099, select the row that has the most recent date_end. In this case it's row 4.
ID employee_id last_name first_name date_start date_end skill_group
---------------------------------------------------------------------------
1 N05E0F Mike Pamela 12/19/2013 1/1/2099 SWAT
2 N05E0F Mike Pamela 9/16/2015 1/1/2099 Welcome Team
3 NSH8A David Smith 12/19/2013 9/16/2016 Unlicensed
4 NSH8A David Smith 8/16/2015 10/16/2016 CMT
There are many ways to do this. Here are some of them:
top with ties version:
-- TOP 1 WITH TIES returns every row whose sort key equals the best one.
-- Because the sort key is a per-employee row number, that is the row
-- numbered 1 for each employee_id.
SELECT TOP 1 WITH TIES
    *
FROM tbl
WHERE skill_group <> 'SWAT'
ORDER BY
    ROW_NUMBER() OVER (
        PARTITION BY employee_id
        ORDER BY date_end DESC, DATEDIFF(DAY, date_start, date_end) ASC
    )
with common_table_expression as () using row_number() version:
-- Number each employee's non-SWAT rows by latest date_end first, then by
-- shortest date_start..date_end span, and keep the top row per employee.
WITH cte AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY employee_id
            ORDER BY date_end DESC, DATEDIFF(DAY, date_start, date_end) ASC
        ) AS rn
    FROM tbl
    WHERE skill_group <> 'SWAT'
)
SELECT *
FROM cte
WHERE rn = 1

SQL Server, first of each time series

A table 'readings' has a list of dates
[Date] [Value]
2015-03-19 00:30:00 1.2
2015-03-19 00:40:00 1.2
2015-03-19 00:50:00 0.1
2015-03-19 01:00:00 0.1
2015-03-19 01:10:00 2
2015-03-19 01:20:00 0.5
2015-03-19 01:30:00 0.5
I need to get the most recent instance where the value is below a set point (in this case the value 1.0), but I only want the start (earliest datetime) where the value was below 1 for consecutive times.
So with the above data I want to return 2015-03-19 01:20:00, as the most recent block of times where value < 1, but I want the start of that block.
This SQL just returns the most recent date, rather than the first date whilst the value has been low (so returns 2015-03-19 01:30:00 )
-- Asker's attempt: returns the single most recent reading at or below 1,
-- not the start of the run that reading belongs to.
SELECT TOP 1
    *
FROM readings
WHERE value <= 1
ORDER BY [date] DESC
I can't work out how to group the consecutive dates, to therefore only get the first ones
It is SQL Server, the real data isn't at exactly ten min intervals, and the readings table is about 70,000 rows- so fairly large!
Thanks, Charli
Demo
-- Earliest reading at or below 1 for each calendar day.
SELECT *
FROM (
    SELECT
        [Date],
        Value,
        ROW_NUMBER() OVER (
            PARTITION BY CAST([Date] AS DATE)
            ORDER BY [Date] ASC
        ) AS RN
    FROM #table
    WHERE value <= 1
) AS t
WHERE t.RN = 1
-- Latest date among all readings whose value is at or below 1.
SELECT MAX([date])
FROM [dbo].[readings]
WHERE [value] <= 1
You seem to want the minimum date for each set of consecutive records having a value that is less than 1. The query below returns exactly these dates:
-- First date of every run of consecutive readings with Value < 1.
SELECT MIN([Date])
FROM (
    SELECT
        [Date],
        [Value],
        -- Row position minus the running count of Value < 1 rows: the result
        -- stays constant while consecutive rows all have Value < 1.
        ROW_NUMBER() OVER (ORDER BY [Date])
      - COUNT(CASE WHEN [Value] < 1 THEN 1 END) OVER (ORDER BY [Date]) AS grp
    FROM mytable
) AS t
WHERE Value < 1
GROUP BY grp
grp calculated field identifies consecutive records having Value<1.
Note: The above query will work for SQL Server 2012+.
Demo here
Edit:
To get the date value of the last group you can modify the above query to:
-- First date of the most recent run of consecutive readings with Value < 1.
SELECT TOP 1 MIN([Date])
FROM (
    SELECT
        [Date],
        [Value],
        -- Row position minus the running count of Value < 1 rows: the result
        -- stays constant while consecutive rows all have Value < 1.
        ROW_NUMBER() OVER (ORDER BY [Date])
      - COUNT(CASE WHEN [Value] < 1 THEN 1 END) OVER (ORDER BY [Date]) AS grp
    FROM mytable
) AS t
WHERE Value < 1
GROUP BY grp
-- Highest grp first, so TOP 1 picks the last run.
ORDER BY grp DESC
Demo here

T-SQL - aggregate in all rows using aggregate from correlated sub-query

store item datekey onhand salesunits
--------------------------------------------
001 A 50 65 2
001 A 51 8 4
001 A 52 0 8
--------------------------------------------
What I need to accomplish: to get the latest onhand greater than zero minus the total units sold, by store and item. So in the example above it would be 8-14=-6.
I am using a correlated sub-query to determine the latest datekey and then joining back to the main query. But obviously by doing so I lose the data related to the other rows necessary to sum the salesunits:
This is what I have and it's wrong:
-- Asker's attempt (stated to be wrong): joining back on the latest datekey
-- leaves only that one row per store/item, so the SUM no longer covers the
-- other rows' sales.
SELECT
    s1.Store,
    s1.Item,
    s1.OnHand,
    SUM(salesunit)
FROM sales AS s1
INNER JOIN (
    SELECT TOP 1
        store,
        item,
        MAX(DateKey) AS datekey
    FROM sales
    WHERE ISNULL(onhand, 0) > 0
      AND DateKey IN (50, 51, 52)
    GROUP BY store, item
) AS s2
    ON s2.store = s1.store
   AND s2.item = s1.item
   AND s2.datekey = s1.datekey
GROUP BY s1.Store, s1.Item, s1.OnHand
Thanks for your help!
-- Per store/item: total units sold, plus the onhand value of the row ranked
-- first when ordering by SIGN(onhand) descending, then datekey descending.
;WITH totals AS (
    SELECT
        *,
        SUM(salesunits) OVER (PARTITION BY store, item) AS totalsalesunits,
        ROW_NUMBER() OVER (
            PARTITION BY store, item
            ORDER BY SIGN(onhand) DESC, datekey DESC
        ) AS rnk
    FROM sales
)
SELECT
    store,
    item,
    onhand,
    totalsalesunits
FROM totals
WHERE rnk = 1
I would do it something like this:
First some test data:
-- Test data. A '#'-prefixed name is a temporary table and is created with
-- CREATE TABLE; DECLARE ... TABLE is only valid for '@' table variables.
CREATE TABLE #tbl
(
    store VARCHAR(4),
    item VARCHAR(2),
    datekey INT,
    onhand INT,
    salesUnits INT
)

INSERT INTO #tbl (store, item, datekey, onhand, salesUnits)
VALUES
    ('001', 'A', 50, 65, 2),
    ('001', 'A', 51, 8, 4),
    ('001', 'A', 52, 0, 8)
Then the query like this:
-- Latest row with onhand > 0 per store/item, together with the store/item
-- sales total and "onhand minus total sales" (8 - 14 = -6 for the sample).
;WITH cteTotalSales AS
(
    SELECT
        -- Total per store/item; a constant partition would sum across
        -- every store and item in the table.
        SUM(tbl.salesUnits) OVER (PARTITION BY tbl.store, tbl.item) AS TotalSalesUnit,
        tbl.store,
        tbl.item,
        ISNULL(tbl.onhand, 0) AS onhand,
        tbl.salesUnits,
        tbl.datekey
    FROM
        #tbl AS tbl
), cteLatest AS
(
    SELECT
        RANK() OVER
        (
            PARTITION BY cteTotalSales.store, cteTotalSales.item
            ORDER BY cteTotalSales.datekey DESC
        ) AS iRank,
        cteTotalSales.store,
        cteTotalSales.item,
        cteTotalSales.onhand,
        cteTotalSales.salesUnits,
        cteTotalSales.datekey,
        cteTotalSales.TotalSalesUnit,
        cteTotalSales.onhand - cteTotalSales.TotalSalesUnit AS onhandMinusSales
    FROM
        cteTotalSales
    WHERE
        -- Only rows that still have stock are candidates for "latest onhand".
        -- The total is computed in the previous CTE, before this filter, so
        -- it still includes the sales of the rows filtered out here.
        cteTotalSales.onhand > 0
)
SELECT
    *
FROM
    cteLatest
WHERE
    iRank = 1
-- Per store/item: onhand of the top-ranked row minus total sales, plus the
-- total itself. Rows with onhand = 0 get sort key -1, others their datekey.
-- NOTE(review): presumably datekey is always greater than -1, so zero-stock
-- rows sort last. Confirm against the data.
;WITH ranked AS
(
    SELECT
        ROW_NUMBER() OVER (
            PARTITION BY store, item
            ORDER BY CASE WHEN onhand = 0 THEN -1 ELSE datekey END DESC
        ) AS rn,
        Store,
        Item,
        OnHand,
        salesunit
    FROM sales
)
SELECT
    store,
    item,
    SUM(CASE WHEN rn = 1 THEN onhand END) - SUM(salesunit) AS OnHand,
    SUM(salesunit) AS sumsalesunit
FROM ranked
GROUP BY store, item

Resources