SQL Server : finding gaps in employment - island and gap problem - sql-server

I have been going through stack overflow to try and work this out over the last week and I still can't work out a viable solution so was wondering if anyone could offer me some help/advice?
Explanation of the data structures
I have the following tables:
Position table (zz_position) which is used to hold the details of the
position (Job ID) include the date range that it is valid for.
PosNo Description Date_From Date_To
---------------------------------------------------------
10001 System Administrator 20170101 20231231
Resource table (zz_resource) which is used to hold the details of a resource (employee) including the date that they joined the company and left it
resID description date_from date_to
------------------------------------------
100 Sam 20160101 20991231
101 Joe 20150101 20991231
Employment table (zz_employment) which is used to link position to resources within a date from and to range
PosNo resID Date_From Date_To seqNo
---------------------------------------------------
10001 100 20180101 20180401 1
10001 101 20180601 20191231 2
10001 100 20200101 20991231 3
Problem
Now due to people changing positions, a post might not be filled for a period of time and what I am trying to do is produce a report that I can use to give me a breakdown of the status of a post at any point in time.
I know that I can produce one which fully maps each day using a calendar table however what I want is a report which produces the data in the following aggregated format:
PosNo resID Date_From Date_To seqNo
-------------------------------------------------
10001 NULL 20170101 20171231 0
10001 100 20180101 20180401 1
10001 NULL 20180402 20180530 0
10001 101 20180601 20191231 2
10001 100 20200101 20231231 3
-- Sample employment rows: three consecutive assignments on position 10001.
INSERT INTO zz_employment (posNo, resID, date_from, date_to, seqNo)
VALUES
    ('10001', '100', '2018-01-01 00:00:00.000', '2018-04-01 00:00:00.000', 1),
    ('10001', '101', '2018-06-01 00:00:00.000', '2019-12-31 00:00:00.000', 2),
    ('10001', '100', '2020-01-01 00:00:00.000', '2099-12-31 00:00:00.000', 3)
(Note how the report has taken the lines in the employment table and produced a fully specced-out life of the employment, where the first NULL line's date from is pulled from the position start date and the last line's date to is pulled from the position end date.)
Ideally I would like this as a view/function however due to the complexity I would be more than happy to have a series of T SQL statements that I can run each night as part of a data warehouse routine.
Rules
all dates are truncated to the day (stored as datetime), so that a date_to references the date that it ends, not the date and time that it ends
if the post/employment/resource has no end date then it will be denoted as 20991231
if the employment itself is open ended then the date to in the employment table is denoted as 20991231 even though the position itself might end on 20231231. Ideally I would like the result to respect the position end date.
SQL code:
-- Position (job) master: one row per post with the date range it is valid for.
CREATE TABLE zz_position (
    posNo       VARCHAR(25) NOT NULL,
    description VARCHAR(25) NOT NULL,
    date_from   DATETIME    NULL,
    date_to     DATETIME    NULL
);

INSERT INTO zz_position (posNo, description, date_from, date_to)
VALUES
    ('10001', 'System Administrator', '2017-01-01 00:00:00.000', '2020-12-31 00:00:00.000');
GO

-- Resource (employee) master: one row per person with join/leave dates.
CREATE TABLE zz_resource (
    resID       VARCHAR(25) NOT NULL,
    description VARCHAR(25) NOT NULL,
    date_from   DATETIME    NULL,
    date_to     DATETIME    NULL
);

INSERT INTO zz_resource (resID, description, date_from, date_to)
VALUES
    ('100', 'Sam', '2016-01-01 00:00:00.000', '2099-12-31 00:00:00.000'),
    ('101', 'Joe', '2015-01-01 00:00:00.000', '2099-12-31 00:00:00.000');
GO

-- Employment: links a resource to a position for a date range.
CREATE TABLE zz_employment (
    posNo     VARCHAR(25) NOT NULL,
    resID     VARCHAR(25) NOT NULL,
    date_from DATETIME    NULL,
    date_to   DATETIME    NULL,
    seqNo     INT         NULL
);

INSERT INTO zz_employment (posNo, resID, date_from, date_to, seqNo)
VALUES
    ('10001', '100', '2018-01-01 00:00:00.000', '2018-04-01 00:00:00.000', 1),
    ('10001', '101', '2018-06-01 00:00:00.000', '2019-12-31 00:00:00.000', 2),
    ('10001', '100', '2020-01-01 00:00:00.000', '2099-12-31 00:00:00.000', 3)

There are 2 caveats for this problem:
A calendar table.
A way to correctly group unemployed periods when there's an employed period in between.
The following solution uses a calendar table (SQL included) and a DATEDIFF() with anchor-date trick to group correctly for the 2nd point.
Complete DB Fiddle here.
Solution (explanation below):
-- Report query: collapses the per-day rows of each position into one row per
-- contiguous period, with resID NULL / seqNo 0 for days without employment.
;WITH AllPositionDates AS
(
-- One row per position for every Calendar day inside the position's date range.
SELECT
T.posNo,
C.GeneratedDate
FROM
zz_position AS T
INNER JOIN Calendar AS C ON C.GeneratedDate BETWEEN T.date_from AND T.date_to
),
AllEmployedDates AS
(
-- One row per employment record for every Calendar day inside its date range.
SELECT
T.posNo,
T.resID,
T.seqNo,
C.GeneratedDate
FROM
zz_employment AS T
INNER JOIN Calendar AS C ON C.GeneratedDate BETWEEN T.date_from AND T.date_to
),
PositionsByEmployed AS
(
-- Every position day with the matching employment (if any) attached.
SELECT
P.posNo,
P.GeneratedDate,
E.resID,
E.seqNo,
-- Row number per position, counted separately for unmatched (1) and matched (2) days, in date order.
NullRowNumber = ROW_NUMBER() OVER (
PARTITION BY
P.posNo,
CASE WHEN E.posNo IS NULL THEN 1 ELSE 2 END
ORDER BY
P.GeneratedDate ASC)
FROM
AllPositionDates AS P
LEFT JOIN AllEmployedDates AS E ON
P.posNo = E.posNo AND
P.GeneratedDate = E.GeneratedDate
)
SELECT
P.posNo,
P.resID,
Date_From = MIN(P.GeneratedDate),
Date_To = MAX(P.GeneratedDate),
seqNo = ISNULL(P.seqNo, 0)
FROM
PositionsByEmployed AS P
GROUP BY
P.posNo,
P.resID,
P.seqNo,
-- Only for unemployed days: row number minus the day offset from the anchor date
-- stays constant across consecutive days and changes after an employed stretch.
CASE WHEN P.resId IS NULL THEN P.NullRowNumber - DATEDIFF(DAY, '2000-01-01', P.GeneratedDate) END -- GroupingValue
ORDER BY
P.posNo,
Date_From,
Date_To
The result:
posNo resID Date_From Date_To seqNo
10001 NULL 2017-01-01 2017-12-31 0
10001 100 2018-01-01 2018-04-01 1
10001 NULL 2018-04-02 2018-05-31 0
10001 101 2018-06-01 2019-12-31 2
10001 100 2020-01-01 2020-12-31 3
Explanation
First, the creation of a calendar table. It holds 1 row for each day, and in this example it is limited to the range between the first and last possible day of the job positions:
-- Build the Calendar table: one row per day from the earliest position start
-- to the latest position end found in zz_position.
-- Local variables use the @ prefix; a leading # denotes a temp table in T-SQL.
DECLARE @DateStart DATE = (SELECT MIN(P.date_from) FROM zz_position AS P)
DECLARE @DateEnd DATE = (SELECT MAX(P.date_to) FROM zz_position AS P)

;WITH GeneratedDates AS
(
    -- Anchor row: the first day of the range.
    SELECT
        GeneratedDate = @DateStart
    UNION ALL
    -- Recursive step: add one day until the end of the range is reached.
    SELECT
        GeneratedDate = DATEADD(DAY, 1, G.GeneratedDate)
    FROM
        GeneratedDates AS G
    WHERE
        DATEADD(DAY, 1, G.GeneratedDate) <= @DateEnd
)
SELECT
    DateID = IDENTITY(INT, 1, 1),
    G.GeneratedDate
INTO
    Calendar
FROM
    GeneratedDates AS G
OPTION
    (MAXRECURSION 0) -- lift the default recursion limit of 100 levels
This generates the following (up to 2020-12-31, which is max date from sample data):
DateID GeneratedDate
1 2017-01-01
2 2017-01-02
3 2017-01-03
4 2017-01-04
5 2017-01-05
6 2017-01-06
7 2017-01-07
Now we use a join with a between to "spread" the periods of both the positions and the employees periods (on different CTEs), so we get 1 row for each day, for each position/employee.
-- AllPositionDates
-- One row per position for every Calendar day inside the position's date range.
SELECT
T.posNo,
C.GeneratedDate
FROM
zz_position AS T
INNER JOIN Calendar AS C ON C.GeneratedDate BETWEEN T.date_from AND T.date_to
-- AllEmployedDates
-- One row per employment record for every Calendar day inside its date range.
SELECT
T.posNo,
T.resID,
T.seqNo,
C.GeneratedDate
FROM
zz_employment AS T
INNER JOIN Calendar AS C ON C.GeneratedDate BETWEEN T.date_from AND T.date_to
With these, we join them together by position and date using LEFT JOIN, so we get all days of each position and the matching employee (if exists). We also calculate a row number for all NULL values for each position that we are gonna use later. Note that this row number increases 1 by 1 with each following date accordingly.
-- Walkthrough step: join the per-day position rows to the per-day employment rows.
;WITH AllPositionDates AS
(
-- One row per position for every Calendar day inside the position's date range.
SELECT
T.posNo,
C.GeneratedDate
FROM
zz_position AS T
INNER JOIN Calendar AS C ON C.GeneratedDate BETWEEN T.date_from AND T.date_to
),
AllEmployedDates AS
(
-- One row per employment record for every Calendar day inside its date range.
SELECT
T.posNo,
T.resID,
T.seqNo,
C.GeneratedDate
FROM
zz_employment AS T
INNER JOIN Calendar AS C ON C.GeneratedDate BETWEEN T.date_from AND T.date_to
)
-- PositionsByEmployee
SELECT
P.posNo,
P.GeneratedDate,
E.resID,
E.seqNo,
-- Row number per position, counted separately for unmatched (1) and matched (2) days, in date order.
NullRowNumber = ROW_NUMBER() OVER (
PARTITION BY
P.posNo,
CASE WHEN E.posNo IS NULL THEN 1 ELSE 2 END
ORDER BY
P.GeneratedDate ASC)
FROM
AllPositionDates AS P
-- LEFT JOIN keeps position days that have no employment; their E columns are NULL.
LEFT JOIN AllEmployedDates AS E ON
P.posNo = E.posNo AND
P.GeneratedDate = E.GeneratedDate
Now with the tricky part. If we calculate the amount of days of difference between a hard-coded date and each day, we get a similar "row number" that increases consistently for each date.
-- Show the day offset from the fixed anchor date next to the row number for each position day.
SELECT
P.posNo,
P.GeneratedDate,
DateDiff = DATEDIFF(DAY, '2000-01-01', P.GeneratedDate),
P.NullRowNumber
FROM
PositionsByEmployed AS P -- CTE declared with WITH in the full solution
ORDER BY
P.posNo,
P.GeneratedDate
We get the following:
posNo GeneratedDate DateDiff NullRowNumber
10001 2017-01-01 6210 1
10001 2017-01-02 6211 2
10001 2017-01-03 6212 3
10001 2017-01-04 6213 4
10001 2017-01-05 6214 5
10001 2017-01-06 6215 6
10001 2017-01-07 6216 7
10001 2017-01-08 6217 8
10001 2017-01-09 6218 9
If we add another column with the difference of these 2 (row number minus day offset) you will see that the value remains the same:
-- Add the grouping value: row number minus the day offset from the anchor date.
SELECT
P.posNo,
P.GeneratedDate,
DateDiff = DATEDIFF(DAY, '2000-01-01', P.GeneratedDate),
P.NullRowNumber,
GroupingValue = P.NullRowNumber - DATEDIFF(DAY, '2000-01-01', P.GeneratedDate)
FROM
PositionsByEmployed AS P -- CTE declared with WITH in the full solution
ORDER BY
P.posNo,
P.GeneratedDate
We get:
posNo GeneratedDate DateDiff NullRowNumber GroupingValue
10001 2017-01-01 6210 1 -6209
10001 2017-01-02 6211 2 -6209
10001 2017-01-03 6212 3 -6209
10001 2017-01-04 6213 4 -6209
10001 2017-01-05 6214 5 -6209
10001 2017-01-06 6215 6 -6209
10001 2017-01-07 6216 7 -6209
10001 2017-01-08 6217 8 -6209
10001 2017-01-09 6218 9 -6209
10001 2017-01-10 6219 10 -6209
But if we scroll down to the next run of values that are NULL for employee (from the ROW_NUMBER() PARTITION BY expression on E.PosNo), we see that the difference changes, since the ROW_NUMBER() kept increasing 1 by 1 while the DATEDIFF jumped because there are employed days in between:
posNo GeneratedDate DateDiff NullRowNumber GroupingValue
10001 2017-12-28 6571 362 -6209
10001 2017-12-29 6572 363 -6209
10001 2017-12-30 6573 364 -6209
10001 2017-12-31 6574 365 -6209
...
10001 2018-04-02 6666 366 -6300
10001 2018-04-03 6667 367 -6300
10001 2018-04-04 6668 368 -6300
10001 2018-04-05 6669 369 -6300
10001 2018-04-06 6670 370 -6300
10001 2018-04-07 6671 371 -6300
We use this "GroupingValue" as an additional GROUP BY to correctly separate position intervals that fall outside employed intervals.

Related

Calculate time between startdate and enddate and subtracting days that have no worktime

My goal is to check if an email is answered within 24 hours during workdays. The definition of a workday is a day for which time is registered in another table; this is because we sometimes work on a Saturday or a Sunday, and to exclude holidays. I made a view from that table that gives a 1 if the date has worktime or a 0 if there is no worktime registered.
DateWorked
HasWorked
2021-04-01 00:00:00.000
1
2021-04-02 00:00:00.000
1
2021-04-03 00:00:00.000
1
2021-04-04 00:00:00.000
0
2021-04-05 00:00:00.000
1
So for example a few situations:
1. MailIncoming: 2021-04-01 16:30:00, MailAnswering: 2021-04-02 14:00:00
This one is easy, I don't have to subtract anything and the mail is answered within 24 hours.
2. MailIncoming: 2021-04-01 09:30:00, MailAnswering: 2021-04-03 14:00:00
This one is also easy, I don't have to subtract anything and the mail is not answered within 24 hours.
3. MailIncoming: 2021-04-03 12:30:00, MailAnswering: 2021-04-05 10:00:00
There is 1 day where no one has worked, so I need to subtract 1 whole day from the total time, and in that case the email is answered within 24 hours during workdays.
4. MailIncoming: 2021-04-04 11:00:00, MailAnswering: 2021-04-05 18:00:00
The remaining 13 hours from 04-04 do not count toward the '24 hours during workdays', so the email is answered within 24 hours during workdays.
Also, there can be multiple dates with zero after each other.
So the outcome I'm looking for is:
MailIncoming
MailAnswering
TotalTime
TotalTimeWithoutDaysNotWorked
2021-04-04 11:00:00.000
2021-04-05 18:00:00.000
31
18
How can I calculate this last column? Or am I approaching this in the wrong way?
The query needs a way to generate calculated dates between MailIncoming and MailAnswering so there can be a LEFT JOIN (or INNER JOIN) to the WorkingDay table. In this case the query uses dbo.fnTally which is known to be a fast and efficient way to generate rows.
tables
-- Working-day lookup: HasNotWorked = 1 marks a day without registered work time.
DROP TABLE IF EXISTS #WorkingDay;
GO
CREATE TABLE #WorkingDay (
    DateWorked   DATE,
    HasNotWorked INT
);

-- Mails with the moment they arrived and the moment they were answered.
DROP TABLE IF EXISTS #MailIncoming;
GO
CREATE TABLE #MailIncoming (
    MailIncoming  DATETIME,
    MailAnswering DATETIME
);

INSERT INTO #WorkingDay (DateWorked, HasNotWorked)
VALUES
    ('2021-04-01', 0),
    ('2021-04-02', 0),
    ('2021-04-03', 0),
    ('2021-04-04', 1),
    ('2021-04-05', 0),
    ('2021-04-06', 0);

INSERT INTO #MailIncoming (MailIncoming, MailAnswering)
VALUES
    ('2021-04-01 16:30:00', '2021-04-02 14:00:00'),
    ('2021-04-01 09:30:00', '2021-04-03 14:00:00'),
    ('2021-04-03 12:30:00', '2021-04-05 10:00:00'),
    ('2021-04-04 11:00:00', '2021-04-05 18:00:00');
dbo.fnTally
CREATE FUNCTION [dbo].[fnTally]
/**********************************************************************************************************************
 Jeff Moden Script on SSC: https://www.sqlservercentral.com/scripts/create-a-tally-function-fntally

 Purpose: inline table-valued function returning a single column N of sequential integers.
 Parameters:
   @ZeroOrOne - when 0, an extra row with N = 0 is returned ahead of the 1..@MaxN sequence.
   @MaxN      - number of rows taken from the generated sequence, i.e. the highest N returned.
 Note: T-SQL parameters carry the @ prefix; a leading # would denote a temp table.
**********************************************************************************************************************/
        (@ZeroOrOne BIT, @MaxN BIGINT)
RETURNS TABLE WITH SCHEMABINDING AS
 RETURN WITH
  H2(N) AS ( SELECT 1
               FROM (VALUES
                     (1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    ,(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)
                    )V(N))            --16^2 or 256 rows
, H4(N) AS (SELECT 1 FROM H2 a, H2 b) --16^4 or 65,536 rows
, H8(N) AS (SELECT 1 FROM H4 a, H4 b) --16^8 or 4,294,967,296 rows
            -- Optional leading zero row, emitted only when the caller passes @ZeroOrOne = 0.
            SELECT N = 0 WHERE @ZeroOrOne = 0 UNION ALL
            SELECT TOP(@MaxN)
                   N = ROW_NUMBER() OVER (ORDER BY N)
              FROM H8
;
query
-- Per mail: hours between arrival and answer, plus the (negative) hours to subtract for non-working days.
select mi.MailIncoming, mi.MailAnswering,
-- Every generated day row of a mail carries the same hour difference, so avg() returns that value.
avg(datediff(hour, MailIncoming, MailAnswering)) hrs_to_ans,
-- A non-working day strictly between the arrival date and the answer date counts as a full -24 hours.
sum(case when w.HasNotWorked=1 and
v.calc_dt > mi_dt.inc_dt and
v.calc_dt < mi_dt.ans_dt
then -24
-- Any other non-working day: hours from the midnight after the arrival date back to the arrival time (a negative number).
-- NOTE(review): this branch also fires when the non-working day is the answer date - confirm that is intended.
when w.HasNotWorked=1
then datediff(hour, dateadd(day, 1, mi_dt.inc_dt), mi.MailIncoming)
else 0 end) hrs_to_sub
from #MailIncoming mi
-- Date-only versions of the two timestamps.
cross apply (values (cast(MailIncoming as date),
cast(MailAnswering as date))) mi_dt(inc_dt, ans_dt)
-- One row per day offset, starting at 0, up to the day difference between arrival and answer.
cross apply dbo.fnTally(0, datediff(day, mi.MailIncoming, mi.MailAnswering)) fn
cross apply (values (dateadd(day, fn.n, mi_dt.inc_dt))) v(calc_dt)
-- Days missing from #WorkingDay yield NULL and fall through to the ELSE 0 branch.
left join #WorkingDay w on v.calc_dt=w.DateWorked
group by mi.MailIncoming, mi.MailAnswering
order by mi.MailIncoming;
MailIncoming MailAnswering hrs_to_ans hrs_to_sub
2021-04-01 09:30:00.000 2021-04-03 14:00:00.000 53 0
2021-04-01 16:30:00.000 2021-04-02 14:00:00.000 22 0
2021-04-03 12:30:00.000 2021-04-05 10:00:00.000 46 -24
2021-04-04 11:00:00.000 2021-04-05 18:00:00.000 31 -13
I suggest you to use a column HasNotWorked, so the tables are
-- HasNotWorked = 1 marks a day without registered work time.
CREATE TABLE WorkingDay (
    DateWorked   DATE,
    HasNotWorked INT
);
CREATE TABLE MailIncoming (
    MailIncoming  DATETIME,
    MailAnswering DATETIME
);
and the rows
-- Six calendar days, of which only 2021-04-04 is a non-working day.
INSERT INTO WorkingDay (DateWorked, HasNotWorked)
VALUES
    ('2021-04-01', 0),
    ('2021-04-02', 0),
    ('2021-04-03', 0),
    ('2021-04-04', 1),
    ('2021-04-05', 0),
    ('2021-04-06', 0);

-- One mail that arrives on the non-working day.
INSERT INTO MailIncoming (MailIncoming, MailAnswering)
VALUES ('2021-04-04 11:00:00.000', '2021-04-06 18:00:00.000');
I want to calculate the start date. If the mail arrives on a working day, we must use the time of the mail; otherwise we use the first following working day, with
case when
(select HasNotWorked from WorkingDay where DateWorked = convert(date, MailIncoming)) = 1 then
(select min(DateWorked) from WorkingDay where DateWorked > MailIncoming and HasNotWorked = 0)
else MailIncoming end as startDate
and discard the day that are not working day
((select sum(HasNotWorked) from WorkingDay where DateWorked between convert(date, startDate)
and convert(date, MailAnswering)
) * 24) as numNotWorkingDay
so the query could be
-- startCalc: effective start of the clock for each mail. If the arrival date is flagged
-- as non-working, the clock starts on the first later working day; otherwise at arrival.
WITH startCalc AS (
    SELECT
        mi.MailIncoming,
        mi.MailAnswering,
        CASE
            WHEN (SELECT wd.HasNotWorked
                  FROM WorkingDay AS wd
                  WHERE wd.DateWorked = CONVERT(date, mi.MailIncoming)) = 1
                THEN (SELECT MIN(wd.DateWorked)
                      FROM WorkingDay AS wd
                      WHERE wd.DateWorked > mi.MailIncoming
                        AND wd.HasNotWorked = 0)
            ELSE mi.MailIncoming
        END AS startDate
    FROM MailIncoming AS mi
),
-- calcTable: elapsed hours from the effective start, and 24 hours for each non-working
-- day between the start date and the answer date (inclusive).
calcTable AS (
    SELECT
        sc.MailAnswering,
        sc.startDate,
        sc.MailIncoming,
        DATEDIFF(hour, sc.startDate, sc.MailAnswering) AS [hour],
        (SELECT SUM(wd.HasNotWorked)
         FROM WorkingDay AS wd
         WHERE wd.DateWorked BETWEEN CONVERT(date, sc.startDate)
                                 AND CONVERT(date, sc.MailAnswering)) * 24 AS numNotWorkingDay
    FROM startCalc AS sc
)
SELECT
    startDate,
    MailAnswering,
    MailIncoming,
    [hour],
    numNotWorkingDay,
    [hour] - numNotWorkingDay AS hourWitoutWorkingDay
FROM calcTable;
sqlfiddle

Snowflake: window function 'range' not support, how to query this?

I have a table of transactions that includes txn_date and cust_id.
For each customer that had a transaction in December, I want to know how many transactions that customer had in the 90 days previous to the given transaction.
This seems to be a query that I could run with a window function and a RANGE sliding window, but Snowflake doesn't support the RANGE sliding window frame.
How can I run this query in Snowflake?
How about something like this:
-- T1: every transaction made in December 2020.
WITH T1 AS (
    SELECT CUSTOMER_ID, TX_DATE
    FROM TRANSACTIONS
    WHERE TX_DATE BETWEEN '2020-12-01' AND '2020-12-31')
-- Transactions of the same customer in the 90 days up to each December transaction.
SELECT T2.CUSTOMER_ID, T2.TX_DATE
FROM TRANSACTIONS T2
-- Join on the customer of the December transaction. Comparing T2.CUSTOMER_ID with
-- itself would match every customer's transactions against every December row.
INNER JOIN T1 ON T1.CUSTOMER_ID = T2.CUSTOMER_ID
WHERE T2.TX_DATE BETWEEN (T1.TX_DATE - 90) AND T1.TX_DATE
So this is much the same as NickW's answer at first.
-- data: inline sample transactions, with the date string cast to a timestamp.
WITH data AS (
    SELECT
        txn_date::timestamp_ntz AS txn_date,
        cust_id,
        txn_id
    FROM VALUES
        ('2020-12-04', 0, 0),
        ('2020-12-03', 1, 1),
        ('2020-11-04', 1, 2),
        ('2020-10-04', 1, 3),
        ('2020-09-04', 1, 4), -- just on 90 days
        ('2020-09-02', 1, 5), -- too far
        ('2021-01-05', 1, 6)  -- in the future
        v(txn_date, cust_id, txn_id)
),
-- dec_txn: the December 2020 transactions with their 90-day look-back window.
dec_txn AS (
    SELECT
        txn_id,
        cust_id,
        DATEADD('day', -90, txn_date) AS win_start,
        txn_date AS win_end
    FROM data
    WHERE date_trunc('month', txn_date) = '2020-12-01'
)
-- Each December transaction next to every other transaction of the same customer in its window.
SELECT
    dt.*,
    t.*,
    datediff('days', dt.win_end, t.txn_date) AS win_time
FROM dec_txn AS dt
LEFT JOIN data AS t
    ON t.cust_id = dt.cust_id
    AND t.txn_date BETWEEN dt.win_start AND dt.win_end
    AND t.txn_id != dt.txn_id
;
which gives:
TXN_ID CUST_ID WIN_START WIN_END TXN_DATE CUST_ID TXN_ID WIN_TIME
1 1 2020-09-04 00:00:00.000 2020-12-03 00:00:00.000 2020-11-04 00:00:00.000 1 2 -29
1 1 2020-09-04 00:00:00.000 2020-12-03 00:00:00.000 2020-10-04 00:00:00.000 1 3 -60
1 1 2020-09-04 00:00:00.000 2020-12-03 00:00:00.000 2020-09-04 00:00:00.000 1 4 -90
0 0 2020-09-05 00:00:00.000 2020-12-04 00:00:00.000 NULL NULL NULL NULL
Thus, to get the counts we use:
-- data: inline sample transactions, with the date string cast to a timestamp.
WITH data AS (
    SELECT
        txn_date::timestamp_ntz AS txn_date,
        cust_id,
        txn_id
    FROM VALUES
        ('2020-12-04', 0, 0),
        ('2020-12-03', 1, 1),
        ('2020-11-04', 1, 2),
        ('2020-10-04', 1, 3),
        ('2020-09-04', 1, 4), -- just on 90 days
        ('2020-09-02', 1, 5), -- too far
        ('2021-01-05', 1, 6)  -- in the future
        v(txn_date, cust_id, txn_id)
),
-- dec_txn: the December 2020 transactions with their 90-day look-back window.
dec_txn AS (
    SELECT
        txn_id,
        cust_id,
        txn_date,
        DATEADD('day', -90, txn_date) AS win_start,
        txn_date AS win_end
    FROM data
    WHERE date_trunc('month', txn_date) = '2020-12-01'
)
-- Number of other transactions of the same customer in [win_start, win_end) per December transaction.
SELECT
    dt.cust_id,
    dt.txn_id,
    dt.txn_date,
    count(t.txn_id) AS c__prior_90_days_transaction
FROM dec_txn AS dt
LEFT JOIN data AS t
    ON t.cust_id = dt.cust_id
    AND t.txn_date >= dt.win_start
    AND t.txn_date < dt.win_end
    AND t.txn_id != dt.txn_id
GROUP BY dt.cust_id, dt.txn_id, dt.txn_date
ORDER BY dt.cust_id, dt.txn_id
;
giving:
CUST_ID TXN_ID TXN_DATE C__PRIOR_90_DAYS_TRANSACTION
0 0 2020-12-04 00:00:00.000 0
1 1 2020-12-03 00:00:00.000 3
What is not well defined in the question is what to do if there are many requests in december for one customer
What to do if there are multiple transactions in the same december day.
The above will return a row for each December transaction per customer, and it includes transactions that happen on the same day. But if your date/timestamp has a time component then it will only count transactions earlier in the same day.
But if you want prior days and the txn_date is just a date then
AND t.txn_date >= dt.win_start and t.txn_date < dt.win_end AND t.txn_id != dt.txn_id
should be used.
if txn_date is a timestamp, then dec_txn should be altered to:
-- Replacement for the dec_txn CTE when txn_date carries a time component:
-- the window bounds are truncated to whole days. txn_date is kept because the
-- counting query selects dt.txn_date.
dec_txn AS (
    SELECT txn_id,
        cust_id,
        txn_date,
        DATEADD('day', -90, txn_date::date) AS win_start,
        txn_date::date AS win_end
    FROM data
    WHERE date_trunc('month', txn_date) = '2020-12-01'
)
and now that the window timestamps are truncated to days, then you will have to workout if you want midnight transaction to count on the day, or if you don't have midnight timestamps...

13 Period Calendar 4-4-5 Calendar T-SQL MSSQL

I am trying to create a 13 period calendar in mssql but I am a bit stuck. I am not sure if my approach is the best way to achieve this. I have my base script which can be seen below:
-- Builds 4-week periods starting from the first Monday of the financial year.
Set DateFirst 1 -- Monday is day 1 of the week

-- Local variables use the @ prefix; #CalendarTable below is a genuine temp table.
Declare @Date1 date = '20180101' -- start date should always be the start of the financial year
Declare @Date2 date = '20181231' -- end date should always be the end of the financial year

-- Materialise the calendar rows for the year.
SELECT * INTO #CalendarTable
FROM dbo.CalendarTable(@Date1,@Date2,0,0,0)c

DECLARE @StartDate datetime,@EndDate datetime

-- First Monday in the range and the last date in the range.
SELECT @StartDate=MIN(CASE WHEN [Day]='Monday' THEN [Date] ELSE NULL END),
@EndDate=MAX([Date])
FROM #CalendarTable

;With Period_CTE(PeriodNo,Start,[End])
AS
(
    -- Anchor: period 1 runs for four weeks from the first Monday.
    SELECT 1,@StartDate,DATEADD(wk,4,@StartDate) -1
    UNION ALL
    -- Each following period is the previous one shifted by four weeks.
    SELECT PeriodNo+1,DATEADD(wk,4,Start),DATEADD(wk,4,[End])
    FROM Period_CTE
    WHERE DATEADD(wk,4,[End]) <= @EndDate
    OR PeriodNo+1 <=13
)
select * from Period_CTE
Which gives me this:
PeriodNo Start End
1 2018-01-01 00:00:00.000 2018-01-28 00:00:00.000
2 2018-01-29 00:00:00.000 2018-02-25 00:00:00.000
3 2018-02-26 00:00:00.000 2018-03-25 00:00:00.000
4 2018-03-26 00:00:00.000 2018-04-22 00:00:00.000
5 2018-04-23 00:00:00.000 2018-05-20 00:00:00.000
6 2018-05-21 00:00:00.000 2018-06-17 00:00:00.000
7 2018-06-18 00:00:00.000 2018-07-15 00:00:00.000
8 2018-07-16 00:00:00.000 2018-08-12 00:00:00.000
9 2018-08-13 00:00:00.000 2018-09-09 00:00:00.000
10 2018-09-10 00:00:00.000 2018-10-07 00:00:00.000
11 2018-10-08 00:00:00.000 2018-11-04 00:00:00.000
12 2018-11-05 00:00:00.000 2018-12-02 00:00:00.000
13 2018-12-03 00:00:00.000 2018-12-30 00:00:00.000
The result i am trying to get is
Even if I have to take a different approach I would not mind, as long as the result is the same as the above.
dbo.CalendarTable() is a function that returns the following results. I can share the code if desired.
I'd create a general number's table like suggested here and add a column Periode13.
The trick to get the tiling is the integer division:
-- Integer division of a zero-based running number by the period size yields the period index.
DECLARE @PeriodeSize INT=28; --13 "moon-months" of 28 days each
SELECT TOP 100 (ROW_NUMBER() OVER(ORDER BY (SELECT NULL))-1)/@PeriodeSize
FROM master..spt_values --just a table with many rows to show the principles
You can add this to an existing numbers table with a simple update statement.
UPDATE A fully working example (using the logic linked above)
-- Mockup numbers/calendar table held in a table variable.
-- Table variables and scalar variables use the @ prefix.
DECLARE @RunningNumbers TABLE (Number INT NOT NULL
                              ,CalendarDate DATE NOT NULL
                              ,CalendarYear INT NOT NULL
                              ,CalendarMonth INT NOT NULL
                              ,CalendarDay INT NOT NULL
                              ,CalendarWeek INT NOT NULL
                              ,CalendarYearDay INT NOT NULL
                              ,CalendarWeekDay INT NOT NULL);
DECLARE @CountEntries INT = 100000; -- number of consecutive days to generate
DECLARE @StartNumber INT = 0;       -- first running number (day offset from 2018-01-01)

-- Stacked cross joins provide enough rows; CteTally numbers them starting at @StartNumber.
WITH E1(N) AS(SELECT 1 FROM(VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1))t(N)), --10 ^ 1
E2(N) AS(SELECT 1 FROM E1 a CROSS JOIN E1 b), -- 10 ^ 2 = 100 rows
E4(N) AS(SELECT 1 FROM E2 a CROSS JOIN E2 b), -- 10 ^ 4 = 10,000 rows
E8(N) AS(SELECT 1 FROM E4 a CROSS JOIN E4 b), -- 10 ^ 8 = 100,000,000 rows
CteTally AS
(
    SELECT TOP(ISNULL(@CountEntries,1000000)) ROW_NUMBER() OVER(ORDER BY(SELECT NULL)) -1 + ISNULL(@StartNumber,0) As Nmbr
    FROM E8
)
INSERT INTO @RunningNumbers
SELECT CteTally.Nmbr,CalendarDate.d,CalendarExt.*
FROM CteTally
CROSS APPLY
(
    -- Calendar date = 2018-01-01 plus the running number in days.
    SELECT DATEADD(DAY,CteTally.Nmbr,{ts'2018-01-01 00:00:00'})
) AS CalendarDate(d)
CROSS APPLY
(
    SELECT YEAR(CalendarDate.d) AS CalendarYear
          ,MONTH(CalendarDate.d) AS CalendarMonth
          ,DAY(CalendarDate.d) AS CalendarDay
          ,DATEPART(WEEK,CalendarDate.d) AS CalendarWeek
          ,DATEPART(DAYOFYEAR,CalendarDate.d) AS CalendarYearDay
          ,DATEPART(WEEKDAY,CalendarDate.d) AS CalendarWeekDay
) AS CalendarExt;

--The mockup table from above is now filled and can be queried
WITH AddPeriode AS
(
    -- Integer division by 28 tiles the days into 28-day periods numbered from 1.
    SELECT Number/28 +1 AS PeriodNumber
          ,CalendarDate
          ,CalendarWeek
          ,r.CalendarDay
          ,r.CalendarMonth
          ,r.CalendarWeekDay
          ,r.CalendarYear
          ,r.CalendarYearDay
    FROM @RunningNumbers AS r
)
SELECT TOP 100 p.*
      ,(SELECT MIN(CalendarDate) FROM AddPeriode AS x WHERE x.PeriodNumber=p.PeriodNumber) AS [Start]
      ,(SELECT MAX(CalendarDate) FROM AddPeriode AS x WHERE x.PeriodNumber=p.PeriodNumber) AS [End]
      ,(SELECT MIN(CalendarDate) FROM AddPeriode AS x WHERE x.PeriodNumber=p.PeriodNumber AND x.CalendarWeek=p.CalendarWeek) AS [wkStart]
      ,(SELECT MAX(CalendarDate) FROM AddPeriode AS x WHERE x.PeriodNumber=p.PeriodNumber AND x.CalendarWeek=p.CalendarWeek) AS [wkEnd]
      -- Week index inside the period: every 7 days of the period start a new week.
      ,(ROW_NUMBER() OVER(PARTITION BY PeriodNumber ORDER BY CalendarDate)-1)/7+1 AS WeekOfPeriode
FROM AddPeriode AS p
ORDER BY CalendarDate
Try it out...
Hint: Do not use a VIEW or iTVF for this.
This is non-changing data and much better placed in a physically stored table with appropriate indexes.
I'm not entirely sure external links are accepted here, but I wrote an article that pulls off a 5-4-4 'Crop Year' fiscal year with all the code. Feel free to use all the code in these articles.
SQL Server Calendar Table
SQL Server Calendar Table: Fiscal Years

SQL count where between dates by month

Consider the below data:
ID Reference Manager LeaseFirstStart LeaseStop
1 KLEIN John 2008-04-02 00:00:00.000 2010-04-01 00:00:00.000
2 HAWKER John 2008-12-18 00:00:00.000 2010-09-17 00:00:00.000
3 SLEEP Bob 2008-01-23 00:00:00.000 2009-01-22 00:00:00.000
4 CODD Bob 2009-08-03 00:00:00.000 2010-08-02 00:00:00.000
5 ALLEN Bob 2008-01-30 00:00:00.000 2009-07-31 00:00:00.000
The earliest month is Jan 2008 and the latest month is Sep 2010.
How can I count the number of leases that were current per month? The output should look like this:
Month Number of Leases
2008-01 2
2008-02 2
2008-03 2
2008-04 3
2008-05 3
2008-06 3
2008-07 3
2008-08 4
… …
Ultimately, I want to use the answer to the question to create the dataset below for use in excel by the user so they can see who had how many leases during the data period.
Month Manager Number of Leases
2008-01 Bob 2
2008-01 John 0
2008-02 Bob 2
2008-02 John 0
2008-03 Bob 2
2008-03 John 0
2008-04 Bob 2
2008-04 John 1
2008-05 Bob 2
2008-05 John 1
2008-06 Bob 2
2008-06 John 1
2008-07 Bob 2
2008-07 John 1
2008-08 Bob 3
2008-08 John 1
… … …
I know I've done it before, but it was a long time ago and I remember it being messy. Thanks in advance!
-- Counts rows per year-month of the start column and per year-month of the end column,
-- then adds the two counts together per month. <tbl> is a placeholder for the lease table.
select sum (no) as no, datet
from (
    SELECT COUNT (*) as no, (convert(varchar,datepart (yyyy,[ Start] )) + '-' + convert(varchar, MONTH([ Start] ))) as datet
    FROM <tbl>
    GROUP BY (convert(varchar,datepart (yyyy,[ Start] )) + '-' + convert(varchar, MONTH([ Start] )))
    -- UNION ALL: a plain UNION would collapse a start row and an end row that share the same count and month.
    union all
    SELECT COUNT (*) as no, (convert(varchar,datepart (yyyy,[ End] )) + '-' + convert(varchar, MONTH([ End] ))) as datet
    FROM <tbl>
    GROUP BY (convert(varchar,datepart (yyyy,[ End] )) + '-' + convert(varchar, MONTH([ End] )))
) t
-- datet is selected next to an aggregate, so it must be grouped.
group by datet
This is very logical question, finally I created the sql which gives the desired result.. I verified every date and month count and its all ok.
-- Sample lease data in a table variable (table variables use the @ prefix).
Declare @t table (ID int, Reference varchar(50), Manager varchar(50), LeaseFirstStart datetime, LeaseStop datetime)
insert into @t
values
(1,'KLEIN','John','2008-04-02 00:00:00.000','2010-04-01 00:00:00.000'),
(2,'HAWKER','John','2008-12-18 00:00:00.000','2010-09-17 00:00:00.000'),
(3,'SLEEP','Bob','2008-01-23 00:00:00.000','2009-01-22 00:00:00.000'),
(4,'CODD','Bob','2009-08-03 00:00:00.000','2010-08-02 00:00:00.000'),
(5,'ALLEN','Bob','2008-02-28 00:00:00.000','2009-07-31 00:00:00.000')

-- Overall reporting range: earliest lease start to latest lease stop.
declare @lowerdate datetime, @currentdt datetime
select @lowerdate = min(LeaseFirstStart), @currentdt = max(LeaseStop) from @t

;with cte as
(
    -- One row per month in the range (first and last day) for every manager.
    select a.FirstDay,
           DATEADD(d, -1, DATEADD(m, DATEDIFF(m, 0, a.FirstDay) + 1, 0)) as Lastday,
           b.mng
    from
    (   select dateadd(m, datediff(m, 0, @lowerdate) + v.number, 0) as FirstDay
        from master..spt_values v
        where v.type = 'P' and v.number between 0 and datediff(m, @lowerdate, @currentdt)
    ) as a
    cross join (select distinct Manager as mng from @t) as b
)
select (convert(varchar, datepart(yyyy, cte.FirstDay)) + '-' + convert(varchar, MONTH(cte.FirstDay))) MonthAndYear,
       cte.mng as mng,
       count(l.Manager) cnt -- counts matched leases only, so months without leases show 0
from cte
left join @t as l on
    -- Interval overlap test: the lease starts on or before the month's last day and
    -- stops on or after its first day. This also catches a lease that lies entirely
    -- inside one month, which a test of only the month's first/last day would miss.
    l.LeaseFirstStart <= cte.Lastday
    and l.LeaseStop >= cte.FirstDay
    and cte.mng = l.Manager
group by cte.FirstDay, cte.mng
order by cte.FirstDay, cte.mng

islands and gaps tsql

I have been struggling with a problem that should actually be pretty simple, but after a full week of reading, googling, experimenting and so on, my colleague and I cannot find the proper solution. :(
The problem: We have a table with two values:
an employeenumber (P_ID, int) <--- identification of employee
a date (starttime, datetime) <--- time employee checked in
We need to know what periods each employee has been working.
When two dates are less than #gap days apart, they belong to the same period
For each employee there can be multiple records for any given day but I just need to know which dates he worked, I am not interested in the time part
As soon as there is a gap > #gap days, the next date is considered the start of a new range
A range is at least 1 day (example: 21-9-2011 | 21-09-2011) but has no maximum length. (An employee checking in every #gap - 1 days should result in a period from the first day he checked in until today)
What we think we need are the islands in this table where the gap in days is greater than #variable (#gap = 30 means 30 days)
So an example:
SOURCETABLE:
P_ID | starttime
------|------------------
12121 | 24-03-2009 7:30
12121 | 24-03-2009 14:25
12345 | 27-06-2011 10:00
99999 | 01-05-2012 4:50
12345 | 27-06-2011 10:30
12345 | 28-06-2011 11:00
98765 | 13-04-2012 10:00
12345 | 21-07-2011 9:00
99999 | 03-05-2012 23:15
12345 | 21-09-2011 12:00
45454 | 12-07-2010 8:00
12345 | 21-09-2011 17:00
99999 | 06-05-2012 11:05
99999 | 20-05-2012 12:45
98765 | 26-04-2012 16:00
12345 | 07-07-2012 14:00
99999 | 01-06-2012 13:55
12345 | 13-08-2012 13:00
Now what I need as a result is:
PERIODS:
P_ID | Start | End
-------------------------------
12121 | 24-03-2009 | 24-03-2009
12345 | 27-06-2012 | 21-07-2012
12345 | 21-09-2012 | 21-09-2012
12345 | 07-07-2012 | (today) OR 13-08-2012 <-- (less than #gap days ago) OR (last date in table)
45454 | 12-07-2010 | 12-07-2010
45454 | 17-06-2012 | 17-06-2012
98765 | 13-04-2012 | 26-04-2012
99999 | 01-05-2012 | 01-06-2012
I hope this is clear this way, I already thank you for reading this far, it would be great if you could contribute!
I've done a rough script that should get you started. Haven't bothered refining the datetimes and the endpoint comparisons might need tweaking.
-- Rough period finder. @gap (days) and @SOURCETABLE (P_ID, starttime) are declared in the setup script.
select
    P_ID,
    src.starttime,
    -- Use the last check-in as the end, unless src is itself the last check-in and it is
    -- less than @gap days old, in which case the period is reported as running until now.
    endtime = case when src.starttime <> lst.starttime or lst.starttime < DATEADD(dd,-1 * @gap,GETDATE()) then lst.starttime else GETDATE() end,
    frst.starttime, -- debugging output of the first outer apply
    lst.starttime   -- debugging output of the second outer apply
from @SOURCETABLE src
-- frst: earliest check-in of the same employee later than (src.starttime - @gap days).
outer apply (select starttime = MIN(starttime) from @SOURCETABLE sub where src.p_id = sub.p_id and sub.starttime > DATEADD(dd,-1 * @gap,src.starttime)) frst
-- lst: latest check-in of the same employee for which src.starttime is later than (that check-in - @gap days).
outer apply (select starttime = MAX(starttime) from @SOURCETABLE sub where src.p_id = sub.p_id and src.starttime > DATEADD(dd,-1 * @gap,sub.starttime)) lst
-- Keep only rows that are their own earliest-in-window check-in.
where src.starttime = frst.starttime
order by P_ID, src.starttime
I get the following output, which is a little different to yours, but I think it's OK:
P_ID starttime endtime starttime starttime
----------- ----------------------- ----------------------- ----------------------- -----------------------
12121 2009-03-24 07:30:00.000 2009-03-24 14:25:00.000 2009-03-24 07:30:00.000 2009-03-24 14:25:00.000
12345 2011-06-27 10:00:00.000 2011-07-21 09:00:00.000 2011-06-27 10:00:00.000 2011-07-21 09:00:00.000
12345 2011-09-21 12:00:00.000 2011-09-21 17:00:00.000 2011-09-21 12:00:00.000 2011-09-21 17:00:00.000
12345 2012-07-07 14:00:00.000 2012-07-07 14:00:00.000 2012-07-07 14:00:00.000 2012-07-07 14:00:00.000
12345 2012-08-13 13:00:00.000 2012-08-16 11:23:25.787 2012-08-13 13:00:00.000 2012-08-13 13:00:00.000
45454 2010-07-12 08:00:00.000 2010-07-12 08:00:00.000 2010-07-12 08:00:00.000 2010-07-12 08:00:00.000
98765 2012-04-13 10:00:00.000 2012-04-26 16:00:00.000 2012-04-13 10:00:00.000 2012-04-26 16:00:00.000
The last two output cols are the results of the outer apply sections, and are just there for debugging.
This is based on the following setup:
-- Test fixture: declares the gap threshold and loads the sample check-ins into a table variable.
declare @gap int;
set @gap = 30;        -- threshold in days used by the queries to split periods
set dateformat dmy;   -- the literals below are written day-month-year
-----P_ID----|----starttime----
declare @SOURCETABLE table (P_ID int, starttime datetime);
insert into @SOURCETABLE (P_ID, starttime)
values
    (12121, '24-03-2009 7:30'),
    (12121, '24-03-2009 14:25'),
    (12345, '27-06-2011 10:00'),
    (12345, '27-06-2011 10:30'),
    (12345, '28-06-2011 11:00'),
    (98765, '13-04-2012 10:00'),
    (12345, '21-07-2011 9:00'),
    (12345, '21-09-2011 12:00'),
    (45454, '12-07-2010 8:00'),
    (12345, '21-09-2011 17:00'),
    (98765, '26-04-2012 16:00'),
    (12345, '07-07-2012 14:00'),
    (12345, '13-08-2012 13:00');
UPDATE: Slight rethink. Now uses a CTE to work out the gaps forwards and backwards from each item, then aggregates those:
-- Collapses the check-ins of each p_id into periods: a period ends at the first check-in whose
-- distance to the following check-in exceeds @gap days.
-- Expects @gap (int, number of days) and the table variable @SOURCETABLE (P_ID, starttime)
-- to be declared earlier in the same batch.
--Get the gap between each starttime and the next and prev (use 999 to indicate non-closed intervals)
-- NOTE: the 999 sentinel only marks the last check-in as a period end while @gap < 999.
;WITH CTE_Gaps AS (
    select
        src.p_id,
        src.starttime,
        nextgap = coalesce(DATEDIFF(dd, src.starttime, nxt.starttime), 999), --Gap to the next entry
        prevgap = coalesce(DATEDIFF(dd, prv.starttime, src.starttime), 999), --Gap to the previous entry (not used by the final select)
        isold = case when DATEDIFF(dd, src.starttime, getdate()) > @gap then 1 else 0 end --Is starttime more than gap days ago? (not used by the final select)
    from
        @SOURCETABLE src
        -- An aggregate without GROUP BY always yields one row, so these applies never drop src rows;
        -- starttime is NULL when there is no later / earlier check-in.
        cross apply (
            select starttime = MIN(sub.starttime)
            from @SOURCETABLE sub
            where src.p_id = sub.p_id
              and sub.starttime > src.starttime
        ) nxt
        cross apply (
            select starttime = MAX(sub.starttime)
            from @SOURCETABLE sub
            where src.p_id = sub.p_id
              and sub.starttime < src.starttime
        ) prv
)
--select * from CTE_Gaps
select
    gap.p_id,
    starttime = min(gap.starttime),  -- earliest check-in sharing this period end
    endtime = nxt.starttime
from
    CTE_Gaps gap
    --Find the next starttime where its gap to the next > @gap
    cross apply (
        select starttime = MIN(sub.starttime)
        from CTE_Gaps sub
        where gap.p_id = sub.p_id
          and sub.starttime >= gap.starttime
          and sub.nextgap > @gap
    ) nxt
group by gap.p_id, nxt.starttime
order by gap.p_id, nxt.starttime;
Jon most definitely has shown us the right direction. Performance was horrible though (4 million+ records in the database), and it looked like we were missing some information. With all that we learned from you we came up with the solution below. It uses elements of all the proposed answers and cycles through 3 temp tables before finally spewing out results, but the performance is good enough, as is the data it generates.
-- Step 1 of the report: collect the distinct check-in days per employee (#temp1) and derive
-- the boundaries of every gap longer than @gap days (#temp2).
declare @gap int;
declare @Employee_id int;  -- declared but not referenced in the visible part of this script
set @gap = 30;             -- threshold in days; a longer pause between check-in days splits a period
set dateformat dmy;
--------------------------------------------------------------- #temp1 --------------------------------------------------
-- One row per employee per calendar day with at least one check-in for a team flagged productive.
create table #temp1 (EmployeeID int, starttime date);
insert into #temp1 (EmployeeID, starttime)
select distinct
    ck.Employee_id,
    cast(ck.starttime as date)  -- only the day matters, drop the time part
from SERVER1.DB1.dbo.checkins ck
inner join SERVER1.DB1.dbo.Team t on ck.team_id = t.id
where t.productive = 1;
--------------------------------------------------------------- #temp2 --------------------------------------------------
-- ROWNR 1        : ENDOFCHECKIN is NULL, FIRSTCHECKIN is the employee's first known check-in day.
-- ROWNR 2 and up : one row per gap longer than @gap days, ordered by date;
--                  ENDOFCHECKIN is the day after the last check-in before the gap,
--                  FIRSTCHECKIN is the first check-in day after the gap.
create table #temp2 (ROWNR int, Employeeid int, ENDOFCHECKIN datetime, FIRSTCHECKIN datetime);
insert into #temp2 (ROWNR, Employeeid, ENDOFCHECKIN, FIRSTCHECKIN)
select
    row_number() over (partition by g.EmployeeID order by g.prev_day) + 1 as ROWNR,
    g.EmployeeID,
    dateadd(day, 1, g.prev_day) as ENDOFCHECKIN,
    g.next_day as FIRSTCHECKIN
from
(
    -- Pair every check-in day with the next check-in day of the same employee.
    -- #temp1 already holds distinct (employee, day) pairs, so no further de-duplication is needed.
    select
        a.EmployeeID,
        a.starttime as prev_day,
        (
            select min(b.starttime)
            from #temp1 as b
            where b.starttime > a.starttime
              and b.EmployeeID = a.EmployeeID
        ) as next_day
    from #temp1 as a
) as g
where datediff(day, g.prev_day, g.next_day) > @gap
union all -- add first known date for Employee (ROWNR 1 never collides with the gap rows, which start at 2)
select
    1 as ROWNR,
    ct.EmployeeID,
    NULL,
    min(ct.starttime)
from #temp1 ct
group by ct.EmployeeID;
--------------------------------------------------------------- #temp3 --------------------------------------------------
-- Copies #temp2 and appends one closing row per employee:
-- ROWNR = number of that employee's #temp2 rows + 1,
-- ENDOFCHECKIN = the day after the employee's last check-in day in #temp1, STARTOFCHECKIN = NULL.
create table #temp3 (ROWNR int, Employeeid int, ENDOFCHECKIN datetime, STARTOFCHECKIN datetime);
insert into #temp3 (ROWNR, Employeeid, ENDOFCHECKIN, STARTOFCHECKIN)
select
    t2.ROWNR,
    t2.Employeeid,
    t2.ENDOFCHECKIN,
    t2.FIRSTCHECKIN
from #temp2 t2
union all -- add last known date for Employee (its ROWNR is one past the employee's existing rows, so no duplicates arise)
select
    count(*) + 1 as ROWNR,  -- rows are already grouped per employee, so this counts that employee's #temp2 rows
    ct.Employeeid,
    (select dateadd(day, 1, max(c.starttime)) from #temp1 c where c.EmployeeID = ct.Employeeid),
    NULL
from #temp2 ct
group by ct.Employeeid;
---------------------------------------finally check our data-------------------------------------------------
-- Pairs every #temp3 row (a1, supplies the period start) with the next row of the same employee
-- (b1, supplies the period end) and derives year / month / year-month columns for both ends.
-- Column prefixes are Dutch: Jaar = year, Maand = month, JaarMaand = year * 100 + month.
select
    a1.Employeeid,
    a1.STARTOFCHECKIN as STARTOFCHECKIN,
    ENDOFCHECKIN = eff.ENDOFCHECKIN,
    year(a1.STARTOFCHECKIN) as JaarSTARTOFCHECKIN,
    JaarENDOFCHECKIN = year(eff.ENDOFCHECKIN),
    month(a1.STARTOFCHECKIN) as MaandSTARTOFCHECKIN,
    MaandENDOFCHECKIN = month(eff.ENDOFCHECKIN),
    (year(a1.STARTOFCHECKIN) * 100) + month(a1.STARTOFCHECKIN) as JaarMaandSTARTOFCHECKIN,
    -- Year and month are both taken from the same effective end date (the earlier version mixed
    -- the year of a1.ENDOFCHECKIN with the month of a1.STARTOFCHECKIN in one branch).
    JaarMaandENDOFCHECKIN = (year(eff.ENDOFCHECKIN) * 100) + month(eff.ENDOFCHECKIN),
    datediff(M, a1.STARTOFCHECKIN, b1.ENDOFCHECKIN) as MONTHSCHECKEDIN
from #temp3 a1
full outer join #temp3 b1
    on a1.ROWNR = b1.ROWNR - 1
   and a1.Employeeid = b1.Employeeid
-- Effective end of the period, computed once: fall back to a1's own ENDOFCHECKIN when the next
-- row's end does not lie after this row's start.
cross apply (
    select ENDOFCHECKIN = case
                              when b1.ENDOFCHECKIN <= a1.STARTOFCHECKIN then a1.ENDOFCHECKIN
                              else b1.ENDOFCHECKIN
                          end
) eff
-- Drop pairings that have neither a period start nor a period end.
where not (a1.STARTOFCHECKIN is null and b1.ENDOFCHECKIN is null)
order by a1.Employeeid, a1.STARTOFCHECKIN;

Resources