Moving Median, Mode in T-SQL

Moving Median, Mode in T-SQL - sql-server

I am using SQL Server 2012 and I know it is quite simple to calculate moving averages.
But what I need is to get the mode and the median for a defined window frame like so (with a window of 2 preceding to current row; month unique):
MONTH | CODE | MEDIAN | MODE
1 0 0 0
2 3 1.5 0
3 2 2 0
4 2 2 2
5 2 2 2
6 5 2 2
7 3 3 2
If several values qualify as mode, than pick the first.

I commented my code thoroughly. Read my comments on my Mode calculations and let me know it needs tweaking. Overall, it's a relatively simple query. It just has a lot of ugly subqueries and it has a lot of comments. Check it out:
DECLARE #Table TABLE ([Month] INT,[Code] INT);
INSERT INTO #Table
VALUES (1,0),
(2,3),
(3,2),
(4,2), --Try commenting this out to test my special mode thingymajig
(5,2),
(6,5),
(7,3);
WITH CTE
AS
(
SELECT ROW_NUMBER() OVER (ORDER BY [Month]) row_num,
[Month],
CAST(Code AS FLOAT) Code
FROM #Table
)
SELECT [Month],
Code,
ISNULL((
SELECT CASE
--When there is only one previous value at row_num = 2, find Mean of first two codes
WHEN A.row_num = 2 THEN (LAG(B.code,1) OVER (ORDER BY [Code]) + B.Code)/2.0
--Else find middle code value of current and previous two rows
ELSE B.Code
END
FROM CTE B
--How subquery relates to outer query
WHERE B.row_num BETWEEN A.row_num - 2 AND A.row_num
ORDER BY B.[Code]
--Order by code and offset by 1 so don't select the lowest value, but fetch the one above the lowest value
OFFSET 1 ROW FETCH NEXT 1 ROW ONLY),
0) AS Median,
--I did mode a little different
--Instead of Avg(D.Code) you could list the values because with mode,
--If there's a tie with more than one of each number, you have multiple modes
--Instead of doing that, I simply return the mean of the tied modes
--When there's one, it doesn't change anything.
--If you were to delete the month 4, then your number of Codes 2 and number of Codes 3 would be the same in the last row.
--Proper mode would be 2,3. I instead average them out to be 2.5.
ISNULL((
SELECT AVG(D.Code)
FROM (
SELECT C.Code,
COUNT(*) cnt,
DENSE_RANK() OVER (ORDER BY COUNT(*) DESC) dnse_rank
FROM CTE C
WHERE C.row_num <= A.row_num
GROUP BY C.Code
HAVING COUNT(*) > 1) D
WHERE D.dnse_rank = 1),
0) AS Mode
FROM CTE A
Results:
Month Code Median Mode
----------- ---------------------- ---------------------- ----------------------
1 0 0 0
2 3 1.5 0
3 2 2 0
4 2 2 2
5 2 2 2
6 5 2 2
7 3 3 2

If I understood your requirements correctly, your source table contains MONTH and CODE columns, and you want to calculate MEDIAN and MODE.
The query below calculates MEDIAN and MODE with moving window <= than 3 month ("2 preceding to current row") and returns the results matching your example.
-----------------------------------------------------
--Demo data
-----------------------------------------------------
CREATE TABLE #Data(
[Month] INT NOT NULL,
[Code] INT NOT NULL,
CONSTRAINT [PK_Data] PRIMARY KEY CLUSTERED
(
[Month] ASC
));
INSERT #Data
([Month],[Code])
VALUES
(1,0),
(2,3),
(3,2),
(4,2),
(5,2),
(6,5),
(7,3);
-----------------------------------------------------
--Query
-----------------------------------------------------
DECLARE #PrecedingRowsLimit INT = 2;
WITH [MPos] AS
(
SELECT [R].[Month]
, [RB].[Month] AS [SubId]
, [RB].[Code]
, ROW_NUMBER() OVER(PARTITION BY [R].[Month] ORDER BY [RB].[Code]) AS [RowNumberInPartition]
, CASE
WHEN [R].[Count] % 2 = 1 THEN ([R].[Count] + 1) / 2
ELSE NULL
END AS [MedianPosition]
, CASE
WHEN [R].[Count] % 2 = 0 THEN [R].[Count] / 2
ELSE NULL
END AS [MedianPosition1]
, CASE
WHEN [R].[Count] % 2 = 0 THEN [R].[Count] / 2 + 1
ELSE NULL
END AS [MedianPosition2]
FROM
(
SELECT [RC].[Month]
, [RC].[RowNumber]
, CASE WHEN [RC].[Count] > #PrecedingRowsLimit + 1 THEN #PrecedingRowsLimit + 1 ELSE [RC].[Count] END AS [Count]
FROM
(
SELECT [Month]
, ROW_NUMBER() OVER(ORDER BY [Month]) AS [RowNumber]
, ROW_NUMBER() OVER(ORDER BY [Month]) AS [Count]
FROM #Data
) [RC]
) [R]
INNER JOIN #Data [RB]
ON [R].[Month] >= [RB].[Month]
AND [RB].[Month] >= [R].[RowNumber] - #PrecedingRowsLimit
)
SELECT DISTINCT [M].[Month]
, [ORIG].[Code]
, COALESCE([ME].[Code],([M1].[Code] + [M2].[Code]) / 2.0) AS [Median]
, [MOD].[Mode]
FROM [MPos] [M]
LEFT JOIN [MPOS] [ME]
ON [M].[Month] = [ME].[Month]
AND [M].[MedianPosition] = [ME].[RowNumberInPartition]
LEFT JOIN [MPOS] [M1]
ON [M].[Month] = [M1].[Month]
AND [M].[MedianPosition1] = [M1].[RowNumberInPartition]
LEFT JOIN [MPOS] [M2]
ON [M].[Month] = [M2].[Month]
AND [M].[MedianPosition2] = [M2].[RowNumberInPartition]
INNER JOIN
(
SELECT [MG].[Month]
, FIRST_VALUE([MG].[Code]) OVER (PARTITION BY [MG].[Month] ORDER BY [MG].[Count] DESC , [MG].[SubId] ASC) AS [Mode]
FROM
(
SELECT [Month] , MIN([SubId]) AS [SubId], [Code] , COUNT(1) AS [Count]
FROM [MPOS]
GROUP BY [Month] , [Code]
) [MG]
) [MOD]
ON [M].[Month] = [MOD].[Month]
INNER JOIN #Data [ORIG]
ON [ORIG].[Month] = [M].[Month]
ORDER BY [M].[Month];

Related

How to Sum (MAX values) from different value groups in same column SQL Server

I have a table like this:
Date
Consec_Days
2015-01-01
1
2015-01-03
1
2015-01-06
1
2015-01-07
2
2015-01-09
1
2015-01-12
1
2015-01-13
2
2015-01-14
3
2015-01-17
1
I need to Sum the max value (days) for each of the consecutive groupings where Consec_Days are > 1. So the correct result would be 5 days.

This is a type of gaps-and-islands problem.
There are many solutions, here is one simple one
Get the start points of each group using LAG
Calculate a grouping ID using a windowed conditional count
Group by that ID and take the highest sum
WITH StartPoints AS (
SELECT *,
IsStart = CASE WHEN LAG(Consec_Days) OVER (ORDER BY Date) = 1 THEN 1 END
FROM YourTable t
),
Groupings AS (
SELECT *,
GroupId = COUNT(IsStart) OVER (ORDER BY Date)
FROM StartPoints
WHERE Consec_Days > 1
)
SELECT TOP (1)
SUM(Consec_Days)
FROM Groupings
GROUP BY
GroupId
ORDER BY
SUM(Consec_Days) DESC;
db<>fiddle

with cte as (
select Consec_Days,
coalesce(lead(Consec_Days) over (order by Date), 1) as next
from YourTable
)
select sum(Consec_Days)
from cte
where Consec_Days <> 1 and next = 1
db<>fiddle

Group records based on time interval starting from timestamp of first record in each group

Struggling with this; need to group records within a specific time interval starting from the first timestamp (FREEZE_TIME) - but the first record outside the first group is the starting point for the time interval for the next group and so on. Expected result, THAW_COUNT, is the count of all groups for a PARENT_SAMPLE_ID. So for table:
SAMPLE_ID
FREEZE_TIME
PARENT_SAMPLE_ID
1
null
null
2
2015-11-27 10:23:10
1
3
2015-11-27 10:59:23
1
4
2015-11-27 11:05:43
1
5
2015-11-27 12:53:48
1
6
2015-11-27 13:42:25
1
I would like to get a result of:
PARENT_SAMPLE_ID
THAW_COUNT
1
2
So sample_id:s 2,3 and 4 should be in the same group and sample id:s 5 and 6 are in the next group.
I have tried something like:
with SampleList as
(
select PARENT_SAMPLE_ID, FREEZE_TIME,
ROW_NUMBER() OVER (partition by PARENT_SAMPLE_ID order by FREEZE_TIME asc) RN
from
SAMPLE
)
,
FirstSample as
(
select PARENT_SAMPLE_ID, FREEZE_TIME
from SampleList
where RN = 1
)
,
SelectedSample as
(
select
s.PARENT_SAMPLE_ID,
ABS(DATEDIFF(MINUTE, s.FREEZE_TIME, sFirst.FREEZE_TIME))/60 DiffToFirst
from SampleList s
inner join FirstSample sFirst ON s.PARENT_SAMPLE_ID = sFirst.PARENT_SAMPLE_ID
group by s.PARENT_SAMPLE_ID, ABS(DATEDIFF(MINUTE, s.FREEZE_TIME, sFirst.FREEZE_TIME))/60
)
select PARENT_SAMPLE_ID, count(*) THAW_COUNT
from SelectedSample
group by PARENT_SAMPLE_ID
But this will return a THAW_COUNT of 3 as sampleId:s 5 and 6 will be in different groups because the grouping is based on hour intervals from freeze time of sampleId 2 only. How do I get the grouping for group 2 to start from the first record outside the first group (sampleId 5) and so on?

This can be treated as a gaps and islands problem. Using some windows functions to check counts and using LAG to look at the "previous" row we can solve this. If you have multiple values for SAMPLE_ID you will want to add some partitioning.
create table #Something
(
SAMPLE_ID int
, FREEZE_TIME datetime
, PARENT_SAMPLE_ID int
)
insert #Something
select 1, null, null union all
select 2, '2015-11-27 10:23:10', 1 union all
select 3, '2015-11-27 10:59:23', 1 union all
select 4, '2015-11-27 11:05:43', 1 union all
select 5, '2015-11-27 12:53:48', 1 union all
select 6, '2015-11-27 13:42:25', 1;
with MyGroups as
(
select *
, GroupNum = count(IsNewGroup) over (order by FREEZE_TIME rows unbounded preceding)
from
(
select *
, IsNewGroup = case when LAG(FREEZE_TIME, 1, '') over(order by FREEZE_TIME) < dateadd(hour, -1, FREEZE_TIME) then 1 end
from #Something
) x
)
select coalesce(PARENT_SAMPLE_ID, SAMPLE_ID)
, count(distinct GroupNum)
from MyGroups
group by coalesce(PARENT_SAMPLE_ID, SAMPLE_ID)
drop table #Something

Transact-SQL - number rows until condition met

I'm trying to generate the numbers in the "x" column considering the values in field "eq", in a way that it should assign a number for every record until it meets the value "1", and the next row should reset and start counting again. I've tried with row_number, but the problem is that I only have ones and zeros in the column I need to evaluate, and the cases I've seen using row_number were using growing values in a column. Also tried with rank, but I haven't managed to make it work.
nInd Fecha Tipo #Inicio #contador_I #Final #contador_F eq x
1 18/03/2002 I 18/03/2002 1 null null 0 1
2 20/07/2002 F 18/03/2002 1 20/07/2002 1 1 2
3 19/08/2002 I 19/08/2002 2 20/07/2002 1 0 1
4 21/12/2002 F 19/08/2002 2 21/12/2002 2 1 2
5 17/03/2003 I 17/03/2003 3 21/12/2002 2 0 1
6 01/04/2003 I 17/03/2003 4 21/12/2002 2 0 2
7 07/04/2003 I 17/03/2003 5 21/12/2002 2 0 3
8 02/06/2003 F 17/03/2003 5 02/06/2003 3 0 4
9 31/07/2003 F 17/03/2003 5 31/07/2003 4 0 5
10 31/08/2003 F 17/03/2003 5 31/08/2003 5 1 6
11 01/09/2005 I 01/09/2005 6 31/08/2003 5 0 1
12 05/09/2005 I 01/09/2005 7 31/08/2003 5 0 2
13 31/12/2005 F 01/09/2005 7 31/12/2005 6 0 3
14 14/01/2006 F 01/09/2005 7 14/01/2006 7 1 4

There is another solution available:
select
nind, eq, row_number() over (partition by s order by s)
from (
select
nind, eq, coalesce((
select sum(eq) +1 from mytable pre where pre.nInd < mytable.nInd)
,1) s --this is the sum of eq!
from mytable) g
The inner subquery creates groups sequentially for each occurrence of 1 in eq. Then we can use row_number() over partition to get our counter.
Here is an example using Sql Server

I have two answers here. One is based off of the ROW_NUMBER() and the other is based off of what appears to be your index (nInd). I wasn't sure if there would be a gap in your index so I made the ROW_NUMBER() as well.
My table format was as follows -
myIndex int identity(1,1) NOT NULL
number int NOT NULL
First one is ROW_NUMBER()...
WITH rn AS (SELECT *, ROW_NUMBER() OVER (ORDER BY myIndex) AS rn, COUNT(*) AS max
FROM counting c GROUP BY c.myIndex, c.number)
,cte (myIndex, number, level, row) AS (
SELECT r.myIndex, r.number, 1, r.rn + 1 FROM rn r WHERE r.rn = 1
UNION ALL
SELECT r1.myIndex, r1.number,
CASE WHEN r1.number = 0 AND r2.number = 1 THEN 1
ELSE c.level + 1
END,
row + 1
FROM cte c
JOIN rn r1
ON c.row = r1.rn
JOIN rn r2
ON c.row - 1 = r2.rn
)
SELECT c.myIndex, c.number, c.level FROM cte c OPTION (MAXRECURSION 0);
Now the index...
WITH cte (myIndex, number, level) AS (
SELECT c.myIndex + 1, c.number, 1 FROM counting c WHERE c.myIndex = 1
UNION ALL
SELECT c1.myIndex + 1, c1.number,
CASE WHEN c1.number = 0 AND c2.number = 1 THEN 1
ELSE c.level + 1
END
FROM cte c
JOIN counting c1
ON c.myIndex = c1.myIndex
JOIN counting c2
ON c.myIndex - 1 = c2.myIndex
)
SELECT c.myIndex - 1 AS myIndex, c.number, c.level FROM cte c OPTION (MAXRECURSION 0);

The answer that I have now is via using
Cursor
I know if there is another solution without cursor it will be better for performance aspects
here is a quick demo of my solution:
-- Create DBTest
use master
Go
Create Database DBTest
Go
use DBTest
GO
-- Create table
Create table Tabletest
(nInd int , eq int)
Go
-- insert dummy data
insert into Tabletest (nInd,eq)
values (1,0),
(2,1),
(3,0),
(4,1),
(5,0),
(6,0),
(7,0),
(8,0),
(9,1),
(8,0),
(9,1)
Create table #Tabletest (nInd int ,eq int ,x int )
go
DECLARE #nInd int , #eq int , #x int
set #x = 1
DECLARE db_cursor CURSOR FOR
SELECT nInd , eq
FROM Tabletest
order by nInd
OPEN db_cursor
FETCH NEXT FROM db_cursor INTO #nInd , #eq
WHILE ##FETCH_STATUS = 0
BEGIN
if (#eq = 0)
begin
insert into #Tabletest (nInd ,eq ,x) values (#nInd , #eq , #x)
set #x = #x +1
end
else if (#eq = 1)
begin
insert into #Tabletest (nInd ,eq ,x) values (#nInd , #eq , #x)
set #x = 1
end
FETCH NEXT FROM db_cursor INTO #nInd , #eq
END
CLOSE db_cursor
DEALLOCATE db_cursor
select * from #Tabletest
The end result set will be as following:
Hope it helps.

Looking at this a slightly different way (which might not be true, but eliminates the need for cursors of recursive CTEs), it looks like you building ordered groups within your dataset. So, start by finding those groups, then determining the ordering of each of them.
The real key is to determine the rules to find the correcting grouping. Based on your description and comments, I'm guessing the grouping is from the start (ordered by the nInd column) ending at each row with and eq value of 1, so you can do something like:
;with ends(nInd, ord) as (
--Find the ending row for each set
SELECT nInd, row_number() over(order by nInd)
FROM mytable
WHERE eq=1
), ranges(sInd, eInd) as (
--Find the previous ending row for each ending row, forming a range for the group
SELECT coalesce(s.nInd,0), e.nInd
FROM ends s
right join ends e on s.ord=e.ord-1
)
Then, using these group ranges, you can find the final ordering of each:
select t.nInd, t.Fecha, t.eq
,[x] = row_number() over(partition by sInd order by nInd)
from ranges r
join mytable t on r.sInd < t.nInd
and t.nInd <= r.eInd
order by t.nInd

Find gaps in number range spanning multiple columns

I'm trying to identify the gaps in a range of numbers (SQL Server). My scenario is below...
ID Start End
1 1 4
2 1 6
3 2 4
4 8 10
5 13 14
Visual
-------------------------------
1-2-3-4
1-2-3-4-5-6
2-3-4
- -8-9-10
- - -13-14
The result of this could be something along the lines of:
Table
-------------------------------
ID Start End Gap
4 8 10 -1
5 13 14 -2
Ultimately, I want to have the gap ranges, but I should be able to figure that out from above...
Missing
7
11-12
I came up with solutions that are either too slow or don't account for the overlap in ranges (ex ID 2)
CREATE TABLE #Docs (
[Rank] INT, --DENSE_RANK () OVER(ORDER BY BegProd)
ControlNumber BIGINT,
BegProd INT,
EndProd INT
)
SELECT
T1.ControlNumber,
T1.BegProd,
T1.EndProd,
MAX(T2.EndProd) AS [PreviousEndProd],
[Gap] = T1.BegProd - MAX(T2.EndProd) - 1
FROM #Docs T1
INNER JOIN #Docs T2
ON T1.[Rank] = T2.[Rank] + 1
AND T1.EndProd > T2.EndProd
GROUP BY T1.ControlNumber, T1.BegProd, T1.EndProd
HAVING T1.BegProd - MAX(T2.EndProd) > 1
There is over 2 million rows in this table and a range span of 1 to 1 billion
EDIT
Fixed 'Missing' table.
The gap column indicates how much of a gap there is before that start number. (ex missing #7 is 1 number)

Try this:
create table #docs(id int, start int, [end] int)
insert #docs values(1,1,4),(2,1,6),(3,2,4),(4,8,10),(5,13,14)
;with a as
(
select start, dense_rank() over (order by start) rn
from #docs t where not exists (select 1 from #docs where t.start > start and t.start < [end])
group by start
), b as
(
select [end], dense_rank() over (order by [end]) rn
from #docs t where not exists (select 1 from #docs where t.[end] > start and t.[end] < [end])
group by [end]
)
select
case when a.[start]= b.[end]+2 then cast(a.start-1 as varchar(21))
else cast(b.[end]+1 as varchar(10)) +'-' + cast(a.start - 1 as varchar(10)) end missing
from a join b on a.rn - 1 = b.rn
and a.[start] <> b.[end] + 1
Result:
Missing
7
11-12

Reporting on data when data is missing (ie. how to report zero activities for a customer on a given week)

I want to create a report which aggregates the number of activities per customer per week.
If there has been no activites on that customer for a given week, 0 should be displayed (i.e week 3 and 4 in the sample below)
CUSTOMER | #ACTIVITIES | WEEKNUMBER
A | 4 | 1
A | 2 | 2
A | 0 | 3
A | 0 | 4
A | 1 | 5
B ...
C ...
The problem is that if there are no activities there is no data to report on and therefor week 3 and 4 in the sample below is not in the report.
What is the "best" way to solve this?

Try this:
DECLARE #YourTable table (CUSTOMER char(1), ACTIVITIES int, WEEKNUMBER int)
INSERT #YourTable VALUES ('A' , 4 , 1)
INSERT #YourTable VALUES ('A' , 2 , 2)
INSERT #YourTable VALUES ('A' , 0 , 3)
INSERT #YourTable VALUES ('A' , 0 , 4)
INSERT #YourTable VALUES ('A' , 1 , 5)
INSERT #YourTable VALUES ('B' , 5 , 3)
INSERT #YourTable VALUES ('C' , 2 , 4)
DECLARE #StartNumber int
,#EndNumber int
SELECT #StartNumber=1
,#EndNumber=5
;WITH AllNumbers AS
(
SELECT #StartNumber AS Number
UNION ALL
SELECT Number+1
FROM AllNumbers
WHERE Number<#EndNumber
)
, AllCustomers AS
(
SELECT DISTINCT CUSTOMER FROM #YourTable
)
SELECT
n.Number AS WEEKNUMBER, c.CUSTOMER, CASE WHEN y.Customer IS NULL THEN 0 ELSE y.ACTIVITIES END AS ACTIVITIES
FROM AllNumbers n
CROSS JOIN AllCustomers c
LEFT OUTER JOIN #YourTable y ON n.Number=y.WEEKNUMBER AND c.CUSTOMER=y.CUSTOMER
--OPTION (MAXRECURSION 500)
OUTPUT:
WEEKNUMBER CUSTOMER ACTIVITIES
----------- -------- -----------
1 A 4
1 B 0
1 C 0
2 A 2
2 B 0
2 C 0
3 A 0
3 B 5
3 C 0
4 A 0
4 B 0
4 C 2
5 A 1
5 B 0
5 C 0
(15 row(s) affected)
I use a CTE to build a Numbers table, but you could build a permanent one look at this question: What is the best way to create and populate a numbers table?. You could Write the Query without a CTE (same results as above):
SELECT
n.Number AS WEEKNUMBER, c.CUSTOMER, CASE WHEN y.Customer IS NULL THEN 0 ELSE y.ACTIVITIES END AS ACTIVITIES
FROM Numbers n
CROSS JOIN (SELECT DISTINCT
CUSTOMER
FROM #YourTable
) c
LEFT OUTER JOIN #YourTable y ON n.Number=y.WEEKNUMBER AND c.CUSTOMER=y.CUSTOMER
WHERE n.Number>=1 AND n.Number<=5
ORDER BY n.Number,c.CUSTOMER

Keep a table of time periods separately, and then outer left join the activities to it.
Like:
select *
from ReportingPeriod as p
left join Activities as a on a.ReportingPeriodId = p.ReportingPeriodId;

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

Moving Median, Mode in T-SQL - sql-server

Related

How to Sum (MAX values) from different value groups in same column SQL Server

Group records based on time interval starting from timestamp of first record in each group

Transact-SQL - number rows until condition met

Find gaps in number range spanning multiple columns

Reporting on data when data is missing (ie. how to report zero activities for a customer on a given week)

Categories

Resources