select data, grouped as Histogram - sql-server

I have data in this format:
CREATE TABLE data(y int)
INSERT INTO data VALUES ((1))
INSERT INTO data VALUES ((55555))
INSERT INTO data VALUES ((55555))
INSERT INTO data VALUES ((99999))
I want to create a histogram, for to get a rough overview, of how my data is distributed. I am thinking of this format as output:
lowerBoundary upperBoundary y
------------- ------------- -----------
0 9999 1
10000 19999 0
20000 29999 0
30000 39999 0
40000 49999 0
50000 59999 2
60000 69999 0
70000 79999 0
80000 89999 0
90000 99999 1

You will have to create a table of numbers, so that the 0-rows will be displayed correctly. Then you can calculate lower and upper boundary of each "group".
Example SQL:
SELECT lowerBoundary, upperBoundary, COUNT(d.y) AS y
FROM (
SELECT n*10000 AS lowerBoundary, (n+1)*10000-1 AS upperBoundary
FROM (
-- Selects possible groups. Make this big enough for your data.
SELECT ones.n + 10*tens.n + 100*hundreds.n AS n
FROM (VALUES(0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) ones(n),
(VALUES(0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) tens(n),
(VALUES(0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) hundreds(n)
) numbersTable
) boundaries
-- join with data
LEFT JOIN data d
ON d.y BETWEEN lowerBoundary AND upperBoundary
-- avoid trailing '0' rows
WHERE lowerBoundary <= (SELECT MAX(d.y) FROM data d)
GROUP BY lowerBoundary, upperBoundary
ORDER BY 1
Click here to run this skript at SQL-Fiddle

Another option...
I use a TVF to generate dynamic ranges. Being a single-statement function, it is extremely fast. Furthermore, if you can't use a UDF, the logic is easily ported into a cte or sub-query.
Select RetVal1
,RetVaL2
,y = sum(case when y is null then 0 else 1 end)
From [dbo].[udf-Range-Number-Span](0,100000,10000) A
Left Join Data B on y>=RetVal1 and y<RetVal2
Group By RetVal1,RetVal2
Returns
RetVal1 RetVaL2 y
0.00 10000.00 1
10000.00 20000.00 0
20000.00 30000.00 0
30000.00 40000.00 0
40000.00 50000.00 0
50000.00 60000.00 2
60000.00 70000.00 0
70000.00 80000.00 0
80000.00 90000.00 0
90000.00 100000.00 1
The UDF if needed
CREATE FUNCTION [dbo].[udf-Range-Number-Span] (#R1 money,#R2 money,#Incr money)
Returns Table
Return (
with cte0(M) As (Select cast((#R2-#R1)/#Incr as int)),
cte1(N) As (Select 1 From (Values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) N(N)),
cte2(N) As (Select Top (Select M from cte0) Row_Number() over (Order By (Select NULL)) From cte1 a,cte1 b,cte1 c,cte1 d,cte1 e,cte1 f,cte1 g,cte1 h )
Select RetSeq=1,RetVal1=#R1,RetVal2=#R1+#Incr
Union All
Select N+1,(N*#Incr)+#R1,((N*#Incr)+#R1)+#Incr
From cte2,cte0
Where N<cte0.M
)
--Max 100 million observations
--Select * from [dbo].[udf-Range-Number-Span](1,4,.5)

Related

How to insert "empty" row extracting a month list?

I've this sp, which return a list of data, for each "month" (i.e. each row is a month). Somethings like that:
SELECT
*,
(CAST(t1.NumActivities AS DECIMAL) / t1.NumVisits) * 100 AS PercAccepted,
(CAST(t1.Accepted AS DECIMAL) / t1.Estimated) * 100 AS PercValue
FROM
(SELECT
MONTH(DateVisit) AS Month,
COUNT(*) AS NumVisits,
SUM(CASE WHEN DateActivity is not null THEN 1 ELSE 0 END) AS NumActivities,
SUM(Estimate) AS Estimated,
SUM(CASE WHEN DateActivity is not null THEN Estimate ELSE 0 END) AS Accepted
FROM [dbo].[Activities]
WHERE
DateVisit IS NOT NULL
AND (#year IS NULL OR YEAR(DateVisit) = #year)
AND (#clinicID IS NULL OR ClinicID = #clinicID)
GROUP BY MONTH(DateVisit)) t1
This is a result:
Month NumVisits NumActivities Estimated Accepted PercAccepted PercValue
1 5 1 13770.00 2520.00 20.00000000000 18.30065359477124
2 2 2 7900.00 7900.00 100.00000000000 100.00000000000000
3 1 0 2730.00 0.00 0.00000000000 0.00000000000000
8 1 1 3000.00 3000.00 100.00000000000 100.00000000000000
But as you can see, I could "miss" some Month (for example, here April "4" is missed).
Is it possible to insert, for the missing month/row, an empty (0) record? Such as:
Month NumVisits NumActivities Estimated Accepted PercAccepted PercValue
1 5 1 13770.00 2520.00 20.00000000000 18.30065359477124
2 2 2 7900.00 7900.00 100.00000000000 100.00000000000000
3 1 0 2730.00 0.00 0.00000000000 0.00000000000000
4 0 0 0 0 0 0
...
Here is a example with sample data:
CREATE TABLE #Report
(
Id INT,
Name nvarchar(max),
Percentage float
)
INSERT INTO #Report VALUES (1,'ONE',2.01)
INSERT INTO #Report VALUES (2,'TWO',3.01)
INSERT INTO #Report VALUES (5,'Five',5.01)
;WITH months(Month) AS
(
SELECT 1
UNION ALL
SELECT Month+1
FROM months
WHERE Month < 12
)
SELECT *
INTO #AllMonthsNumber
from months;
Your select query:
The left join will gives you the NULL for other months so just use ISNULL('ColumnName','String_to_replace')
\/\/\/\/
SELECT Month, ISNULL(Name,0), ISNULL(Percentage,0)
FROM AllMonthsNumber A
LEFT JOIN #Report B
ON A.Month = B.Id
EDIT:
Yes you can do it without creating AllMonthNumber Table:
You can use master..spt_values (found here) system table which contains the numbers so just with some where condition.
SELECT Number as Month, ISNULL(B.Name,0), ISNULL(Percentage,0)
FROM master..spt_values A
LEFT JOIN #Report B ON A.Number = B.Id
WHERE Type = 'P' AND number BETWEEN 1 AND 12

Performance issue with CTE SQL Server query

We have a table with a parent child relationship, that represents a deep tree structure.
We are using a view with a CTE to query the data but the performance is poor (see code and execution plan below).
Is there any way we can improve the performance?
WITH cte (ParentJobTypeId, Id) AS
(
SELECT
Id, Id
FROM
dbo.JobTypes
UNION ALL
SELECT
e.Id, cte.Id
FROM
cte
INNER JOIN
dbo.JobTypes AS e ON e.ParentJobTypeId = cte.ParentJobTypeId
)
SELECT
ISNULL(Id, 0) AS ParentJobTypeId,
ISNULL(ParentJobTypeId, 0) AS Id
FROM
cte
A quick example of using the range keys. As I mentioned before, hierarchies were 127K points and some sections where 15 levels deep
The cte Builds, let's assume the hier results will be will be stored in a table (indexed as well)
Declare #Table table(ID int,ParentID int,[Status] varchar(50))
Insert #Table values
(1,101,'Pending'),
(2,101,'Complete'),
(3,101,'Complete'),
(4,102,'Complete'),
(101,null,null),
(102,null,null)
;With cteOH (ID,ParentID,Lvl,Seq)
as (
Select ID,ParentID,Lvl=1,cast(Format(ID,'000000') + '/' as varchar(500)) from #Table where ParentID is null
Union All
Select h.ID,h.ParentID,cteOH.Lvl+1,Seq=cast(cteOH.Seq + Format(h.ID,'000000') + '/' as varchar(500)) From #Table h INNER JOIN cteOH ON h.ParentID = cteOH.ID
),
cteR1 as (Select ID,Seq,R1=Row_Number() over (Order by Seq) From cteOH),
cteR2 as (Select A.ID,R2 = max(B.R1) From cteOH A Join cteR1 B on (B.Seq Like A.Seq+'%') Group By A.ID)
Select B.R1
,C.R2
,A.Lvl
,A.ID
,A.ParentID
Into #TempHier
From cteOH A
Join cteR1 B on (A.ID=B.ID)
Join cteR2 C on (A.ID=C.ID)
Select * from #TempHier
Select H.R1
,H.R2
,H.Lvl
,H.ID
,H.ParentID
,Total = count(*)
,Complete = sum(case when D.Status = 'Complete' then 1 else 0 end)
,Pending = sum(case when D.Status = 'Pending' then 1 else 0 end)
,PctCmpl = format(sum(case when D.Status = 'Complete' then 1.0 else 0.0 end)/count(*),'##0.00%')
From #TempHier H
Join (Select _R1=B.R1,A.* From #Table A Join #TempHier B on A.ID=B.ID) D on D._R1 between H.R1 and H.R2
Group By H.R1
,H.R2
,H.Lvl
,H.ID
,H.ParentID
Order By 1
Returns the hier in a #Temp table for now. Notice the R1 and R2, I call these the range keys. Data (without recursion) can be selected and aggregated via these keys
R1 R2 Lvl ID ParentID
1 4 1 101 NULL
2 2 2 1 101
3 3 2 2 101
4 4 2 3 101
5 6 1 102 NULL
6 6 2 4 102
VERY SIMPLE EXAMPLE: Illustrates the rolling the data up the hier.
R1 R2 Lvl ID ParentID Total Complete Pending PctCmpl
1 4 1 101 NULL 4 2 1 50.00%
2 2 2 1 101 1 0 1 0.00%
3 3 2 2 101 1 1 0 100.00%
4 4 2 3 101 1 1 0 100.00%
5 6 1 102 NULL 2 1 0 50.00%
6 6 2 4 102 1 1 0 100.00%
The real beauty of the the range keys, is if you know an ID, you know where it exists (all descendants and ancestors).

Moving Median, Mode in T-SQL

I am using SQL Server 2012 and I know it is quite simple to calculate moving averages.
But what I need is to get the mode and the median for a defined window frame like so (with a window of 2 preceding to current row; month unique):
MONTH | CODE | MEDIAN | MODE
1 0 0 0
2 3 1.5 0
3 2 2 0
4 2 2 2
5 2 2 2
6 5 2 2
7 3 3 2
If several values qualify as mode, than pick the first.
I commented my code thoroughly. Read my comments on my Mode calculations and let me know it needs tweaking. Overall, it's a relatively simple query. It just has a lot of ugly subqueries and it has a lot of comments. Check it out:
DECLARE #Table TABLE ([Month] INT,[Code] INT);
INSERT INTO #Table
VALUES (1,0),
(2,3),
(3,2),
(4,2), --Try commenting this out to test my special mode thingymajig
(5,2),
(6,5),
(7,3);
WITH CTE
AS
(
SELECT ROW_NUMBER() OVER (ORDER BY [Month]) row_num,
[Month],
CAST(Code AS FLOAT) Code
FROM #Table
)
SELECT [Month],
Code,
ISNULL((
SELECT CASE
--When there is only one previous value at row_num = 2, find Mean of first two codes
WHEN A.row_num = 2 THEN (LAG(B.code,1) OVER (ORDER BY [Code]) + B.Code)/2.0
--Else find middle code value of current and previous two rows
ELSE B.Code
END
FROM CTE B
--How subquery relates to outer query
WHERE B.row_num BETWEEN A.row_num - 2 AND A.row_num
ORDER BY B.[Code]
--Order by code and offset by 1 so don't select the lowest value, but fetch the one above the lowest value
OFFSET 1 ROW FETCH NEXT 1 ROW ONLY),
0) AS Median,
--I did mode a little different
--Instead of Avg(D.Code) you could list the values because with mode,
--If there's a tie with more than one of each number, you have multiple modes
--Instead of doing that, I simply return the mean of the tied modes
--When there's one, it doesn't change anything.
--If you were to delete the month 4, then your number of Codes 2 and number of Codes 3 would be the same in the last row.
--Proper mode would be 2,3. I instead average them out to be 2.5.
ISNULL((
SELECT AVG(D.Code)
FROM (
SELECT C.Code,
COUNT(*) cnt,
DENSE_RANK() OVER (ORDER BY COUNT(*) DESC) dnse_rank
FROM CTE C
WHERE C.row_num <= A.row_num
GROUP BY C.Code
HAVING COUNT(*) > 1) D
WHERE D.dnse_rank = 1),
0) AS Mode
FROM CTE A
Results:
Month Code Median Mode
----------- ---------------------- ---------------------- ----------------------
1 0 0 0
2 3 1.5 0
3 2 2 0
4 2 2 2
5 2 2 2
6 5 2 2
7 3 3 2
If I understood your requirements correctly, your source table contains MONTH and CODE columns, and you want to calculate MEDIAN and MODE.
The query below calculates MEDIAN and MODE with moving window <= than 3 month ("2 preceding to current row") and returns the results matching your example.
-----------------------------------------------------
--Demo data
-----------------------------------------------------
CREATE TABLE #Data(
[Month] INT NOT NULL,
[Code] INT NOT NULL,
CONSTRAINT [PK_Data] PRIMARY KEY CLUSTERED
(
[Month] ASC
));
INSERT #Data
([Month],[Code])
VALUES
(1,0),
(2,3),
(3,2),
(4,2),
(5,2),
(6,5),
(7,3);
-----------------------------------------------------
--Query
-----------------------------------------------------
DECLARE #PrecedingRowsLimit INT = 2;
WITH [MPos] AS
(
SELECT [R].[Month]
, [RB].[Month] AS [SubId]
, [RB].[Code]
, ROW_NUMBER() OVER(PARTITION BY [R].[Month] ORDER BY [RB].[Code]) AS [RowNumberInPartition]
, CASE
WHEN [R].[Count] % 2 = 1 THEN ([R].[Count] + 1) / 2
ELSE NULL
END AS [MedianPosition]
, CASE
WHEN [R].[Count] % 2 = 0 THEN [R].[Count] / 2
ELSE NULL
END AS [MedianPosition1]
, CASE
WHEN [R].[Count] % 2 = 0 THEN [R].[Count] / 2 + 1
ELSE NULL
END AS [MedianPosition2]
FROM
(
SELECT [RC].[Month]
, [RC].[RowNumber]
, CASE WHEN [RC].[Count] > #PrecedingRowsLimit + 1 THEN #PrecedingRowsLimit + 1 ELSE [RC].[Count] END AS [Count]
FROM
(
SELECT [Month]
, ROW_NUMBER() OVER(ORDER BY [Month]) AS [RowNumber]
, ROW_NUMBER() OVER(ORDER BY [Month]) AS [Count]
FROM #Data
) [RC]
) [R]
INNER JOIN #Data [RB]
ON [R].[Month] >= [RB].[Month]
AND [RB].[Month] >= [R].[RowNumber] - #PrecedingRowsLimit
)
SELECT DISTINCT [M].[Month]
, [ORIG].[Code]
, COALESCE([ME].[Code],([M1].[Code] + [M2].[Code]) / 2.0) AS [Median]
, [MOD].[Mode]
FROM [MPos] [M]
LEFT JOIN [MPOS] [ME]
ON [M].[Month] = [ME].[Month]
AND [M].[MedianPosition] = [ME].[RowNumberInPartition]
LEFT JOIN [MPOS] [M1]
ON [M].[Month] = [M1].[Month]
AND [M].[MedianPosition1] = [M1].[RowNumberInPartition]
LEFT JOIN [MPOS] [M2]
ON [M].[Month] = [M2].[Month]
AND [M].[MedianPosition2] = [M2].[RowNumberInPartition]
INNER JOIN
(
SELECT [MG].[Month]
, FIRST_VALUE([MG].[Code]) OVER (PARTITION BY [MG].[Month] ORDER BY [MG].[Count] DESC , [MG].[SubId] ASC) AS [Mode]
FROM
(
SELECT [Month] , MIN([SubId]) AS [SubId], [Code] , COUNT(1) AS [Count]
FROM [MPOS]
GROUP BY [Month] , [Code]
) [MG]
) [MOD]
ON [M].[Month] = [MOD].[Month]
INNER JOIN #Data [ORIG]
ON [ORIG].[Month] = [M].[Month]
ORDER BY [M].[Month];

Find gaps in number range spanning multiple columns

I'm trying to identify the gaps in a range of numbers (SQL Server). My scenario is below...
ID Start End
1 1 4
2 1 6
3 2 4
4 8 10
5 13 14
Visual
-------------------------------
1-2-3-4
1-2-3-4-5-6
2-3-4
- -8-9-10
- - -13-14
The result of this could be something along the lines of:
Table
-------------------------------
ID Start End Gap
4 8 10 -1
5 13 14 -2
Ultimately, I want to have the gap ranges, but I should be able to figure that out from above...
Missing
7
11-12
I came up with solutions that are either too slow or don't account for the overlap in ranges (ex ID 2)
CREATE TABLE #Docs (
[Rank] INT, --DENSE_RANK () OVER(ORDER BY BegProd)
ControlNumber BIGINT,
BegProd INT,
EndProd INT
)
SELECT
T1.ControlNumber,
T1.BegProd,
T1.EndProd,
MAX(T2.EndProd) AS [PreviousEndProd],
[Gap] = T1.BegProd - MAX(T2.EndProd) - 1
FROM #Docs T1
INNER JOIN #Docs T2
ON T1.[Rank] = T2.[Rank] + 1
AND T1.EndProd > T2.EndProd
GROUP BY T1.ControlNumber, T1.BegProd, T1.EndProd
HAVING T1.BegProd - MAX(T2.EndProd) > 1
There is over 2 million rows in this table and a range span of 1 to 1 billion
EDIT
Fixed 'Missing' table.
The gap column indicates how much of a gap there is before that start number. (ex missing #7 is 1 number)
Try this:
create table #docs(id int, start int, [end] int)
insert #docs values(1,1,4),(2,1,6),(3,2,4),(4,8,10),(5,13,14)
;with a as
(
select start, dense_rank() over (order by start) rn
from #docs t where not exists (select 1 from #docs where t.start > start and t.start < [end])
group by start
), b as
(
select [end], dense_rank() over (order by [end]) rn
from #docs t where not exists (select 1 from #docs where t.[end] > start and t.[end] < [end])
group by [end]
)
select
case when a.[start]= b.[end]+2 then cast(a.start-1 as varchar(21))
else cast(b.[end]+1 as varchar(10)) +'-' + cast(a.start - 1 as varchar(10)) end missing
from a join b on a.rn - 1 = b.rn
and a.[start] <> b.[end] + 1
Result:
Missing
7
11-12

Reporting on data when data is missing (ie. how to report zero activities for a customer on a given week)

I want to create a report which aggregates the number of activities per customer per week.
If there has been no activites on that customer for a given week, 0 should be displayed (i.e week 3 and 4 in the sample below)
CUSTOMER | #ACTIVITIES | WEEKNUMBER
A | 4 | 1
A | 2 | 2
A | 0 | 3
A | 0 | 4
A | 1 | 5
B ...
C ...
The problem is that if there are no activities there is no data to report on and therefor week 3 and 4 in the sample below is not in the report.
What is the "best" way to solve this?
Try this:
DECLARE #YourTable table (CUSTOMER char(1), ACTIVITIES int, WEEKNUMBER int)
INSERT #YourTable VALUES ('A' , 4 , 1)
INSERT #YourTable VALUES ('A' , 2 , 2)
INSERT #YourTable VALUES ('A' , 0 , 3)
INSERT #YourTable VALUES ('A' , 0 , 4)
INSERT #YourTable VALUES ('A' , 1 , 5)
INSERT #YourTable VALUES ('B' , 5 , 3)
INSERT #YourTable VALUES ('C' , 2 , 4)
DECLARE #StartNumber int
,#EndNumber int
SELECT #StartNumber=1
,#EndNumber=5
;WITH AllNumbers AS
(
SELECT #StartNumber AS Number
UNION ALL
SELECT Number+1
FROM AllNumbers
WHERE Number<#EndNumber
)
, AllCustomers AS
(
SELECT DISTINCT CUSTOMER FROM #YourTable
)
SELECT
n.Number AS WEEKNUMBER, c.CUSTOMER, CASE WHEN y.Customer IS NULL THEN 0 ELSE y.ACTIVITIES END AS ACTIVITIES
FROM AllNumbers n
CROSS JOIN AllCustomers c
LEFT OUTER JOIN #YourTable y ON n.Number=y.WEEKNUMBER AND c.CUSTOMER=y.CUSTOMER
--OPTION (MAXRECURSION 500)
OUTPUT:
WEEKNUMBER CUSTOMER ACTIVITIES
----------- -------- -----------
1 A 4
1 B 0
1 C 0
2 A 2
2 B 0
2 C 0
3 A 0
3 B 5
3 C 0
4 A 0
4 B 0
4 C 2
5 A 1
5 B 0
5 C 0
(15 row(s) affected)
I use a CTE to build a Numbers table, but you could build a permanent one look at this question: What is the best way to create and populate a numbers table?. You could Write the Query without a CTE (same results as above):
SELECT
n.Number AS WEEKNUMBER, c.CUSTOMER, CASE WHEN y.Customer IS NULL THEN 0 ELSE y.ACTIVITIES END AS ACTIVITIES
FROM Numbers n
CROSS JOIN (SELECT DISTINCT
CUSTOMER
FROM #YourTable
) c
LEFT OUTER JOIN #YourTable y ON n.Number=y.WEEKNUMBER AND c.CUSTOMER=y.CUSTOMER
WHERE n.Number>=1 AND n.Number<=5
ORDER BY n.Number,c.CUSTOMER
Keep a table of time periods separately, and then outer left join the activities to it.
Like:
select *
from ReportingPeriod as p
left join Activities as a on a.ReportingPeriodId = p.ReportingPeriodId;

Resources