paging in ms sql - sql-server

I have this code
ALTER PROCEDURE [dbo].[Model_Core_BlogPost_GetLatestPaging]
    @PageSize INT,
    @CurrentPage INT
AS
BEGIN
    -- Returns one page of published (Status = 1), non-deleted blog posts,
    -- newest first, together with the author's blog name and image.
    --   @PageSize    : number of rows per page.
    --   @CurrentPage : zero-based page index (page 0 yields rows 1..@PageSize).
    DECLARE @PageStart INT, @PageEnd INT;

    SET @PageStart = @CurrentPage * @PageSize;
    SET @PageEnd = @PageStart + @PageSize;

    WITH C AS (
        SELECT
            e.blogpostid,
            e.PreviewText,
            e.Headline,
            e.URLHeadline,
            u.Blogname,
            u.imageurl AS ImageURL,
            e.CommentsCount,
            e.HitsCount,
            e.Created,
            -- blogpostid breaks ties between posts created at the same instant;
            -- without it a row can move between pages from one call to the next.
            -- NOTE(review): assumes blogpostid is unique - confirm.
            ROW_NUMBER() OVER (ORDER BY e.created DESC, e.blogpostid DESC) AS rownum
        FROM BlogPosts e
        INNER JOIN Users u ON e.BlogUserID = u.UserID
        WHERE e.[Status] = 1
          AND e.Deleteddate IS NULL
    )
    SELECT
        blogpostid,
        PreviewText,
        Headline,
        URLHeadline,
        Blogname,
        ImageURL,
        CommentsCount,
        HitsCount,
        Created,
        rownum
    FROM C
    WHERE rownum > @PageStart
      AND rownum <= @PageEnd;
END
I have issues when the #CurrentPage is a large number and I often get sql timeouts in my application.
Any ideas for a solution ?

As long as indexes are in place, I would suggest splitting this query into two separate steps.
First run the ranking function and filter on blogposts, insert the result into a temporary table, and afterwards join the temporary table with users, possibly by using an OPTION (LOOP JOIN) hint (the temp table will have very few rows compared to users, and a loop join is perfect for this situation).
This way your join will have far fewer rows to process. Also, are you sure the blogposts.created column has an index? ROW_NUMBER will perform ordering on this field.

Related

How to optimize the insert query from multiple tables?

I have 2 tables, Table 1 (temp table in SP) has around 400 records. Table 2 has around 30,550,284 records.
I need to run a loop on table 1 for each record and get the top 1 from table 2 based on a few conditions (where clause) and then order by modified date in decreasing order.
There is an index on the modified date.
declare #iPos int;
declare #iCount int;
select #iCount = count(*) from Table1;
set #iPos = 1;
declare #Table2 table(......)
declare #timestampLocal2 datetime
while (#iPos <= #iCount)
BEGIN
select #val1 = Col1, #timestampLocal = TimeStamp
from #Table1 where ID = #iPos
set #timestampLocal2 = DATEADD(HH,-96,#timestampLocal)
INSERT INTO #Temp3 ( .... ),....)
select top 1 r.LastModified, r.[Col2], r.Col3, #iPos
from Table2 (NOLOCK) r
where Col1 =#val1 and
r.LastModified <= #timestampLocal
and r.LastModified >= #timestampLocal2
and (r.Col2 is not null and r.Col3 is not null)
order by LastModified desc
SELECT #iPos = #iPos + 1;
END
This query is very slow.
I have also thought to archive table 2, But I want to keep that as the second option for now.
Do I really need to add an index on the columns which are involved in the where clause?
So my question is, in terms of performance is there a better way to do this?
I believe a CROSS APPLY or OUTER APPLY may do the trick. These can be thought of as being similar to INNER JOIN or LEFT JOIN, except that they allow you to reference a subquery having more complex conditions such as TOP 1 and ORDER BY. Ideal for cases like this.
-- INSERT INTO #Temp3 ( .... )
-- For each #Table1 row, return the most recently modified Table2 row with the
-- same Col1 whose LastModified lies within the 96 hours up to the row's
-- TimeStamp and whose Col2 and Col3 are both populated.
-- #Table1 rows without such a match are omitted (CROSS APPLY).
SELECT
    r.LastModified,
    r.[Col2],
    r.Col3,
    t1.ID
FROM #Table1 AS t1
CROSS APPLY (
    SELECT TOP 1
        t2.LastModified,
        t2.[Col2],
        t2.Col3
    FROM Table2 AS t2 -- Don't use (NOLOCK)
    WHERE t2.Col1 = t1.Col1   -- was "t.Col1": no alias "t" exists in this query
      AND t2.LastModified <= t1.[TimeStamp]
      AND t2.LastModified >= DATEADD(HH, -96, t1.[TimeStamp])
      AND t2.Col2 IS NOT NULL
      AND t2.Col3 IS NOT NULL
    ORDER BY t2.LastModified DESC
) AS r
For efficiency, I recommend an index on Table2(Col1,LastModified) or as an absolute minimum, an index on Table2(Col1).
I would strongly discourage the use of (NOLOCK) or 'READ UNCOMMITTED' in queries that update the database (like the insert into table3 above). While the query may appear to work most of the time, seemingly random occurrences of missing or duplicate rows may result.
Do you need to handle cases where no matching Table2 record is found? The above will quietly ignore such cases. Changing the CROSS APPLY to an OUTER APPLY together with logic to handle null r.xxx values could be what you need.

ROW_NUMBER in cross apply generating "incorrect" values based on exists clause

Here is the sql:
-- Schema
DECLARE #ModelItem TABLE (
ModelItemId UNIQUEIDENTIFIER,
MetamodelItemId UNIQUEIDENTIFIER
)
DECLARE #MetamodelItemAncestor TABLE (
MetamodelItemId UNIQUEIDENTIFIER,
ParentMetamodelItemId UNIQUEIDENTIFIER,
AncestorLevel INT
)
DECLARE #SolutionMetamodelItem TABLE (
MetamodelItemId UNIQUEIDENTIFIER,
SolutionId UNIQUEIDENTIFIER
)
INSERT INTO #ModelItem VALUES ('EC6AC6A9-684E-E611-8117-00155D026308', '2AB1F075-684E-E611-8117-00155D026308')
INSERT INTO #MetamodelItemAncestor
VALUES ('2AB1F075-684E-E611-8117-00155D026308', '2AB1F075-684E-E611-8117-00155D026308', 0),
('2AB1F075-684E-E611-8117-00155D026308', 'AA12E380-CA4D-E611-8117-00155D026308', 1)
INSERT INTO #SolutionMetamodelItem
VALUES ('2AB1F075-684E-E611-8117-00155D026308', 'f612a333-ca4d-e611-8117-00155d026308'),
('AA12E380-CA4D-E611-8117-00155D026308', 'fc160f3e-ca4d-e611-8117-00155d026308')
-- query
DECLARE #ModelItemId TABLE (EntityId UNIQUEIDENTIFIER)
DECLARE #SolutionId TABLE (EntityId UNIQUEIDENTIFIER)
INSERT INTO #ModelItemId
VALUES ('EC6AC6A9-684E-E611-8117-00155D026308')
INSERT INTO #SolutionId
VALUES ('f612a333-ca4d-e611-8117-00155d026308'), ('fc160f3e-ca4d-e611-8117-00155d026308')
SELECT mia.*
FROM (
SELECT M.EntityId AS ModelItemId, S.EntityId AS SolutionId
FROM #ModelItemId AS M
CROSS JOIN #SolutionId AS S
) AS m
CROSS APPLY (
SELECT
MI.ModelItemId,
OTA.ParentMetamodelItemId AS [MetamodelItemId],
ROW_NUMBER() OVER (PARTITION BY [MI].[ModelItemId] ORDER BY [OTA].[AncestorLevel] ASC) AS [AspectRank]
FROM #ModelItem AS MI
INNER JOIN #MetamodelItemAncestor AS OTA
ON MI.MetamodelItemId = OTA.MetamodelItemId
WHERE
MI.ModelItemId = m.ModelItemId
AND EXISTS (
SELECT 1
FROM #SolutionMetamodelItem AS MSMI
WHERE MSMI.MetamodelItemId = OTA.ParentMetamodelItemId
AND MSMI.SolutionId = m.SolutionId
)
) mia
SELECT mia.*
FROM #ModelItemId AS m
CROSS APPLY (
SELECT
MI.ModelItemId,
OTA.ParentMetamodelItemId AS [MetamodelItemId],
ROW_NUMBER() OVER (PARTITION BY [MI].[ModelItemId] ORDER BY [OTA].[AncestorLevel] ASC) AS [AspectRank]
FROM #ModelItem as MI
INNER JOIN #MetamodelItemAncestor AS OTA
ON MI.MetamodelItemId = OTA.MetamodelItemId
WHERE
MI.ModelItemId = m.EntityId
AND EXISTS (
SELECT 1
FROM #SolutionMetamodelItem MSMI
WHERE MSMI.MetamodelItemId = OTA.ParentMetamodelItemId
AND MSMI.SolutionId IN (SELECT s.EntityId FROM #SolutionId AS s)
)
) mia
Notice the AspectRank. In the second query it has correctly increased the value sequentially based on the partition.
Looking at the execution plan, for the first query it seems like the row_number (sequence project) is running concurrently with the scan of the #solution table, but I am still not fully sure why it has not increased the row number value, since there are duplicate items.
Could someone explain this? I need to use the first approach because the cross apply query is in fact a UDF with the ModelItemId and SolutionId as parameters.
I would assume the cross apply is executed separately for each of the rows in your outer query -> each of the rows returned is the 1st (and only) row.
Why do you need to have the row number inside the cross apply, instead of being in the outer query, if that's actually where your data is?

How to select Top % in T-SQL without using Top clause?

How to select Top 40% from a table without using the Top clause (or Top percent, the assignment is a little ambiguous) ? This question is for T-SQL, SQL Server 2008. I am not allowed to use Top for my assignment.
Thanks.
This is what I've tried but seems complicated. Isn't there an easier way ?
select top (convert (int, (select round (0.4*COUNT(*), 0) from MyTable))) * from MyTable
Try the NTILE function:
;WITH YourCTE AS
(
SELECT
(some columns),
percentile = NTILE(10) OVER(ORDER BY SomeColumn DESC)
FROM
dbo.YourTable
)
SELECT *
FROM YourCTE
WHERE percentile <= 4
The NTILE(10) OVER(....) creates 10 groups of percentages over your data - and thus, the top 40% are the groups no. 1, 2, 3, 4 of that result
Use NTILE
-- Sample data for the NTILE demonstration.
CREATE TABLE #temp (StudentID CHAR(3), Score INT)

INSERT INTO #temp (StudentID, Score)
VALUES
    ('S1', 75),
    ('S2', 83),
    ('S3', 91),
    ('S4', 83),
    ('S5', 93),
    ('S6', 75),
    ('S7', 83),
    ('S8', 91),
    ('S9', 83),
    ('S10', 93)

-- Split the rows into 10 equally sized buckets by ascending Score and keep
-- the first four buckets (40% of the rows when the row count is a multiple
-- of 10). StudentID is a tiebreaker so rows with equal scores always land in
-- the same bucket from run to run.
SELECT
    x.NtileValue,
    x.StudentID,
    x.Score
FROM (
    SELECT
        NTILE(10) OVER (ORDER BY Score, StudentID) AS NtileValue,
        StudentID,
        Score
    FROM #temp
) AS x
WHERE x.NtileValue <= 4
ORDER BY x.NtileValue
Interestingly enough, I blogged about NTILE today: Does anyone use the NTILE() windowing function?
A problem with the NTILE(10) answers given so far is that if the table has 15 rows they will return 8 rows (53%) rather than the correct number to make up 40% (6).
If the number of rows is not evenly divisible by number of buckets the extra rows all go into the first buckets rather than being evenly distributed.
This alternative (borrows SQL Menace's table) avoids that issue.
-- First 40% of #temp by ascending Score (row count rounded up), computed
-- from each row's position and the total row count instead of TOP or NTILE.
SELECT
    ranked.StudentID,
    ranked.Score
FROM (
    SELECT
        t.StudentID,
        t.Score,
        ROW_NUMBER() OVER (ORDER BY t.Score) AS position_in_order,
        COUNT(*) OVER () AS total_rows
    FROM #temp AS t
) AS ranked
WHERE ranked.position_in_order <= CEILING(0.4 * ranked.total_rows)
Using Top t-sql command:
select top 10 [Column_1],
[Column_2] from [Table]
order by [Column_1]
Using Paging method:
select
[Column_1],
[Column_2]
from
(Select ROW_NUMBER() Over (ORDER BY [Column_1]) AS Row,
[Column_1],
[Column_2]
FROM [Table]) as [alias]
WHERE (Row between 0 and 10)
This is finding the top 10 with order by [Column_1]...please note this is using [variable] method of documentation.
If you could provide column names and table names i could write much more beneficial t-sql, for example to find the top 40% you are going to need to do another sub-query to get count of all rows then do division, i'd likely do this as a query before i do the main query.
Calculate and set ROWCOUNT for whatever number of records.
Then execute your query for the limited set.
-- Return 40% of the rows of CTE (count truncated to a whole number) by
-- capping the session row count. No ORDER BY is given, so which rows are
-- returned is up to the engine.
DECLARE @rc AS INTEGER
SELECT @rc = COUNT(*) * 0.40 FROM CTE

IF @rc > 0
BEGIN
    SET ROWCOUNT @rc
    SELECT * FROM CTE
    -- Reset, otherwise the cap keeps applying to later statements in the session.
    SET ROWCOUNT 0
END
ELSE
    -- SET ROWCOUNT 0 means "no limit", so a computed 0 must not reach it:
    -- return an empty result with the same shape instead.
    SELECT * FROM CTE WHERE 1 = 0
ROWCOUNT is not deprecated yet - see http://msdn.microsoft.com/en-us/library/ms188774.aspx

SQL Server: Improve PROCEDURE without using CURSOR

I am looking for a way to write the below procedure without using a CURSOR or just to find a better performing query.
CREATE TABLE #OrderTransaction (OrderTransactionId int, ProductId int, Quantity int);
CREATE TABLE #Product (ProductId int, MediaTypeId int);
CREATE TABLE #OrderDelivery (OrderTransactionId int, MediaTypeId int);
INSERT INTO #Product (ProductId, MediaTypeId) VALUES (1,1);
INSERT INTO #Product (ProductId, MediaTypeId) VALUES (2,2);
INSERT INTO #OrderTransaction(OrderTransactionId, ProductId, Quantity) VALUES (1,1,1);
INSERT INTO #OrderTransaction(OrderTransactionId, ProductId, Quantity) VALUES (2,2,6);
DECLARE #OrderTransactionId int, #MediaTypeId int, #Quantity int;
DECLARE ordertran CURSOR FAST_FORWARD FOR
SELECT OT.OrderTransactionId, P.MediaTypeId, OT.Quantity
FROM #OrderTransaction OT WITH (NOLOCK)
INNER JOIN #Product P WITH (NOLOCK)
ON OT.ProductId = P.ProductId
OPEN ordertran;
FETCH NEXT FROM ordertran INTO #OrderTransactionId, #MediaTypeId, #Quantity;
WHILE ##FETCH_STATUS = 0
BEGIN
WHILE #Quantity > 0
BEGIN
INSERT INTO #OrderDelivery ([OrderTransactionId], [MediaTypeId])
VALUES (#OrderTransactionId, #MediaTypeId)
SELECT #Quantity = #Quantity - 1;
END
FETCH NEXT FROM ordertran INTO #OrderTransactionId, #MediaTypeId, #Quantity;
END
CLOSE ordertran;
DEALLOCATE ordertran;
SELECT * FROM #OrderTransaction
SELECT * FROM #Product
SELECT * FROM #OrderDelivery
DROP TABLE #OrderTransaction;
DROP TABLE #Product;
DROP TABLE #OrderDelivery;
Begin with a Numbers table that is large enough to handle the maximum order amount:
CREATE TABLE Numbers (
Num int NOT NULL PRIMARY KEY CLUSTERED
)
-- SQL 2000 version
INSERT Numbers VALUES (1)
SET NOCOUNT ON
GO
INSERT Numbers (Num) SELECT Num + (SELECT Max(Num) FROM Numbers) FROM Numbers
GO 15
-- SQL 2005 and up version
WITH
L0 AS (SELECT c = 1 UNION ALL SELECT 1),
L1 AS (SELECT c = 1 FROM L0 A, L0 B),
L2 AS (SELECT c = 1 FROM L1 A, L1 B),
L3 AS (SELECT c = 1 FROM L2 A, L2 B),
L4 AS (SELECT c = 1 FROM L3 A, L3 B),
L5 AS (SELECT c = 1 FROM L4 A, L4 B),
N AS (SELECT Num = ROW_NUMBER() OVER (ORDER BY c) FROM L5)
INSERT Numbers(Num)
SELECT Num FROM N
WHERE Num <= 32768;
Then, immediately after your INSERT statements:
INSERT #OrderDelivery (OrderTransactionId, MediaTypeId)
SELECT
OT.OrderTransactionId,
P.MediaTypeId
FROM
#OrderTransaction OT
INNER JOIN #Product P ON OT.ProductId = P.ProductId
INNER JOIN Numbers N ON N.Num BETWEEN 1 AND OT.Quantity
That should do it!
If for some reason you have qualms about putting a permanent Numbers table in your database (which I don't understand as it is a wonderful tool), then you can simply join to the CTE given instead of the table itself. In SQL 2000 you can create a temp table and use a loop, but I would advise against this strongly.
A Numbers table is highly recommended. There is no concern about some future change breaking it (the set of whole numbers won't change any time soon). Some people use a Numbers table with a million numbers in it, which is only around 4MB of storage.
To answer critics of the Numbers table: if the database design uses a numbers table, then that table won't need to change. It is like any other table in the database and can be relied on. You don't worry too much about queries against an Orders table failing because some day the table might not exist, so I don't see why there would be any similar concern about another table that is required and depended on.
UPDATE
In the time since writing this answer I have learned about the master.dbo.spt_values table which has a number column. When queried with where type='P' you get 0 - 255 in SQL 2000 and 0 - 8191 in SQL 2005 and up. (There are also potentially useful low and high columns.) You can cross join this table to itself a couple of times if necessary to get, even in SQL 2000, a bunch of rows very quickly.
The trick is to introduce a table of values (named, in the example below, MyTableOfIntegers) which contains all the integer values between 1 and (at least) some value (in the case at hand, that would be the biggest possible Quantity value from OrderTransaction table).
INSERT INTO #OrderDelivery ([OrderTransactionId], [MediaTypeId])
SELECT OT.OrderTransactionId, P.MediaTypeId
FROM #OrderTransaction OT WITH (NOLOCK)
INNER JOIN #Product P WITH (NOLOCK)
ON OT.ProductId = P.ProductId
JOIN MyTableOfIntegers I ON I.Num <= OT.Quantity
--WHERE some optional conditions
Essentially the extra JOIN on MyTableOfIntegers, produces as many duplicate rows as OT.Quantity, and that seems to be what the purpose of the cursor was: to insert that many duplicated rows in the OrderDelivery table.
I didn't check the rest of the logic with the temporary tables and all (I'm assuming these are temp tables for the purpose of checking the logic rather than being part of the process proper), but it seems that the above is the type of construct needed to express the needed logic in declarative fashion only, without any cursor or even any loop.
Here is a slight variation on the previous answers, that avoids a permanent numbers table (though I am not sure why people are so afraid of this construct), and allows you to build a run-time CTE that contains exactly the set of numbers you'll need to perform the correct number of inserts (by checking for the highest quantity). I commented out the CROSS JOIN in the initial CTE, but you can use it if your quantity for any given order can exceed the number of rows in sys.columns. Hopefully that is an unlikely scenario. Note that this is for SQL Server 2005 and up ... it is always useful to let us know which specific version(s) you are targeting.
-- Expand each order transaction into Quantity delivery rows by joining to a
-- run-time list of integers 1..MAX(Quantity) built from sys.columns.
DECLARE @numsNeeded INT;

-- COALESCE keeps TOP valid when #OrderTransaction is empty (MAX would be NULL).
SELECT @numsNeeded = COALESCE(MAX(Quantity), 0) FROM #OrderTransaction;

WITH n AS
(
    SELECT TOP (@numsNeeded) i = ROW_NUMBER()
        OVER (ORDER BY c.[object_id])
    FROM sys.columns AS c --CROSS JOIN sys.columns AS c2
)
INSERT #OrderDelivery
(
    OrderTransactionID,
    MediaTypeID
)
SELECT t.OrderTransactionID, p.MediaTypeID
FROM #OrderTransaction AS t
INNER JOIN #Product AS p
    ON t.ProductID = p.ProductID
INNER JOIN n
    ON n.i <= t.Quantity;   -- one output row per unit of Quantity
-- Inserts ONE delivery row per order transaction with a positive Quantity.
-- Unlike the cursor version, it does not repeat the row Quantity times.
INSERT INTO #OrderDelivery ([OrderTransactionId], [MediaTypeId])
SELECT
    OT.OrderTransactionId,
    P.MediaTypeId          -- trailing comma removed: it was a syntax error
FROM #OrderTransaction OT
INNER JOIN #Product P
    ON OT.ProductId = P.ProductId
WHERE OT.Quantity > 0
I feel like I'm misreading the logic here, but isn't that the equivalent?
This still uses a loop but it has gotten rid of the cursor. Short of creating a table of numbers to join on, I think this is the best answer.
-- Cursor-free loop: pass N inserts one delivery row for every transaction
-- whose Quantity is at least N, so each transaction ends up with exactly
-- Quantity rows. The loop stops on the first pass that inserts nothing.
DECLARE @Count AS INTEGER
SET @Count = 1

WHILE (1 = 1)
BEGIN
    -- Two target columns, two selected columns (the extra OT.Quantity column
    -- made the INSERT fail with a column-count mismatch).
    INSERT INTO #OrderDelivery ([OrderTransactionId], [MediaTypeId])
    SELECT OT.OrderTransactionId, P.MediaTypeId
    FROM #OrderTransaction OT
    INNER JOIN #Product P
        ON OT.ProductId = P.ProductId
    WHERE OT.Quantity >= @Count   -- ">" skipped one row per transaction

    IF @@ROWCOUNT = 0
        BREAK

    SET @Count = @Count + 1
END

Function to Calculate Median in SQL Server

According to MSDN, Median is not available as an aggregate function in Transact-SQL. However, I would like to find out whether it is possible to create this functionality (using the Create Aggregate function, user defined function, or some other method).
What would be the best way (if possible) to do this - allow for the calculation of a median value (assuming a numeric data type) in an aggregate query?
If you're using SQL 2005 or better this is a nice, simple-ish median calculation for a single column in a table:
-- Median of Posts.Score: the average of the largest value in the lower half
-- and the smallest value in the upper half. TOP 50 PERCENT rounds the row
-- count up, so for an odd number of rows both halves contain the middle row.
SELECT
(
 (SELECT MAX(Score) FROM
   (SELECT TOP 50 PERCENT Score FROM Posts ORDER BY Score) AS BottomHalf)
 +
 (SELECT MIN(Score) FROM
   (SELECT TOP 50 PERCENT Score FROM Posts ORDER BY Score DESC) AS TopHalf)
) / 2.0 AS Median   -- 2.0, not 2: integer division would truncate e.g. 1.5 to 1
2019 UPDATE: In the 10 years since I wrote this answer, more solutions have been uncovered that may yield better results. Also, SQL Server releases since then (especially SQL 2012) have introduced new T-SQL features that can be used to calculate medians. SQL Server releases have also improved its query optimizer, which may affect the performance of various median solutions. Net-net, my original 2009 post is still OK, but there may be better solutions for modern SQL Server apps. Take a look at this article from 2012, which is a great resource: https://sqlperformance.com/2012/08/t-sql-queries/median
This article found the following pattern to be much, much faster than all other alternatives, at least on the simple schema they tested. This solution was 373x faster (!!!) than the slowest (PERCENTILE_CONT) solution tested. Note that this trick requires two separate queries which may not be practical in all cases. It also requires SQL 2012 or later.
-- Median of dbo.EvenRows.val using OFFSET/FETCH (SQL Server 2012 or later).
DECLARE @c BIGINT = (SELECT COUNT(*) FROM dbo.EvenRows);

SELECT AVG(1.0 * val)
FROM (
    SELECT val FROM dbo.EvenRows
    ORDER BY val
    OFFSET (@c - 1) / 2 ROWS                 -- skip to the (lower) middle row
    FETCH NEXT 1 + (1 - @c % 2) ROWS ONLY    -- 1 row for odd counts, 2 for even
) AS x;
Of course, just because one test on one schema in 2012 yielded great results, your mileage may vary, especially if you're on SQL Server 2014 or later. If perf is important for your median calculation, I'd strongly suggest trying and perf-testing several of the options recommended in that article to make sure that you've found the best one for your schema.
I'd also be especially careful using the (new in SQL Server 2012) function PERCENTILE_CONT that's recommended in one of the other answers to this question, because the article linked above found this built-in function to be 373x slower than the fastest solution. It's possible that this disparity has been improved in the 7 years since, but personally I wouldn't use this function on a large table until I verified its performance vs. other solutions.
ORIGINAL 2009 POST IS BELOW:
There are lots of ways to do this, with dramatically varying performance. Here's one particularly well-optimized solution, from Medians, ROW_NUMBERs, and performance. This is a particularly optimal solution when it comes to actual I/Os generated during execution – it looks more costly than other solutions, but it is actually much faster.
That page also contains a discussion of other solutions and performance testing details. Note the use of a unique column as a disambiguator in case there are multiple rows with the same value of the median column.
As with all database performance scenarios, always try to test a solution out with real data on real hardware – you never know when a change to SQL Server's optimizer or a peculiarity in your environment will make a normally-speedy solution slower.
SELECT
CustomerId,
AVG(TotalDue)
FROM
(
SELECT
CustomerId,
TotalDue,
-- SalesOrderId in the ORDER BY is a disambiguator to break ties
ROW_NUMBER() OVER (
PARTITION BY CustomerId
ORDER BY TotalDue ASC, SalesOrderId ASC) AS RowAsc,
ROW_NUMBER() OVER (
PARTITION BY CustomerId
ORDER BY TotalDue DESC, SalesOrderId DESC) AS RowDesc
FROM Sales.SalesOrderHeader SOH
) x
WHERE
RowAsc IN (RowDesc, RowDesc - 1, RowDesc + 1)
GROUP BY CustomerId
ORDER BY CustomerId;
In SQL Server 2012 you should use PERCENTILE_CONT:
SELECT SalesOrderID, OrderQty,
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY OrderQty)
OVER (PARTITION BY SalesOrderID) AS MedianCont
FROM Sales.SalesOrderDetail
WHERE SalesOrderID IN (43670, 43669, 43667, 43663)
ORDER BY SalesOrderID DESC
See also : http://blog.sqlauthority.com/2011/11/20/sql-server-introduction-to-percentile_cont-analytic-functions-introduced-in-sql-server-2012/
My original quick answer was:
select max(my_column) as [my_column], quartile
from (select my_column, ntile(4) over (order by my_column) as [quartile]
from my_table) i
--where quartile = 2
group by quartile
This will give you the median and interquartile range in one fell swoop. If you really only want one row that is the median then uncomment the where clause.
When you stick that into an explain plan, 60% of the work is sorting the data which is unavoidable when calculating position dependent statistics like this.
I've amended the answer to follow the excellent suggestion from Robert Ševčík-Robajz in the comments below:
;with PartitionedData as
(select my_column, ntile(10) over (order by my_column) as [percentile]
from my_table),
MinimaAndMaxima as
(select min(my_column) as [low], max(my_column) as [high], percentile
from PartitionedData
group by percentile)
select
case
when b.percentile = 10 then cast(b.high as decimal(18,2))
else cast((a.low + b.high) as decimal(18,2)) / 2
end as [value], --b.high, a.low,
b.percentile
from MinimaAndMaxima a
join MinimaAndMaxima b on (a.percentile -1 = b.percentile) or (a.percentile = 10 and b.percentile = 10)
--where b.percentile = 5
This should calculate the correct median and percentile values when you have an even number of data items. Again, uncomment the final where clause if you only want the median and not the entire percentile distribution.
Even better:
-- Median of dbo.EvenRows.val: average of the one (odd count) or two (even
-- count) middle rows. @Median is expected to be declared by the surrounding
-- batch before this statement runs.
SELECT @Median = AVG(1.0 * val)
FROM
(
    SELECT o.val, rn = ROW_NUMBER() OVER (ORDER BY o.val), c.c
    FROM dbo.EvenRows AS o
    CROSS JOIN (SELECT c = COUNT(*) FROM dbo.EvenRows) AS c
) AS x
WHERE rn IN ((c + 1)/2, (c + 2)/2);   -- both expressions coincide for odd c
From the master Himself, Itzik Ben-Gan!
MS SQL Server 2012 (and later) has the PERCENTILE_DISC function which computes a specific percentile for sorted values. PERCENTILE_DISC (0.5) will compute the median - https://msdn.microsoft.com/en-us/library/hh231327.aspx
Simple, fast, accurate
SELECT x.Amount
FROM (SELECT amount,
Count(1) OVER (partition BY 'A') AS TotalRows,
Row_number() OVER (ORDER BY Amount ASC) AS AmountOrder
FROM facttransaction ft) x
WHERE x.AmountOrder = Round(x.TotalRows / 2.0, 0)
If you want to use the Create Aggregate function in SQL Server, this is how to do it. Doing it this way has the benefit of being able to write clean queries. Note that this process could be adapted to calculate a Percentile value fairly easily.
Create a new Visual Studio project and set the target framework to .NET 3.5 (this is for SQL 2008, it may be different in SQL 2012). Then create a class file and put in the following code, or c# equivalent:
Imports Microsoft.SqlServer.Server
Imports System.Data.SqlTypes
Imports System.IO
<Serializable>
<SqlUserDefinedAggregate(Format.UserDefined, IsInvariantToNulls:=True, IsInvariantToDuplicates:=False, _
IsInvariantToOrder:=True, MaxByteSize:=-1, IsNullIfEmpty:=True)>
''' <summary>
''' SQL CLR user-defined aggregate that returns the median of the non-NULL
''' decimal values it is given. State is serialized as a comma-separated
''' string of invariant-culture decimals.
''' </summary>
Public Class Median
    Implements IBinarySerialize

    ' Values accumulated so far (unsorted until Terminate).
    Private _items As List(Of Decimal)

    ''' <summary>Resets the aggregate to an empty state.</summary>
    Public Sub Init()
        _items = New List(Of Decimal)()
    End Sub

    ''' <summary>Adds one input value; NULL inputs are ignored.</summary>
    Public Sub Accumulate(value As SqlDecimal)
        If Not value.IsNull Then
            _items.Add(value.Value)
        End If
    End Sub

    ''' <summary>Folds the values of another partial aggregate into this one.</summary>
    Public Sub Merge(other As Median)
        If other._items IsNot Nothing Then
            _items.AddRange(other._items)
        End If
    End Sub

    ''' <summary>
    ''' Returns the median: the middle value for an odd count, the mean of the
    ''' two middle values for an even count, and an empty SqlDecimal when no
    ''' values were accumulated.
    ''' </summary>
    Public Function Terminate() As SqlDecimal
        If _items.Count = 0 Then
            Return New SqlDecimal()
        End If

        _items.Sort()

        ' Integer division (\) yields a valid list index; "/" produces a Double.
        Dim middle As Integer = _items.Count \ 2

        If _items.Count Mod 2 = 0 Then
            ' 2D is a Decimal literal, so the mean is computed in Decimal
            ' arithmetic (the previous 2# literal forced a lossy Double division).
            Return New SqlDecimal((_items(middle - 1) + _items(middle)) / 2D)
        End If

        Return New SqlDecimal(_items(middle))
    End Function

    ''' <summary>Restores the value list from its comma-separated string form.</summary>
    Public Sub Read(r As BinaryReader) Implements IBinarySerialize.Read
        _items = New List(Of Decimal)()
        For Each token As String In r.ReadString().Split(","c)
            Dim number As Decimal
            ' Invariant culture: must mirror Write regardless of server locale.
            ' An empty token (empty list) fails to parse and is skipped.
            If Decimal.TryParse(token, System.Globalization.NumberStyles.Number, System.Globalization.CultureInfo.InvariantCulture, number) Then
                _items.Add(number)
            End If
        Next
    End Sub

    ''' <summary>Writes the value list as one comma-separated string.</summary>
    Public Sub Write(w As BinaryWriter) Implements IBinarySerialize.Write
        ' Format with the invariant culture: under a culture whose decimal
        ' separator is ",", ToString() would emit commas inside a number and
        ' Read would split one value into two.
        Dim parts(_items.Count - 1) As String
        For index As Integer = 0 To _items.Count - 1
            parts(index) = _items(index).ToString(System.Globalization.CultureInfo.InvariantCulture)
        Next
        w.Write(String.Join(",", parts))
    End Sub
End Class
Then compile it and copy the DLL and PDB file to your SQL Server machine and run the following command in SQL Server:
-- Register the compiled DLL, then expose its Median class as a T-SQL
-- aggregate. Replace the {...} placeholders with your own path and namespace.
CREATE ASSEMBLY CustomAggregate FROM '{path to your DLL}'
WITH PERMISSION_SET=SAFE;
GO
-- Aggregate parameters are variables and therefore take the @ prefix.
CREATE AGGREGATE Median(@value decimal(9, 3))
RETURNS decimal(9, 3)
EXTERNAL NAME [CustomAggregate].[{namespace of your DLL}.Median];
GO
You can then write a query to calculate the median like this:
SELECT dbo.Median(Field) FROM Table
I just came across this page while looking for a set-based solution to median. After looking at some of the solutions here, I came up with the following. Hope it helps/works.
DECLARE #test TABLE(
i int identity(1,1),
id int,
score float
)
INSERT INTO #test (id,score) VALUES (1,10)
INSERT INTO #test (id,score) VALUES (1,11)
INSERT INTO #test (id,score) VALUES (1,15)
INSERT INTO #test (id,score) VALUES (1,19)
INSERT INTO #test (id,score) VALUES (1,20)
INSERT INTO #test (id,score) VALUES (2,20)
INSERT INTO #test (id,score) VALUES (2,21)
INSERT INTO #test (id,score) VALUES (2,25)
INSERT INTO #test (id,score) VALUES (2,29)
INSERT INTO #test (id,score) VALUES (2,30)
INSERT INTO #test (id,score) VALUES (3,20)
INSERT INTO #test (id,score) VALUES (3,21)
INSERT INTO #test (id,score) VALUES (3,25)
INSERT INTO #test (id,score) VALUES (3,29)
DECLARE #counts TABLE(
id int,
cnt int
)
INSERT INTO #counts (
id,
cnt
)
SELECT
id,
COUNT(*)
FROM
#test
GROUP BY
id
SELECT
drv.id,
drv.start,
AVG(t.score)
FROM
(
SELECT
MIN(t.i)-1 AS start,
t.id
FROM
#test t
GROUP BY
t.id
) drv
INNER JOIN #test t ON drv.id = t.id
INNER JOIN #counts c ON t.id = c.id
WHERE
t.i = ((c.cnt+1)/2)+drv.start
OR (
t.i = (((c.cnt+1)%2) * ((c.cnt+2)/2))+drv.start
AND ((c.cnt+1)%2) * ((c.cnt+2)/2) <> 0
)
GROUP BY
drv.id,
drv.start
The following query returns the median from a list of values in one column. It cannot be used as or along with an aggregate function, but you can still use it as a sub-query with a WHERE clause in the inner select.
SQL Server 2005+:
SELECT TOP 1 value from
(
SELECT TOP 50 PERCENT value
FROM table_name
ORDER BY value
)for_median
ORDER BY value DESC
Although Justin Grant's solution appears solid, I found that when you have a number of duplicate values within a given partition key, the row numbers for the ASC duplicate values end up out of sequence, so they do not properly align.
Here is a fragment from my result:
KEY VALUE ROWA ROWD
13 2 22 182
13 1 6 183
13 1 7 184
13 1 8 185
13 1 9 186
13 1 10 187
13 1 11 188
13 1 12 189
13 0 1 190
13 0 2 191
13 0 3 192
13 0 4 193
13 0 5 194
I used Justin's code as the basis for this solution. Although not as efficient given the use of multiple derived tables it does resolve the row ordering problem I encountered. Any improvements would be welcome as I am not that experienced in T-SQL.
SELECT PKEY, cast(AVG(VALUE)as decimal(5,2)) as MEDIANVALUE
FROM
(
SELECT PKEY,VALUE,ROWA,ROWD,
'FLAG' = (CASE WHEN ROWA IN (ROWD,ROWD-1,ROWD+1) THEN 1 ELSE 0 END)
FROM
(
SELECT
PKEY,
cast(VALUE as decimal(5,2)) as VALUE,
ROWA,
ROW_NUMBER() OVER (PARTITION BY PKEY ORDER BY ROWA DESC) as ROWD
FROM
(
SELECT
PKEY,
VALUE,
ROW_NUMBER() OVER (PARTITION BY PKEY ORDER BY VALUE ASC,PKEY ASC ) as ROWA
FROM [MTEST]
)T1
)T2
)T3
WHERE FLAG = '1'
GROUP BY PKEY
ORDER BY PKEY
In a UDF, write:
-- Lower median: the smallest medianSortColumn value v such that at least half
-- of the rows (rounded up) have a value <= v.
-- The previous WHERE clause had no correlation to T and no comparison operator,
-- so it was not a valid predicate.
-- NOTE(review): "Table" is the answer's placeholder name; it is bracketed here
-- because TABLE is a reserved word.
SELECT TOP 1 T.medianSortColumn
FROM [Table] AS T
WHERE (SELECT COUNT(*)
       FROM [Table] AS le
       WHERE le.medianSortColumn <= T.medianSortColumn)
      >= ((SELECT COUNT(*) FROM [Table]) + 1) / 2
ORDER BY T.medianSortColumn
Justin's example above is very good. But that Primary key need should be stated very clearly. I have seen that code in the wild without the key and the results are bad.
The complaint I get about PERCENTILE_CONT is that it won't give you an actual value from the dataset.
To get to a "median" that is an actual value from the dataset use Percentile_Disc.
-- Discrete median of OrderQty per sales order: PERCENTILE_DISC returns a value
-- that actually occurs in the data. The alias says "Disc" to match the
-- function (it previously read MedianCont, copied from the PERCENTILE_CONT
-- example).
SELECT SalesOrderID, OrderQty,
    PERCENTILE_DISC(0.5)
        WITHIN GROUP (ORDER BY OrderQty)
        OVER (PARTITION BY SalesOrderID) AS MedianDisc
FROM Sales.SalesOrderDetail
WHERE SalesOrderID IN (43670, 43669, 43667, 43663)
ORDER BY SalesOrderID DESC
Using a single statement - One way is to use ROW_NUMBER(), COUNT() window function and filter the sub-query. Here is to find the median salary:
SELECT AVG(e_salary)
FROM
(SELECT
ROW_NUMBER() OVER(ORDER BY e_salary) as row_no,
e_salary,
(COUNT(*) OVER()+1)*0.5 AS row_half
FROM Employee) t
WHERE row_no IN (FLOOR(row_half),CEILING(row_half))
I have seen similar solutions over the net using FLOOR and CEILING but tried to use a single statement. (edited)
Median Finding
This is the simplest method to find the median of an attribute.
-- Returns every salary that has as many salaries strictly below it as
-- strictly above it. All three references read the same table (the
-- subqueries previously read an unrelated "station" table).
-- Returns no row when no salary balances the two counts, e.g. an even number
-- of distinct salaries.
SELECT ROUND(S.salary, 4) AS median
FROM employee AS S
WHERE (SELECT COUNT(salary) FROM employee WHERE salary < S.salary)
    = (SELECT COUNT(salary) FROM employee WHERE salary > S.salary)
See other solutions for median calculation in SQL here:
"Simple way to calculate median with MySQL" (the solutions are mostly vendor-independent).
Building on Jeff Atwood's answer above here it is with GROUP BY and a correlated subquery to get the median for each group.
-- Per-TestID median of Posts.Score via correlated TOP 50 PERCENT subqueries,
-- alongside the average, minimum and maximum Score of Posts_parent.
SELECT TestID,
(
 (SELECT MAX(Score) FROM
   (SELECT TOP 50 PERCENT Score FROM Posts WHERE TestID = Posts_parent.TestID ORDER BY Score) AS BottomHalf)
 +
 (SELECT MIN(Score) FROM
   (SELECT TOP 50 PERCENT Score FROM Posts WHERE TestID = Posts_parent.TestID ORDER BY Score DESC) AS TopHalf)
) / 2.0 AS MedianScore,   -- 2.0, not 2: avoids integer truncation of the mean
AVG(Score) AS AvgScore, MIN(Score) AS MinScore, MAX(Score) AS MaxScore
FROM Posts_parent
GROUP BY Posts_parent.TestID
For a continuous variable/measure 'col1' from 'table1'
-- Median of col1 over all of table1.
-- Each row gets its ascending position and the total row count; the one
-- (odd count) or two (even count) middle positions are then averaged.
-- This replaces the "TOP 50 PERCENT ... WHERE Rowa = Rowd" form, which kept
-- an arbitrary half of the rows (TOP without ORDER BY), returned nothing for
-- an even row count, and could miss the middle row when values are tied.
select avg(1.0 * tmp.col1) as col1   -- 1.0 * avoids integer averaging
from
    (select col1,
            ROW_NUMBER() OVER (ORDER BY col1 ASC) AS rn,
            COUNT(*) OVER () AS cnt
     from table1) tmp
-- integer division: both expressions name the same row when cnt is odd
where tmp.rn in ((tmp.cnt + 1) / 2, (tmp.cnt + 2) / 2)
Frequently, we need to calculate the median not just for the whole table, but per group with respect to some ID. In other words, calculate the median for each ID in our table, where each ID has many records. (Based on the solution edited by @gdoron: good performance, and it works in many SQL dialects.)
-- Median of our_val for every our_id.
-- Inside each id the rows are numbered by value and the group size is
-- attached; the one or two middle positions are then averaged.
SELECT
    ranked.our_id,
    AVG(1.0 * ranked.our_val) AS Median
FROM (
    SELECT
        our_id,
        our_val,
        ROW_NUMBER() OVER (PARTITION BY our_id ORDER BY our_val) AS pos,
        COUNT(*) OVER (PARTITION BY our_id) AS n
    FROM our_table
) AS ranked
-- integer division: both expressions give the same position when n is odd
WHERE ranked.pos = (ranked.n + 1) / 2
   OR ranked.pos = (ranked.n + 2) / 2
GROUP BY ranked.our_id;
Hope it helps.
For large scale datasets, you can try this GIST:
https://gist.github.com/chrisknoll/1b38761ce8c5016ec5b2
It works by aggregating the distinct values you would find in your set (such as ages, or year of birth, etc.), and uses SQL window functions to locate any percentile position you specify in the query.
To get median value of salary from employee table
-- Median salary: number the salaries in ascending order, then average the
-- row(s) at the middle position(s) derived from the total row count.
with ordered as (
    select
        salary,
        ROW_NUMBER() over (order by salary asc) as pos
    from employees
)
select avg(salary)
from ordered
where pos = (select (count(*) + 1) / 2 from employees)
   or pos = (select (count(*) + 2) / 2 from employees);
I wanted to work out a solution by myself, but my brain tripped and fell on the way. I think it works, but don't ask me to explain it in the morning. :P
-- Self-contained demo: median of eight sample integers held in a table
-- variable. T-SQL variables must start with '@' (the '#' sigils were not
-- valid), and the average is taken over a decimal so that the two middle
-- values 15 and 22 give 18.5 instead of the integer-truncated 18.
DECLARE @table AS TABLE
(
    Number int NOT NULL
);

INSERT INTO @table (Number) VALUES (2);
INSERT INTO @table (Number) VALUES (4);
INSERT INTO @table (Number) VALUES (9);
INSERT INTO @table (Number) VALUES (15);
INSERT INTO @table (Number) VALUES (22);
INSERT INTO @table (Number) VALUES (26);
INSERT INTO @table (Number) VALUES (37);
INSERT INTO @table (Number) VALUES (49);

DECLARE @Count AS INT;
SELECT @Count = COUNT(*) FROM @table;

WITH MyResults (RowNo, Number) AS
(
    SELECT ROW_NUMBER() OVER (ORDER BY Number) AS RowNo, Number
    FROM @table
)
SELECT AVG(1.0 * Number)
FROM MyResults
-- First predicate: the middle row (odd count) or lower-middle row (even).
-- Second predicate: (@Count+1)%2 is 1 only for an even count, so it selects
-- the upper-middle row then and row 0 (which never exists) otherwise.
WHERE RowNo = (@Count + 1) / 2
   OR RowNo = ((@Count + 1) % 2) * ((@Count + 2) / 2);
-- Purpose: for each of the 13 months before the current one, compute the
-- median number of days closed requests stayed open and list the results
-- by month.
-- NOTE(review): the leading '#' on variable names looks like a mangled '@'
-- (T-SQL variables start with '@') - confirm before running.
--Create table variable to store one (month, median) row per loop iteration
DECLARE #results AS TABLE
(
[Month] datetime not null
,[Median] int not null
);
--Month offset relative to the current month; runs from -13 up to -1
DECLARE #IntDate as int
set #IntDate = -13
WHILE (#IntDate < 0)
BEGIN
--Working table for the month being processed (emptied by the DELETE at the end of each iteration)
DECLARE #table AS TABLE
(
[Rank] int not null
,[Days Open] int not null
);
--Insert one row per qualifying request closed in the target month
insert into #table
SELECT
-- open_date / close_date are added as seconds to '1970' to obtain datetimes.
-- [Rank] orders by close month, then days open, then ref_num; the median
-- query further down re-numbers the rows itself and does not read [Rank].
rank() OVER (ORDER BY DATEADD(mm, DATEDIFF(mm, 0, DATEADD(ss, SVR.close_date, '1970')), 0), DATEDIFF(day,DATEADD(ss, SVR.open_date, '1970'),DATEADD(ss, SVR.close_date, '1970')),[SVR].[ref_num]) as [Rank]
,DATEDIFF(day,DATEADD(ss, SVR.open_date, '1970'),DATEADD(ss, SVR.close_date, '1970')) as [Days Open]
FROM
mdbrpt.dbo.View_Request SVR
LEFT OUTER JOIN dbo.dtv_apps_systems vapp
on SVR.category = vapp.persid
LEFT OUTER JOIN dbo.prob_ctg pctg
on SVR.category = pctg.persid
Left Outer Join [mdbrpt].[dbo].[rootcause] as [Root Cause]
on [SVR].[rootcause]=[Root Cause].[id]
Left Outer Join [mdbrpt].[dbo].[cr_stat] as [Status]
on [SVR].[status]=[Status].[code]
LEFT OUTER JOIN [mdbrpt].[dbo].[net_res] as [net]
on [net].[id]=SVR.[affected_rc]
-- The filters below on [Status] and [Root Cause] reject rows where those
-- outer joins found no match, so those two joins behave like inner joins.
WHERE
SVR.Type IN ('P')
AND
SVR.close_date IS NOT NULL
AND
[Status].[SYM] = 'Closed'
AND
SVR.parent is null
AND
[Root Cause].[sym] in ( 'RC - Application','RC - Hardware', 'RC - Operational', 'RC - Unknown')
AND
-- AND binds tighter than OR, so this group reads:
--   appl_name IN (...) OR (pctg.sym IN (...) AND nr_desc IN (...))
(
[vapp].[appl_name] in ('3PI','Billing Rpts/Files','Collabrent','Reports','STMS','STMS 2','Telco','Comergent','OOM','C3-BAU','C3-DD','DIRECTV','DIRECTV Sales','DIRECTV Self Care','Dealer Website','EI Servlet','Enterprise Integration','ET','ICAN','ODS','SB-SCM','SeeBeyond','Digital Dashboard','IVR','OMS','Order Services','Retail Services','OSCAR','SAP','CTI','RIO','RIO Call Center','RIO Field Services','FSS-RIO3','TAOS','TCS')
OR
pctg.sym in ('Systems.Release Health Dashboard.Problem','DTV QA Test.Enterprise Release.Deferred Defect Log')
AND
[Net].[nr_desc] in ('3PI','Billing Rpts/Files','Collabrent','Reports','STMS','STMS 2','Telco','Comergent','OOM','C3-BAU','C3-DD','DIRECTV','DIRECTV Sales','DIRECTV Self Care','Dealer Website','EI Servlet','Enterprise Integration','ET','ICAN','ODS','SB-SCM','SeeBeyond','Digital Dashboard','IVR','OMS','Order Services','Retail Services','OSCAR','SAP','CTI','RIO','RIO Call Center','RIO Field Services','FSS-RIO3','TAOS','TCS')
)
AND
-- Keep only requests whose close month equals the month #IntDate months
-- away from today (both sides are truncated to the first of the month).
DATEADD(mm, DATEDIFF(mm, 0, DATEADD(ss, SVR.close_date, '1970')), 0) = DATEADD(mm, DATEDIFF(mm,0,DATEADD(mm,#IntDate,getdate())), 0)
ORDER BY [Days Open]
--Number of rows collected for this month
DECLARE #Count AS INT
SELECT #Count = COUNT(*) FROM #table;
--Number the rows by [Days Open] and average the middle row(s)
WITH MyResults(RowNo, [Days Open]) AS
(
SELECT RowNo, [Days Open] FROM
(SELECT ROW_NUMBER() OVER (ORDER BY [Days Open]) AS RowNo, [Days Open] FROM #table) AS Foo
)
insert into #results
SELECT
DATEADD(mm, DATEDIFF(mm,0,DATEADD(mm,#IntDate,getdate())), 0) as [Month]
-- RowNo = (#Count+1)/2 is the middle (or lower-middle) row; the second term
-- is non-zero only for an even #Count and then selects the upper-middle row.
-- NOTE(review): [Days Open] is int, so AVG yields an integer here, and a
-- month with no rows would produce NULL for the NOT NULL [Median] column.
,AVG([Days Open])as [Median] FROM MyResults WHERE RowNo = (#Count+1)/2 OR RowNo = ((#Count+1)%2) * ((#Count+2)/2)
--Advance to the next month and empty the working table
set #IntDate = #IntDate+1
DELETE FROM #table
END
--Return the collected medians in chronological order
select *
from #results
order by [Month]
This works with SQL 2000:
-- Median without window functions (table variables + IDENTITY only).
-- Values are copied in ascending order into @RowAsc so that ID is the
-- 1-based rank of each value; the middle row(s) are then averaged.
-- Fixes relative to the original: variables use the '@' sigil T-SQL requires,
-- the average is taken over a decimal so two integer middle values are not
-- truncated, and the redundant "ID IN (SELECT ID ...)" wrapper is removed.
DECLARE @testTable TABLE
(
    VALUE INT
)
-- Uncomment exactly one of the two fixtures below to try the query.
--INSERT INTO @testTable -- Even Test
--SELECT 3 UNION ALL
--SELECT 5 UNION ALL
--SELECT 7 UNION ALL
--SELECT 12 UNION ALL
--SELECT 13 UNION ALL
--SELECT 14 UNION ALL
--SELECT 21 UNION ALL
--SELECT 23 UNION ALL
--SELECT 23 UNION ALL
--SELECT 23 UNION ALL
--SELECT 23 UNION ALL
--SELECT 29 UNION ALL
--SELECT 40 UNION ALL
--SELECT 56
--
--INSERT INTO @testTable -- Odd Test
--SELECT 3 UNION ALL
--SELECT 5 UNION ALL
--SELECT 7 UNION ALL
--SELECT 12 UNION ALL
--SELECT 13 UNION ALL
--SELECT 14 UNION ALL
--SELECT 21 UNION ALL
--SELECT 23 UNION ALL
--SELECT 23 UNION ALL
--SELECT 23 UNION ALL
--SELECT 23 UNION ALL
--SELECT 29 UNION ALL
--SELECT 39 UNION ALL
--SELECT 40 UNION ALL
--SELECT 56
DECLARE @RowAsc TABLE
(
    ID INT IDENTITY,
    Amount INT
)
INSERT INTO @RowAsc (Amount)
SELECT VALUE
FROM @testTable
ORDER BY VALUE ASC
-- With n rows, MAX(ID) / 2.0 is n/2. "ID - n/2 BETWEEN 0 AND 1" keeps
-- IDs n/2 and n/2 + 1 when n is even, and only ID (n + 1)/2 when n is odd.
SELECT AVG(1.0 * ra.Amount)
FROM @RowAsc ra
WHERE ra.ID -
    (
        SELECT MAX(ID) / 2.0
        FROM @RowAsc
    ) BETWEEN 0 AND 1
For newbies like myself who are learning the very basics, I personally find this example easier to follow, as it is easier to understand exactly what's happening and where median values are coming from...
-- Median of two derived measures over dbo.table1.
-- (max + min) / 2 is the midrange, not the median: it depends only on the
-- two extreme values. PERCENTILE_CONT(0.5) returns the true (interpolated)
-- median instead. It is a window function, so DISTINCT collapses the
-- identical per-row results to a single row.
-- xxxxxxxxxxxxxx is the original's placeholder for the second measure.
select distinct
 percentile_cont(0.5) within group (order by a.[Value1]) over () as [Median Value1]
,percentile_cont(0.5) within group (order by a.[Value2]) over () as [Median Value2]
from (select
 datediff(dd,startdate,enddate) as [Value1]
,xxxxxxxxxxxxxx as [Value2]
from dbo.table1
)a
In absolute awe of some of the codes above though!!!
This is as simple an answer as I could come up with. Worked well with my data. If you want to exclude certain values just add a where clause to the inner select.
-- Median as "the largest value in the lower half": take the first
-- (n + 1) / 2 values in ascending order and return the last of them.
-- Using (n + 1) / 2 rather than n / 2 fixes the off-by-one for odd row
-- counts (5 rows -> 3rd value, not 2nd) and the empty result for one row.
-- For an even count this returns the lower of the two middle values.
SELECT TOP 1
    A.ValueField AS MedianValue
FROM
    (SELECT TOP (SELECT (COUNT(1) + 1) / 2 FROM tTABLE)
        ValueField
    FROM
        tTABLE
    ORDER BY
        ValueField) A
ORDER BY
    A.ValueField DESC
The following solution works under these assumptions:
No duplicate values
No NULLs
Code:
-- Rebuild the demo table R with six sample values.
IF OBJECT_ID('dbo.R', 'U') IS NOT NULL
    DROP TABLE dbo.R

CREATE TABLE R (
    A FLOAT NOT NULL
);

INSERT INTO R VALUES (1);
INSERT INTO R VALUES (2);
INSERT INTO R VALUES (3);
INSERT INTO R VALUES (4);
INSERT INTO R VALUES (5);
INSERT INTO R VALUES (6);

-- Returns Median(R): the mean of every value whose count of smaller values
-- and count of larger values differ by at most one.
SELECT SUM(mid.A) / CAST(COUNT(mid.A) AS FLOAT)
FROM R AS mid
WHERE ABS(
          (SELECT COUNT(lo.A) FROM R AS lo WHERE mid.A > lo.A)
        - (SELECT COUNT(hi.A) FROM R AS hi WHERE mid.A < hi.A)
      ) <= 1;
-- Median of MyTable.Observations via an IDENTITY-numbered copy.
-- Rows are inserted in ascending order so ID is each value's 1-based rank;
-- the row at position (n + 1) / 2 is returned.
-- Fixes: '@' variable sigils (required by T-SQL), ORDER BY on the column
-- name instead of its position, and (n + 1) / 2 instead of n / 2 so an odd
-- row count returns the true middle row and one row no longer returns
-- nothing. For an even count the lower of the two middle values is returned.
DECLARE @Obs int
DECLARE @RowAsc table
(
    ID INT IDENTITY,
    Observation FLOAT
)
INSERT INTO @RowAsc (Observation)
SELECT Observations FROM MyTable
ORDER BY Observations
SELECT @Obs = (COUNT(*) + 1) / 2 FROM @RowAsc
SELECT Observation AS Median FROM @RowAsc WHERE ID = @Obs
I tried several alternatives, but because my data has repeated values, the ROW_NUMBER versions were not an option for me. So here is the query I used (a version with NTILE):
-- Median TotalDue per customer using NTILE(2).
-- Each customer's rows are split in two twice, once ascending and once
-- descending; the median is the mean of the largest value in the first
-- ascending tile and the smallest value in the first descending tile.
SELECT
    halves.CustomerId,
    (
        MAX(CASE WHEN halves.Percent50_Asc = 1 THEN halves.TotalDue END) +
        MIN(CASE WHEN halves.Percent50_desc = 1 THEN halves.TotalDue END)
    ) / 2 AS MEDIAN
FROM (
    SELECT
        SOH.CustomerId,
        SOH.TotalDue,
        NTILE(2) OVER (PARTITION BY SOH.CustomerId ORDER BY SOH.TotalDue ASC) AS Percent50_Asc,
        NTILE(2) OVER (PARTITION BY SOH.CustomerId ORDER BY SOH.TotalDue DESC) AS Percent50_desc
    FROM Sales.SalesOrderHeader SOH
) halves
GROUP BY halves.CustomerId
ORDER BY halves.CustomerId;
For your question, Jeff Atwood had already given the simple and effective solution. But, if you are looking for some alternative approach to calculate the median, below SQL code will help you.
-- Demo: median salary from a small employees table.
create table employees(salary int);

insert into employees (salary) values (8);
insert into employees (salary) values (23);
insert into employees (salary) values (45);
insert into employees (salary) values (123);
insert into employees (salary) values (93);
insert into employees (salary) values (2342);
insert into employees (salary) values (2238);

select salary from employees;

-- @cnt: total rows; @middle_no: 1-based upper-middle position;
-- @odd_even: -1 for an even count (also take the row before), else 0.
-- Variables use the '@' sigil that T-SQL requires.
declare @cnt int;
declare @middle_no int;
declare @odd_even int;

set @cnt = (select count(*) from employees);
set @middle_no = (@cnt / 2) + 1;
set @odd_even = case when @cnt % 2 = 0 then -1 else 0 end;

-- Rows are numbered without "group by salary": grouping collapsed duplicate
-- salaries while @cnt still counted every row, so the positions no longer
-- lined up. 1.0 * keeps the average of two integer salaries from truncating.
select avg(1.0 * tbl.salary)
from (
    select salary, ROW_NUMBER() over (order by salary) as rno
    from employees
) tbl
where tbl.rno = @middle_no
   or tbl.rno = @middle_no + @odd_even;
If you are looking to calculate median in MySQL, this github link will be useful.

Resources