SQL Server ROW_NUMBER() with GROUP BY

The View in SQL Server 2008 has a GROUP BY and it refines the records down to 23. The same query in a ROW_NUMBER() stored procedure returns 58 records. I have tried various combinations of partitions to return the same result, including all fields (like the View) and other pairings, but nothing works.
Hope someone can shed some light on this.
The stored procedure that doesn't group correctly (returns 58 records). @Filter contains johndoe, just like the View (see below):
ALTER PROCEDURE [dbo].[Get_Reporting2]
@OutTotalRecCount INT OUTPUT,
@CurrentPage INT,
@PageSize INT,
@SortDirection INT,
@SortField nvarchar(50),
@Filter nvarchar(50)
AS
SELECT *
FROM
(SELECT ROW_NUMBER() OVER (
PARTITION BY
--dbo.ppoma_fsa.OBJID,
--dbo.ppoma_fsa.OTYPE,
--dbo.ppoma_fsa.STEXT,
--dbo.ppoma_fsa.LOW,
--dbo.ppoma_assigned.SOBID,
dbo.ppoma_us_bp.Personal_No,
dbo.ppoma_us_bp.UserID,
dbo.ppoma_us_bp.BP,
dbo.ppoma_us_bp.NAME_FIRST,
dbo.ppoma_us_bp.NAME_LAST
--dbo.ppoma_us_bp.UserID,
--dbo.ppoma_fsa.LOW
ORDER BY
CASE WHEN @SortDirection = 1 AND @SortField = 1 THEN ppoma_fsa.OBJID END DESC,
CASE WHEN @SortDirection = 1 AND @SortField = 2 THEN ppoma_fsa.OTYPE END ASC,
CASE WHEN @SortDirection = 2 AND @SortField = 2 THEN ppoma_fsa.OTYPE END DESC
)
AS Row,
dbo.ppoma_fsa.OBJID as _OBJID,
dbo.ppoma_fsa.OTYPE as _OTYPE,
dbo.ppoma_fsa.STEXT as _STEXT,
dbo.ppoma_fsa.LOW,
dbo.ppoma_assigned.SOBID AS Depot_BP,
dbo.ppoma_us_bp.Personal_No,
dbo.ppoma_us_bp.UserID,
dbo.ppoma_us_bp.BP,
dbo.ppoma_us_bp.NAME_FIRST,
dbo.ppoma_us_bp.NAME_LAST
FROM dbo.ppoma_assigned INNER JOIN
dbo.ppoma_us_bp ON dbo.ppoma_assigned.SOBID = dbo.ppoma_us_bp.OBJID INNER JOIN
dbo.ppoma_fsa ON dbo.ppoma_assigned.OBJID = dbo.ppoma_fsa.OBJID
WHERE (dbo.ppoma_us_bp.UserID LIKE N'%' + @Filter + '%') AND (NOT (dbo.ppoma_fsa.LOW = 'Service'))
)
AS TeamWithRowNumbers
WHERE Row >= (@CurrentPage - 1) * @PageSize + 1 AND Row <= @CurrentPage * @PageSize
SELECT @OutTotalRecCount = COUNT(*)
FROM dbo.ppoma_assigned INNER JOIN
dbo.ppoma_us_bp ON dbo.ppoma_assigned.SOBID = dbo.ppoma_us_bp.OBJID INNER JOIN
dbo.ppoma_fsa ON dbo.ppoma_assigned.OBJID = dbo.ppoma_fsa.OBJID
WHERE (dbo.ppoma_us_bp.UserID LIKE N'%' + @Filter + '%') AND (NOT (dbo.ppoma_fsa.LOW = 'Service'))
The View with Group By that returns 23 records:
SELECT dbo.ppoma_fsa.OBJID, dbo.ppoma_fsa.OTYPE, dbo.ppoma_fsa.STEXT, dbo.ppoma_fsa.LOW, dbo.ppoma_assigned.SOBID AS Depot_BP,
dbo.ppoma_us_bp.Personal_No, dbo.ppoma_us_bp.UserID, dbo.ppoma_us_bp.BP, dbo.ppoma_us_bp.NAME_FIRST, dbo.ppoma_us_bp.NAME_LAST
FROM dbo.ppoma_assigned INNER JOIN
dbo.ppoma_fsa ON dbo.ppoma_assigned.OBJID = dbo.ppoma_fsa.OBJID INNER JOIN
dbo.ppoma_us_bp ON dbo.ppoma_assigned.SOBID = dbo.ppoma_us_bp.OBJID
GROUP BY dbo.ppoma_fsa.OBJID, dbo.ppoma_fsa.OTYPE, dbo.ppoma_fsa.STEXT, dbo.ppoma_fsa.LOW, dbo.ppoma_assigned.SOBID, dbo.ppoma_us_bp.Personal_No,
dbo.ppoma_us_bp.UserID, dbo.ppoma_us_bp.BP, dbo.ppoma_us_bp.NAME_FIRST, dbo.ppoma_us_bp.NAME_LAST
HAVING (dbo.ppoma_us_bp.UserID = 'johndoe')

Without seeing any sample data it's impossible to say for certain.
It could be because the proc is using a LIKE on the filter parameter, and the view uses =.
Stored Proc:
WHERE (dbo.ppoma_us_bp.UserID LIKE N'%' + @Filter + '%')
View:
HAVING (dbo.ppoma_us_bp.UserID = 'johndoe')
Also, given that the view is using GROUP BY, it would very likely return fewer rows if there are any duplicates in the data. The ROW_NUMBER function is in no way like GROUP BY; they don't do the same thing at all.
GROUP BY is for aggregating, usually in order to do a SUM or other aggregate function: https://msdn.microsoft.com/en-us/library/ms177673.aspx
ROW_NUMBER is just for assigning a sequential number to rows in the result set: https://msdn.microsoft.com/en-us/library/ms186734.aspx
What I don't understand is, if you want the same results as the view, why not just select from it?
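For illustration, paging directly over the view could look like the sketch below; the post never names the view, so dbo.TeamView is an assumed name, and the ORDER BY is a placeholder for the CASE-based sort from the proc:
SELECT *
FROM
(SELECT ROW_NUMBER() OVER (ORDER BY v.Personal_No) AS Row, -- swap in the CASE-based sort here
v.*
FROM dbo.TeamView v -- assumed name for the view
WHERE v.UserID LIKE N'%' + @Filter + '%'
)
AS TeamWithRowNumbers
WHERE Row >= (@CurrentPage - 1) * @PageSize + 1 AND Row <= @CurrentPage * @PageSize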

My solution was to convert the View to a make-table (ppoma_final) and then modify the ROW_NUMBER() query to the following. Works like a charm.
SELECT *
FROM
(SELECT ROW_NUMBER() OVER (
ORDER BY
CASE WHEN @SortDirection = 1 AND @SortField = 1 THEN ppoma_final.OBJID END DESC,
CASE WHEN @SortDirection = 1 AND @SortField = 2 THEN ppoma_final.OTYPE END ASC,
CASE WHEN @SortDirection = 2 AND @SortField = 2 THEN ppoma_final.OTYPE END DESC
)
AS Row,
[OBJID]
,[OTYPE]
,[STEXT]
,[LOW]
,[Depot_BP]
,[Personal_No]
,[UserID]
,[BP]
,[NAME_FIRST]
,[NAME_LAST]
,[id_fsa]
,[id_us_bp]
FROM dbo.ppoma_final
WHERE (dbo.ppoma_final.UserID LIKE N'%' + @Filter + '%') AND (NOT (dbo.ppoma_final.LOW = 'Service'))
)
AS TeamWithRowNumbers
WHERE Row >= (@CurrentPage - 1) * @PageSize + 1 AND Row <= @CurrentPage * @PageSize
SELECT @OutTotalRecCount = COUNT(*)
FROM dbo.ppoma_final
WHERE (dbo.ppoma_final.UserID LIKE N'%' + @Filter + '%') AND (NOT (dbo.ppoma_final.LOW = 'Service'))

Related

Is it always possible to transform multiple spatial selects with a while loop and variables into a single query without using temp tables in SQL?

This problem can be solved with a temp table; however, I don't want to use a temp table or a table variable. This question is mostly for my personal educational purposes.
I inherited the following SQL:
DECLARE @i int = 993
while @i <= 1000
begin
declare @lat nvarchar(20)
select top 1 @lat = SUBSTRING(Address, 0, CHARINDEX(',', Address, 0)) from dbo.rent
where id = @i;
declare @lon nvarchar(20)
select top 1 @lon = SUBSTRING(Address, CHARINDEX(',', Address) + 1, LEN(Address)) from dbo.rent
where id = @i
declare @p GEOGRAPHY = GEOGRAPHY::STGeomFromText('POINT(' + @lat + ' ' + @lon + ')', 4326)
select price/LivingArea sq_m, (price/LivingArea)/avg_sq_m, * from
(select (sum(price)/sum(LivingArea)) avg_sq_m, count(1) cnt, @i id from
(select *, GEOGRAPHY::STGeomFromText('POINT('+
convert(nvarchar(20), SUBSTRING(Address,0,CHARINDEX(',',Address,0)))+' '+
convert(nvarchar(20), SUBSTRING(Address,CHARINDEX(',',Address)+1,LEN(Address)))+')', 4326)
.STBuffer(500).STIntersects(@p) as [Intersects]
from dbo.rent
where Address is not null
) s
where [Intersects] = 1) prox
inner join dbo.rent r on prox.id = r.id
set @i = @i + 1
end
It is used to analyze property prices per square meter for properties in proximity, comparing them to see which ones are cheaper.
Problem: the calling mechanism has to be moved from C# to SQL and all queries have to be combined into a single result set (right now you get one row per while iteration), i.e. @i and @p have to go and become something like where id < x and id > y, or be joined in somehow.
The procedure is a cut-down version of the actual thing, but with a solution to the above I will have no problem making the whole thing work.
I am of the opinion that any SQL mechanism with variables and loops can be transformed into a single SQL statement, hence the question.
SqlFiddle
If I understand your question properly (remove the need for loops and return one data set), then you can use CTEs (Common Table Expressions) for the lat, lon and geography variables.
Your SQLFiddle was referencing a database called "webanalyser", so I removed that from the query below.
However, the query will not return anything, as the sample data has invalid data in the address column.
;WITH cteLatsLongs
AS(
SELECT
lat = SUBSTRING(Address, 0, CHARINDEX(',', Address, 0))
,lon = SUBSTRING(Address, CHARINDEX(',', Address) + 1, LEN(Address))
FROM dbo.rent
)
,cteGeogs
AS(
SELECT
Geog = GEOGRAPHY::STGeomFromText('POINT(' + LL.lat + ' ' + LL.lon + ')', 4326)
FROM cteLatsLongs LL
),cteIntersects
AS(
SELECT *,
GEOGRAPHY::STGeomFromText('POINT(' + CONVERT(NVARCHAR(20), SUBSTRING(Address, 0, CHARINDEX(',', Address, 0))) + ' ' + CONVERT(NVARCHAR(20), SUBSTRING(Address, CHARINDEX(',', Address) + 1, LEN(Address))) + ')', 4326).STBuffer(500).STIntersects(G.Geog) AS [Intersects]
FROM dbo.rent
CROSS APPLY cteGeogs G
)
SELECT avg_sq_m = (SUM(price) / SUM(LivingArea)), COUNT(1) cnt
FROM
cteIntersects I
WHERE I.[Intersects] = 1
It can be done. In this specific case, the 'discovery' that was necessary was the ability to perform JOINs on Point, e.g. the ability to join tables on proximity (another small cheat was to aggregate point-strings into actual points, but that's just an optimization). Once this is done, the query could be rewritten as follows:
SELECT adds.Url,
adds.Price/adds.LivingArea Sqm,
(adds.Price/adds.LivingArea)/k1.sale1Avg ratio,
*
FROM
(SELECT baseid,
count(k1Rent.rentid) rent1kCount,
sum(k1Rent.RperSqM)/(count(k1Rent.rentid)) AS rent1kAvgSqM,
count(around1k.SaleId) sale1kCount,
(sum(k1sale.price)/sum(k1Sale.LivingArea)) sale1Avg,
(sum(k1sale.price)/sum(k1Sale.LivingArea))/((sum(k1Rent.RperSqM)/(count(k1Rent.rentid)))*12) years --*
FROM
(SELECT sa.id baseid,
s.id saleid,
s.RoomCount,
POINT
FROM SpatialAnalysis sa
INNER JOIN Sale s ON s.Id = SaleId
WHERE sa.SalesIn1kRadiusCount IS NULL) AS base
JOIN SpatialAnalysis around1k ON base.Point.STBuffer(1000).STIntersects(around1k.Point) = 1
LEFT OUTER JOIN
(SELECT id rentid,
rc,
Price/avgRoomSize RperSqM
FROM
(SELECT *
FROM
(SELECT rc,
sum(avgArea*c)/sum(c) avgRoomSize
FROM
(SELECT roomcount rc,
avg(livingarea) avgArea,
count(1) c
FROM Rent
WHERE url LIKE '%systemname%'
AND LivingArea IS NOT NULL
GROUP BY RoomCount
UNION
(SELECT roomcount rc,
avg(livingarea) avgArea,
count(1) c
FROM sale
WHERE url LIKE '%systemname%'
AND LivingArea IS NOT NULL
GROUP BY RoomCount))uni
GROUP BY rc) avgRoom) avgrents
JOIN rent r ON r.RoomCount = avgrents.rc) k1Rent ON k1Rent.rentid =around1k.RentId
AND base.RoomCount = k1Rent.rc
LEFT OUTER JOIN Sale k1Sale ON k1Sale.Id = around1k.SaleId
AND base.RoomCount = k1Sale.RoomCount
GROUP BY baseid) k1
left outer join SpatialAnalysis sp on sp.Id = baseid
left outer join Sale adds on adds.Id = sp.SaleId
where adds.Price < 250000
order by years, ratio

Counting number of Populated Columns by Row

I wanted to know if there is a way of counting the number of populated columns per row of a table.
For example, if I have the simple table below, called Customer:
Name          Customer     DOB        Order number    Populated Columns
ABC Ltd       Jo Blogg     2/1/78     123             3
Umbrella Co   A Sherman               232             2
Nike                       14/5/98                    1
What I want is a query which will give me an extra column with a number saying how many columns have a value in them.
Any ideas?
This can be done via a trivial check for NULL (and for empty strings in character columns):
SELECT
[Name]
, [Customer]
, [DOB]
, [Order number]
, CASE WHEN ISNULL([Name], '') != '' THEN 1 ELSE 0 END
+ CASE WHEN ISNULL([Customer], '') != '' THEN 1 ELSE 0 END
+ CASE WHEN [DOB] IS NOT NULL THEN 1 ELSE 0 END
+ CASE WHEN [Order number] IS NOT NULL THEN 1 ELSE 0 END AS [Populated Columns]
FROM Customer
This will work nicely for a fixed and known number of columns.
The approach can be made more universal if the column list is fetched from the metadata; as a downside, this requires dynamic SQL.
Below is an example for SQL Server 2017 and higher:
DECLARE @_SQL NVARCHAR(max)
DECLARE @_TableName sysname = 'Table1'
SELECT @_SQL =
'SELECT '
+ STRING_AGG(QUOTENAME(COLUMN_NAME), ',
')
+ ', '
+ STRING_AGG('
CASE WHEN ['+COLUMN_NAME+'] IS NOT NULL THEN 1 ELSE 0 END', ' +')
+ ' AS [Populated Columns]
FROM ' + QUOTENAME(MIN(TABLE_SCHEMA)) + '.' + QUOTENAME(MIN(TABLE_NAME))
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @_TableName
EXEC sys.sp_executesql @_SQL
It will generate and execute code like this:
SELECT
[Col1],
[Col2],
[Col3],
CASE WHEN [Col1] IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN [Col2] IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN [Col3] IS NOT NULL THEN 1 ELSE 0 END AS [Populated Columns]
FROM [dbo].[Table1]
In older versions, the same result is achievable with other string-aggregation workarounds, such as FOR XML PATH with STUFF, or SQLCLR functions.
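For instance, a sketch of the same dynamic SELECT built with the classic FOR XML PATH plus STUFF workaround (pre-2017; it assumes column names contain no XML-special characters):
DECLARE @_SQL NVARCHAR(MAX)
DECLARE @_TableName sysname = 'Table1'
SELECT @_SQL =
'SELECT '
-- build the column list: strip the leading comma with STUFF
+ STUFF((SELECT ',' + QUOTENAME(COLUMN_NAME)
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @_TableName
FOR XML PATH('')), 1, 1, '')
+ ', '
-- build the summed CASE expressions: strip the leading ' + '
+ STUFF((SELECT ' + CASE WHEN ' + QUOTENAME(COLUMN_NAME) + ' IS NOT NULL THEN 1 ELSE 0 END'
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @_TableName
FOR XML PATH('')), 1, 3, '')
+ ' AS [Populated Columns]
FROM ' + QUOTENAME(MIN(TABLE_SCHEMA)) + '.' + QUOTENAME(MIN(TABLE_NAME))
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @_TableName
EXEC sys.sp_executesql @_SQL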
Just thought of sharing another approach using UNPIVOT to calculate the same thing, assuming that you have a primary key/identity column in your table.
declare @tmp table (id int, [Name] varchar(100), Customer varchar(100), dob datetime, orderno int)
insert into @tmp select 1, 'name1', 'c1', getdate(), 123
insert into @tmp select 2, 'name2', null, getdate(), 123
insert into @tmp select 3, 'name3', null, null, null
SELECT t.*,
t1.notpopulated
FROM @tmp t
INNER JOIN (SELECT 4 - Count(*) AS NotPopulated,
id
FROM
(SELECT id,
u.x,
u.y
FROM (SELECT id,
Cast([name]AS VARCHAR(100)) [name],
Cast(customer AS VARCHAR(100)) AS customer,
Cast(dob AS VARCHAR(100)) AS dob1,
Cast(orderno AS VARCHAR(100)) orderno
FROM @tmp) AS s
UNPIVOT ( [y]
FOR [x] IN ([name],
[Customer],
dob1,
[orderno]) ) u) t
GROUP BY id) t1
ON t1.id = t.id
Online Demo
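A more compact alternative, sketched against the same @tmp table variable, is to unpivot in place with CROSS APPLY (VALUES ...) and let COUNT skip the NULLs:
SELECT t.*,
p.populated
FROM @tmp t
CROSS APPLY (SELECT COUNT(v.val) AS populated -- COUNT(column) ignores NULLs
FROM (VALUES (CAST(t.[Name] AS VARCHAR(100))),
(t.Customer),
(CONVERT(VARCHAR(100), t.dob, 120)),
(CAST(t.orderno AS VARCHAR(100)))) v(val)) p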

Alternative to OFFSET in SQL Server

I have the stored procedure below in SQL Server 2016, where it works fine.
Now I need to create the same sp in SQL Server 2008, and I am getting this error:
Msg 102, Level 15, State 1, Procedure GetEmployees, Line 41 [Batch Start Line 0] Incorrect syntax near 'OFFSET'.
Msg 153, Level 15, State 2, Procedure GetEmployees, Line 42 [Batch Start Line 0] Invalid usage of the option NEXT in the FETCH statement.
How can I modify the proc so that it runs on SQL Server 2008 as well?
--dbo.GetEmployees '',2,2
CREATE PROCEDURE [dbo].[GetEmployees]
(
@SearchValue NVARCHAR(50) = '',
@PageNo INT = 0,
@PageSize INT = 10,
@SortColumn NVARCHAR(20) = 'Name',
@SortOrder NVARCHAR(20) = 'ASC'
)
AS BEGIN
SET NOCOUNT ON;
if @PageNo < 0 set @PageNo = 0
set @PageNo = @PageNo + 1
SET @SearchValue = LTRIM(RTRIM(@SearchValue))
SET @SearchValue = NULLIF(@SearchValue, '')
; WITH CTE_Results AS
(
SELECT EmployeeID, Name, City from tblEmployee
WHERE (@SearchValue IS NULL OR Name LIKE '%' + @SearchValue + '%')
ORDER BY
CASE WHEN (@SortColumn = 'EmployeeID' AND @SortOrder='ASC')
THEN EmployeeID
END ASC,
CASE WHEN (@SortColumn = 'EmployeeID' AND @SortOrder='DESC')
THEN EmployeeID
END DESC,
CASE WHEN (@SortColumn = 'Name' AND @SortOrder='ASC')
THEN Name
END ASC,
CASE WHEN (@SortColumn = 'Name' AND @SortOrder='DESC')
THEN Name
END DESC,
CASE WHEN (@SortColumn = 'City' AND @SortOrder='ASC')
THEN City
END ASC,
CASE WHEN (@SortColumn = 'City' AND @SortOrder='DESC')
THEN City
END DESC
OFFSET @PageSize * (@PageNo - 1) ROWS
FETCH NEXT @PageSize ROWS ONLY
),
CTE_TotalRows AS
(
select count(EmployeeID) as MaxRows from tblEmployee WHERE (@SearchValue IS NULL OR Name LIKE '%' + @SearchValue + '%')
)
Select MaxRows TotalRecords, t.EmployeeID, t.Name, t.City,t.Department,t.Gender from dbo.tblEmployee as t, CTE_TotalRows
WHERE EXISTS (SELECT 1 FROM CTE_Results WHERE CTE_Results.EmployeeID = t.EmployeeID)
OPTION (RECOMPILE)
END
You need a ROW_NUMBER() window function, and in its OVER clause you put your entire sorting expression. Note that I've created another CTE for readability, but you could get the same thing done with just a subquery.
The formatted code for the SELECT statement would be the following:
WITH CTE_Rownums AS (
SELECT
EmployeeID,
Name,
City,
row_number() over ( ORDER BY ... ) as rn -- put your entire ORDER BY here
FROM tblEmployee
WHERE
@SearchValue IS NULL
OR Name LIKE '%' + @SearchValue + '%'
), CTE_Results AS (
SELECT EmployeeID, Name, City
FROM CTE_Rownums
WHERE
rn > @PageSize * (@PageNo - 1)
AND rn <= @PageSize * @PageNo
), CTE_TotalRows AS (
SELECT count(EmployeeID) as MaxRows
FROM tblEmployee
WHERE
@SearchValue IS NULL
OR Name LIKE '%' + @SearchValue + '%'
)
SELECT MaxRows TotalRecords, t.EmployeeID, t.Name, t.City,t.Department,t.Gender
FROM dbo.tblEmployee as t
CROSS JOIN CTE_TotalRows
WHERE EXISTS (
SELECT 1
FROM CTE_Results
WHERE CTE_Results.EmployeeID = t.EmployeeID
)
OPTION (RECOMPILE)
In the last SELECT I've replaced the old-style comma-separated join with an explicit CROSS JOIN.
If you use SQL Server 2008 R2 or older you can't use OFFSET ... FETCH; the alternative is to use ROW_NUMBER() and rewrite your query. For example, this query with OFFSET:
SELECT Price
FROM dbo.Inventory
ORDER BY Price OFFSET 10 ROWS FETCH NEXT 5 ROWS ONLY
can be written without OFFSET using ROW_NUMBER():
SELECT Price
FROM
(
SELECT Price,
ROW_NUMBER() OVER (ORDER BY Price) AS Seq
FROM dbo.Inventory
)t
WHERE Seq BETWEEN 11 AND 15
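To generalize the hardcoded 11-15 window to the paging parameters of the procedure above, the same pattern becomes (a sketch reusing @PageNo and @PageSize):
SELECT Price
FROM
(
SELECT Price,
ROW_NUMBER() OVER (ORDER BY Price) AS Seq
FROM dbo.Inventory
)t
WHERE Seq BETWEEN @PageSize * (@PageNo - 1) + 1 AND @PageSize * @PageNo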

return value at a position from STRING_SPLIT in SQL Server 2016

Can I return a value at a particular position with the STRING_SPLIT function in SQL Server 2016 or higher?
I know the order from a select is not guaranteed, but is it with STRING_SPLIT?
DROP TABLE IF EXISTS #split
SELECT 'z_y_x' AS splitIt
INTO #split UNION
SELECT 'a_b_c'
SELECT * FROM #split;
WITH cte
AS (
SELECT ROW_NUMBER() OVER ( PARTITION BY s.splitIt ORDER BY s.splitIt ) AS position,
s.splitIt,
value
FROM #split s
CROSS APPLY STRING_SPLIT(s.splitIt, '_')
)
SELECT * FROM cte WHERE position = 2
Will this always return the value at the 2nd element? b for a_b_c and y for z_y_x?
I don't understand why Microsoft doesn't return a position indicator column alongside the value for this function.
There is - starting with v2016 - a solution via FROM OPENJSON():
DECLARE @str VARCHAR(100) = 'val1,val2,val3';
SELECT *
FROM OPENJSON('["' + REPLACE(@str,',','","') + '"]');
The result
key value type
0 val1 1
1 val2 1
2 val3 1
The documentation states clearly:
When OPENJSON parses a JSON array, the function returns the indexes of the elements in the JSON text as keys.
For your case this would be:
SELECT 'z_y_x' AS splitIt
INTO #split UNION
SELECT 'a_b_c'
DECLARE @delimiter CHAR(1)='_';
SELECT *
FROM #split
CROSS APPLY OPENJSON('["' + REPLACE(splitIt,@delimiter,'","') + '"]') s
WHERE s.[key]=1; --zero based
Let's hope that future versions of STRING_SPLIT() will include this information.
UPDATE: Performance tests, comparing against the popular Jeff Moden splitter
Try this out:
USE master;
GO
CREATE DATABASE dbTest;
GO
USE dbTest;
GO
--Jeff Moden's splitter
CREATE FUNCTION [dbo].[DelimitedSplit8K](@pString VARCHAR(8000), @pDelimiter CHAR(1))
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (
SELECT TOP (ISNULL(DATALENGTH(@pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (
SELECT 1 UNION ALL
SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(@pString,t.N,1) = @pDelimiter
),
cteLen(N1,L1) AS(
SELECT s.N1,
ISNULL(NULLIF(CHARINDEX(@pDelimiter,@pString,s.N1),0)-s.N1,8000)
FROM cteStart s
)
SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
Item = SUBSTRING(@pString, l.N1, l.L1)
FROM cteLen l
;
GO
--Avoid first call bias
SELECT * FROM dbo.DelimitedSplit8K('a,b,c',',');
GO
--Table to keep the results
CREATE TABLE Results(ID INT IDENTITY,ResultSource VARCHAR(100),durationMS INT, RowsCount INT);
GO
--Table with strings to split
CREATE TABLE dbo.DelimitedItems(ID INT IDENTITY,DelimitedNString nvarchar(4000),DelimitedString varchar(8000));
GO
--Get rows with randomly mixed strings of 100 items
--Try to play with the count of rows (count behind GO) and the count with TOP
INSERT INTO DelimitedItems(DelimitedNString)
SELECT STUFF((
SELECT TOP 100 ','+REPLACE(v.[name],',',';')
FROM master..spt_values v
WHERE LEN(v.[name])>0
ORDER BY NewID()
FOR XML PATH('')),1,1,'')
--Keep it twice in varchar and nvarchar
UPDATE DelimitedItems SET DelimitedString=DelimitedNString;
GO 500 --create 500 differently mixed rows
--The tests
DECLARE @d DATETIME2;
SET @d = SYSUTCDATETIME();
SELECT DI.ID, DS.Item, DS.ItemNumber
INTO #TEMP
FROM dbo.DelimitedItems DI
CROSS APPLY dbo.DelimitedSplit8K(DI.DelimitedNString,',') DS;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'delimited8K with NVARCHAR(4000)'
,(SELECT COUNT(*) FROM #TEMP) AS RowCountInTemp
,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_NV_ms_delimitedSplit8K
SET @d = SYSUTCDATETIME();
SELECT DI.ID, DS.Item, DS.ItemNumber
INTO #TEMP2
FROM dbo.DelimitedItems DI
CROSS APPLY dbo.DelimitedSplit8K(DI.DelimitedString,',') DS;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'delimited8K with VARCHAR(8000)'
,(SELECT COUNT(*) FROM #TEMP2) AS RowCountInTemp
,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_V_ms_delimitedSplit8K
SET @d = SYSUTCDATETIME();
SELECT DI.ID, OJ.[Value] AS Item, OJ.[Key] AS ItemNumber
INTO #TEMP3
FROM dbo.DelimitedItems DI
CROSS APPLY OPENJSON('["' + REPLACE(DI.DelimitedNString,',','","') + '"]') OJ;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'OPENJSON with NVARCHAR(4000)'
,(SELECT COUNT(*) FROM #TEMP3) AS RowCountInTemp
,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_NV_ms_OPENJSON
SET @d = SYSUTCDATETIME();
SELECT DI.ID, OJ.[Value] AS Item, OJ.[Key] AS ItemNumber
INTO #TEMP4
FROM dbo.DelimitedItems DI
CROSS APPLY OPENJSON('["' + REPLACE(DI.DelimitedString,',','","') + '"]') OJ;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'OPENJSON with VARCHAR(8000)'
,(SELECT COUNT(*) FROM #TEMP4) AS RowCountInTemp
,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_V_ms_OPENJSON
GO
SELECT * FROM Results;
GO
--Clean up
DROP TABLE #TEMP;
DROP TABLE #TEMP2;
DROP TABLE #TEMP3;
DROP TABLE #TEMP4;
USE master;
GO
DROP DATABASE dbTest;
Results:
200 items in 500 rows
1220 delimited8K with NVARCHAR(4000)
274 delimited8K with VARCHAR(8000)
417 OPENJSON with NVARCHAR(4000)
443 OPENJSON with VARCHAR(8000)
100 items in 500 rows
421 delimited8K with NVARCHAR(4000)
140 delimited8K with VARCHAR(8000)
213 OPENJSON with NVARCHAR(4000)
212 OPENJSON with VARCHAR(8000)
100 items in 5 rows
10 delimited8K with NVARCHAR(4000)
5 delimited8K with VARCHAR(8000)
3 OPENJSON with NVARCHAR(4000)
4 OPENJSON with VARCHAR(8000)
5 items in 500 rows
32 delimited8K with NVARCHAR(4000)
30 delimited8K with VARCHAR(8000)
28 OPENJSON with NVARCHAR(4000)
24 OPENJSON with VARCHAR(8000)
--unlimited length (only possible with OPENJSON)
--Without a TOP clause while filling
--results in about 500 items in 500 rows
1329 OPENJSON with NVARCHAR(4000)
1117 OPENJSON with VARCHAR(8000)
Conclusion:
the popular splitter function does not like NVARCHAR
the function is limited to strings within 8K bytes
Only the case with many items and many rows in VARCHAR lets the splitter function come out ahead.
In all other cases OPENJSON seems to be more or less faster...
OPENJSON can deal with (almost) unlimited counts
OPENJSON requires v2016
Everybody is waiting for STRING_SPLIT with the position
UPDATE: Added STRING_SPLIT to the test
In the meantime I re-ran the test with two more test sections using STRING_SPLIT(). As the position I had to return a hardcoded value, since this function does not return the part's index.
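A sketch of one of the added sections, following the same pattern as the tests above (the temp table name #TEMP5 is assumed):
SET @d = SYSUTCDATETIME();
SELECT DI.ID, SS.[value] AS Item, 0 AS ItemNumber --hardcoded: STRING_SPLIT returns no index
INTO #TEMP5
FROM dbo.DelimitedItems DI
CROSS APPLY STRING_SPLIT(DI.DelimitedNString,',') SS;
INSERT INTO Results(ResultSource,RowsCount,durationMS)
SELECT 'STRING_SPLIT with NVARCHAR(4000)'
,(SELECT COUNT(*) FROM #TEMP5) AS RowCountInTemp
,DATEDIFF(MILLISECOND,@d,SYSUTCDATETIME()) AS Duration_NV_ms_STRING_SPLIT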
In all tested cases OPENJSON was close to STRING_SPLIT and often faster:
5 items in 1000 rows
250 delimited8K with NVARCHAR(4000)
124 delimited8K with VARCHAR(8000) --this function is best with many rows in VARCHAR
203 OPENJSON with NVARCHAR(4000)
204 OPENJSON with VARCHAR(8000)
235 STRING_SPLIT with NVARCHAR(4000)
234 STRING_SPLIT with VARCHAR(8000)
200 items in 30 rows
140 delimited8K with NVARCHAR(4000)
31 delimited8K with VARCHAR(8000)
47 OPENJSON with NVARCHAR(4000)
31 OPENJSON with VARCHAR(8000)
47 STRING_SPLIT with NVARCHAR(4000)
31 STRING_SPLIT with VARCHAR(8000)
100 items in 10,000 rows
8145 delimited8K with NVARCHAR(4000)
2806 delimited8K with VARCHAR(8000) --fast with many rows!
5112 OPENJSON with NVARCHAR(4000)
4501 OPENJSON with VARCHAR(8000)
5028 STRING_SPLIT with NVARCHAR(4000)
5126 STRING_SPLIT with VARCHAR(8000)
The simple answer is: no. Microsoft has so far declined to provide the ordinal position as part of the return dataset in STRING_SPLIT. You'll need to use a different solution, I'm afraid, for example Jeff Moden's DelimitedSplit8K.
(Yes, I realise this is more or less a link-only answer; however, pasting Jeff's solution here would effectively be plagiarism.)
If you were to use Jeff's solution, then you would be able to do something like:
SELECT *
FROM dbo.DelimitedSplit8K('a,b,c,d,e,f,g,h,i,j,k',',') DS
WHERE ItemNumber = 2;
Of course, you'd likely be passing a column rather than a literal string.
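Passing a column, for instance, would look something like this (the table and column names are made up for illustration):
SELECT st.Id, DS.Item
FROM dbo.SomeTable st -- hypothetical table with a delimited column
CROSS APPLY dbo.DelimitedSplit8K(st.SomeCsvColumn, ',') DS
WHERE DS.ItemNumber = 2;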
I just extended @Shnugo's answer to use STRING_ESCAPE, in case the split text contains line breaks, Unicode, or other non-JSON-compatible characters.
My test code, with a pipe as the separator instead of a comma:
DECLARE @Separator VARCHAR(5) = STRING_ESCAPE('|', 'json'); -- here a pipe, or use any other separator (even ones escaped by JSON)
DECLARE @LongText VARCHAR(MAX) = 'Albert says: "baby, listen!"|ve Çağrı söylüyor: "Elma"|1st Line' + CHAR(13) + CHAR(10) + '2nd line';
SELECT * FROM OPENJSON('["' + REPLACE(STRING_ESCAPE(@LongText, 'json'), @Separator ,'","') + '"]'); -- ok
-- SELECT * FROM OPENJSON('["' + REPLACE(@LongText, @Separator ,'","') + '"]'); -- fails with: JSON text is not properly formatted. ...
Updated due to comment from Simon Zeinstra
I didn't want to deal with OPENJSON, but still wanted to get a string_split() value by index.
Performance was not an issue in my case.
I used a CTE (Common Table Expression).
Assume you have the string str = "part1 part2 part3".
WITH split_res_list as
(
SELECT value FROM STRING_SPLIT('part1 part2 part3', ' ')
),
split_res_list_with_index as
(
SELECT [value],
ROW_NUMBER() OVER (ORDER BY [value] ASC) as [RowNumber]
FROM split_res_list
)
SELECT * FROM split_res_list_with_index WHERE RowNumber = 2
BUT: please be aware that the order of the 3 parts is changed according to the ORDER BY condition!
The output for the second row is the "part2" value.
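If you need the parts in their original order instead, one option (a sketch; it assumes the parts within one string are distinct) is to order by each part's position in the source string via CHARINDEX:
WITH split_res_list_with_index as
(
SELECT [value],
ROW_NUMBER() OVER (ORDER BY CHARINDEX(' ' + [value] + ' ', ' ' + 'part1 part2 part3' + ' ')) as [RowNumber]
FROM STRING_SPLIT('part1 part2 part3', ' ')
)
SELECT * FROM split_res_list_with_index WHERE RowNumber = 2 -- returns 'part2'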
Using STRING_SPLIT:
STRING_SPLIT ( string , separator [ , enable_ordinal ] )
enable_ordinal
An int or bit expression that serves as a flag to enable or disable the ordinal output column. A value of 1 enables the ordinal column. If enable_ordinal is omitted, NULL, or has a value of 0, the ordinal column is disabled.
The enable_ordinal argument and ordinal output column are currently only supported in Azure SQL Database, Azure SQL Managed Instance, and Azure Synapse Analytics (serverless SQL pool only).
Query:
SELECT value FROM STRING_SPLIT('part1_part2_part3', '_', 1) WHERE ordinal = 2;
Here is my workaround. I will keep following the question while waiting for a better answer:
UPDATE: The original code did not take into account the case where one word contains another.
UPDATE 2: Performance was horrible in production, so I had to think of another way. You have it at the end as option 2, an implementation for a table.
UPDATE 3: Added code for a UDF in the implementation on a string.
Implementation in a string:
declare @a as nvarchar(100) = 'Lorem ipsum dolor dol ol sit amet. D Lorem DO ipsum DOL dolor sit amet. DOLORES ipsum';
WITH T AS (
SELECT T1.value
,charindex(' ' + T1.value + ' ',' ' + @a + ' ' ,0) AS INDX
,RN = ROW_NUMBER() OVER (PARTITION BY value order BY value)
FROM STRING_SPLIT(@a, ' ') AS T1
WHERE T1.value <> ''
),
R (VALUE,INDX,RN) AS (
SELECT *
FROM T
WHERE T.RN = 1
UNION ALL
SELECT T.VALUE
,charindex(' ' + T.value + ' ',' ' + @a + ' ',R.INDX + 1) AS INDX
,T.RN
FROM T
JOIN R
ON T.value = R.VALUE
AND T.RN = R.RN + 1
)
SELECT * FROM R ORDER BY INDX
Result: (screenshot of the result table omitted)
UDF:
CREATE FUNCTION DBO.UDF_get_word(@string nvarchar(100), @wordNumber int)
returns nvarchar(100)
AS
BEGIN
DECLARE @searchedWord nvarchar(100);
WITH T AS (
SELECT T1.value
,charindex(' ' + T1.value + ' ',' ' + @string + ' ' ,0) AS INDX
,RN = ROW_NUMBER() OVER (PARTITION BY value order BY value)
FROM STRING_SPLIT(@string, ' ') AS T1
WHERE T1.value <> ''
),
R (VALUE,INDX,RN) AS (
SELECT *
FROM T
WHERE T.RN = 1
UNION ALL
SELECT T.VALUE
,charindex(' ' + T.value + ' ',' ' + @string + ' ',R.INDX + 1) AS INDX
,T.RN
FROM T
JOIN R
ON T.value = R.VALUE
AND T.RN = R.RN + 1
)
SELECT @searchedWord = (value) FROM ( SELECT *, ORD = ROW_NUMBER() OVER (ORDER BY INDX) FROM R ) AS TBL WHERE ORD = @wordNumber
RETURN @searchedWord
END
GO
Modification for a column in a table, OPTION 1:
WITH T AS (
SELECT T1.stringToBeSplit
,T1.column1 --column1 is an example of a column where stringToBeSplit is the same for more than one record; better to avoid, but if you need it, add it here and follow column1 through the code
,T1.column2
,T1.value
,T1.column3
/*,...any other column*/
,charindex(' ' + T1.value + ' ',' ' + T1.stringToBeSplit + ' ' ,0) AS INDX
,RN = ROW_NUMBER() OVER (PARTITION BY T1.column1, T1.stringToBeSplit, T1.value order BY T1.column1, T1.stringToBeSplit, T1.value) --any column that creates duplicates needs to be added here; as an example I added column1
FROM (SELECT TOP 10 * FROM YourTable D CROSS APPLY string_split(D.stringToBeSplit,' ')) AS T1
WHERE T1.value <> ''
),
R (stringToBeSplit, column1, column2, value, column3, INDX, RN) AS (
SELECT stringToBeSplit, column1, column2, value, column3, INDX, RN
FROM T
WHERE T.RN = 1
UNION ALL
SELECT T.stringToBeSplit, T.column1, column2, T.value, T.column3
,charindex(' ' + T.value + ' ',' ' + T.stringToBeSplit + ' ',R.INDX + 1) AS INDX
,T.RN
FROM T
JOIN R
ON T.value = R.VALUE AND T.COLUMN1 = R.COLUMN1 --any column that creates duplicates needs to be added here; as an example I added column1
AND T.RN = R.RN + 1
)
SELECT * FROM R ORDER BY column1, stringToBeSplit, INDX
Modification for a column in a table, OPTION 2 (max performance i could get, main action came from removing the join and finding a way of properly execute (and stop) the recursive loop of the CTE, from 1.30 for 1000 lines to 2 sec for 30K lines of strings of similar type and length):
WITH T AS (
SELECT T1.stringToBeSplit --no extracolumns this time
,T1.value
,charindex(' ' + T1.value + ' ',' ' + T1.stringToBeSplit + ' ' ,0) AS INDX
,RN = ROW_NUMBER() OVER (PARTITION BY T1.stringToBeSplit,T1.value order BY T1.stringToBeSplit,T1.value) --from clause use distinct and where if possible
FROM (SELECT DISTINCT stringToBeSplit, VALUE FROM [your table] D CROSS APPLY string_split(D.stringToBeSplit,' ') WHERE [your filter]) AS T1
WHERE T1.value <> ''
),
R (stringToBeSplit, value, INDX, RN) AS (
SELECT stringToBeSplit, value, INDX, RN
FROM T
WHERE T.RN = 1
UNION ALL
SELECT R.stringToBeSplit, R.value
,charindex(' ' + R.value + ' ',' ' + R.stringToBeSplit + ' ',R.INDX + 1) AS INDX
,R.RN + 1
FROM R
WHERE charindex(' ' + R.value + ' ',' ' + R.stringToBeSplit + ' ',R.INDX + 1) <> 0
)
SELECT * FROM R ORDER BY stringToBeSplit, INDX
For getting the word ordinal, instead of SELECT * FROM R use:
SELECT stringToBeSplit ,value , ROW_NUMBER() OVER (PARTITION BY stringToBeSplit order BY [indX]) AS ORD FROM R
If instead of having one row per word you prefer one column per word:
select * FROM (SELECT [name 1],value , ROW_NUMBER() OVER (PARTITION BY [name 1] order BY [indX]) AS ORD FROM R ) as R2
pivot (MAX(VALUE) FOR ORD in ([1],[2],[3]) ) AS PIV
If you don't want to specify the number of columns, use QUOTENAME() as in this link; in my case I only need the first 4 words, the rest are irrelevant for the moment. Below is the code from the page in case the link fails:
DECLARE
@columns NVARCHAR(MAX) = '',
@sql NVARCHAR(MAX) = '';
-- select the category names
SELECT
@columns += QUOTENAME(category_name) + ','
FROM
production.categories
ORDER BY
category_name;
-- remove the last comma
SET @columns = LEFT(@columns, LEN(@columns) - 1);
-- construct dynamic SQL
SET @sql ='
SELECT * FROM
(
SELECT
category_name,
model_year,
product_id
FROM
production.products p
INNER JOIN production.categories c
ON c.category_id = p.category_id
) t
PIVOT(
COUNT(product_id)
FOR category_name IN ('+ @columns +')
) AS pivot_table;';
-- execute the dynamic SQL
EXECUTE sp_executesql @sql;
Last but not least, I'm really looking forward to finding out whether there is an easier way with the same performance, either in SQL Server or in C#. I just think that everything that does not use external info should stay in the server and run as a query or batch, but I'm honestly not sure, as I've heard the contrary (especially from people who use pandas), though no one has convinced me just yet.
This works. Example:
String = "pos1-pos2-pos3"
REVERSE(PARSENAME(REPLACE(REVERSE(String), '-', '.'), 1))
With 1 it returns "pos1"; with 2 it will return "pos2", and so on.
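A runnable demo; note that PARSENAME handles at most four parts, so this trick only works for strings with up to four segments and no embedded periods:
DECLARE @s VARCHAR(50) = 'pos1-pos2-pos3';
SELECT REVERSE(PARSENAME(REPLACE(REVERSE(@s), '-', '.'), 1)) AS part1 -- 'pos1'
,REVERSE(PARSENAME(REPLACE(REVERSE(@s), '-', '.'), 2)) AS part2 -- 'pos2'
,REVERSE(PARSENAME(REPLACE(REVERSE(@s), '-', '.'), 3)) AS part3 -- 'pos3'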

selecting random records inside a UDF

I'm writing a function in SQL Server 2012.
I came to know that we cannot use the RAND() or NEWID() functions in the select statement of a function in SQL Server.
My function goes like this:
CREATE FUNCTION Keywordsuggester (@userid INT)
returns @suggestor_tab TABLE (
keywordid INT,
keywordname VARCHAR(max),
keywordcategory VARCHAR(max))
AS
BEGIN
DECLARE @category_table TABLE
(
category_name VARCHAR(max),
category_id INT,
rownum INT
)
DECLARE @ID INT = 1
DECLARE @COUNT INT = 0
DECLARE @I INT = 1
INSERT INTO @category_table
SELECT kc.NAME,
kc.id,
k.NAME,
d.NAME,
Row_number()
OVER(
ORDER BY d.id ASC) AS rownum
FROM dtypes d
JOIN keywords k
ON d.NAME LIKE '%' + k.NAME + '%'
JOIN keywordscategory kc
ON k.categoryid = kc.id
WHERE d.userid = @userid
SELECT @count = rownum
FROM @category_table
WHILE @count > @I
BEGIN
INSERT INTO @suggestor_tab
SELECT TOP 5 kc.id,
k.NAME,
kc.NAME
FROM kwords k
JOIN @category_table ct
ON k.categoryid = ct.category_id
JOIN kwcategory kc
ON kc.NAME = ct.category_name
WHERE ct.rownum = @I
--Here I'm inserting the top 5 records for each category into @suggestor_tab; instead I need to insert 5 random records for each category (i.e., @I)
SET @I = @I + 1
--return
END
INSERT INTO @suggestor_tab
SELECT kc.id,
k.NAME,
kc.NAME
FROM kwords k
JOIN @category_table ct
ON k.categoryid = ct.category_id
JOIN kwcategory kc
ON kc.NAME = category_name
RETURN
END
How can I get random records for each category (i.e., @I in the while loop)?
I tried a query like:
SELECT TOP 5 k.NAME
FROM kwords k
JOIN @category_table ct
ON k.category_id = ct.id
JOIN kwcategory kc
ON kc.NAME = category_name
WHERE ct.rownum = @I
ORDER BY Newid()
which throws an error like:
Invalid use of a side-effecting operator 'newid' within a function.
Is there anyway to do this?
Thanks in advance.
You cannot use non-deterministic functions inside a UDF.
Create a view and use it in the ORDER BY:
create view Random
as
select newid() as New_id
Change your select to something like this:
SELECT TOP 5 k.NAME
FROM KWords k
JOIN @category_table ct
ON k.category_id = ct.id
JOIN kwcategory kc
ON kc.NAME = category_name
WHERE ct.rownum = @I
ORDER BY (SELECT new_id
FROM random)
