combining groups in sql containing substrings - sql-server

I apologize in advance for not explaining this very well.
I have a sql database with some data like this:
column1 | groups
3323052 | 3323052,3324794,3324795
3324794 | 3323052,3324794
3324794 | 3324794
3324794 | 3324794,3763369
3353586 | 3353586
3763369 | 3324794,3763369
I want to combine groups so that if a number is in two groups, the groups will combine and the number will only show up once in the list.
For example, the final result would look like this:
groups
3323052,3324794,3324795,3763369
3353586
I have been googling around without much luck. Any help is greatly appreciated.
Thanks.

So you want to recursively replace any items in groups -column with any values found from other rows with that value in column1? At least you can do it this way:
Split the data into rows, so there's just column1 -> group relation
Fetch any values that can be used as root nodes, my approach takes the smallest value because your data has a circle (3323052 -> 3324794 -> 3323052)
Fetch recursively all the value that can be found from the hierarchy under these root nodes
Put it back together into the original format
This example uses DelimitedSplit8k by Jeff Moden:
-- Step 1:
select distinct
d.column1,
convert(int, s.Item) as item
into #tmp
from
data d
cross apply DelimitedSplit8k(d.groups, ',') s
-- Step 2:
select distinct
column1
into #root
from #tmp t1
where not exists
(select 1 from #tmp t2 where t2.item = t1.column1 and t2.item > t2.column1)
-- Step 3:
;with CTE (root, parent, child) as (
select r.column1, r.column1, r.column1 from #root r
union all
select C.root, t.column1, t.item
from CTE C join #tmp t on t.column1 = C.child and t.item > C.parent
)
select distinct * into #results from CTE
-- Step 4:
SELECT r.column1, STUFF((SELECT distinct ', ' + convert(varchar(50), r2.child)
FROM #results r2
WHERE r2.root = r.column1
ORDER BY ', ' + convert(varchar(50), r2.child)
FOR XML PATH(N'')), 1, 2, '') as groups
FROM #root r
GROUP BY column1
ORDER BY column1
Result:
column1 groups
3323052 3323052, 3324794, 3324795, 3763369
3353586 3353586
I used temp. tables to be sure each of the steps is executed just once, but I believe it would be possible to do the whole thing with just one select and using CTEs instead of temp tables.
You can test this in SQL Fiddle

Related

Matching string with LEVENSHTEIN algorithm

create table tbl1
(
name varchar(50)
);
insert into tbl1 values ('Mircrosoft SQL Server'),
('Office Microsoft');
create table tbl2
(
name varchar(50)
);
insert into tbl2 values ('SQL Server Microsoft'),
('Microsoft Office');
I want to get the percentage of matching string between two tables column name.
I tried with LEVENSHTEIN algorithm. But what I want to achieve from given data is same between the tables but with different sequence so I want to see the output as 100% matching.
Tried: LEVENSHTEIN
SELECT [dbo].[GetPercentageOfTwoStringMatching](a.name , b.name) MatchedPercentage,a.name as tbl1_name,b.name as tbl2_name
FROM tbl1 a
CROSS JOIN tbl2 b
WHERE [dbo].[GetPercentageOfTwoStringMatching](a.name , b.name) >= 0;
Result:
MatchedPercentage tbl1_name tbl2_name
-----------------------------------------------------------------
5 Mircrosoft SQL Server SQL Server Microsoft
10 Office Microsoft SQL Server Microsoft
15 Mircrosoft SQL Server Microsoft Office
13 Office Microsoft Microsoft Office
As mentioned in the comments this can be achieved through the use of a string split table valued function. Personally I use one based on the very performant set-based tally table approach put together by Jeff Moden which is at the end of my answer.
Using this function allows you to compare the individual words as delimited by a space character and count up the number of matches compared to the total number of words in the two values.
Do note however that this solution falls over on any values with leading spaces. If this will be a problem, clean your data before running this script or adjust to handle them:
declare #t1 table(v nvarchar(50));
declare #t2 table(v nvarchar(50));
insert into #t1 values('Microsoft SQL Server'),('Office Microsoft'),('Other values'); -- Add in some extra values, with the same number of words and some with the same number of characters
insert into #t2 values('SQL Server Microsoft'),('Microsoft Office'),('that matched'),('that didn''t'),('Other valuee');
with c as
(
select t1.v as v1
,t2.v as v2
,len(t1.v) - len(replace(t1.v,' ','')) + 1 as NumWords -- String Length - String Length without spaces = Number of words - 1
from #t1 as t1
cross join #t2 as t2 -- Cross join the two tables to get all comparisons
where len(replace(t1.v,' ','')) = len(replace(t2.v,' ','')) -- Where the length without spaces is the same. Can't have the same words in a different order if the number of non space characters in the whole string is different
)
select c.v1
,c.v2
,c.NumWords
,sum(case when s1.item = s2.item then 1 else 0 end) as MatchedWords
from c
cross apply dbo.fn_StringSplit4k(c.v1,' ',null) as s1
cross apply dbo.fn_StringSplit4k(c.v2,' ',null) as s2
group by c.v1
,c.v2
,c.NumWords
having c.NumWords = sum(case when s1.item = s2.item then 1 else 0 end);
Output
+----------------------+----------------------+----------+--------------+
| v1 | v2 | NumWords | MatchedWords |
+----------------------+----------------------+----------+--------------+
| Microsoft SQL Server | SQL Server Microsoft | 3 | 3 |
| Office Microsoft | Microsoft Office | 2 | 2 |
+----------------------+----------------------+----------+--------------+
Function
create function dbo.fn_StringSplit4k
(
#str nvarchar(4000) = ' ' -- String to split.
,#delimiter as nvarchar(1) = ',' -- Delimiting value to split on.
,#num as int = null -- Which value to return.
)
returns table
as
return
-- Start tally table with 10 rows.
with n(n) as (select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1)
-- Select the same number of rows as characters in #str as incremental row numbers.
-- Cross joins increase exponentially to a max possible 10,000 rows to cover largest #str length.
,t(t) as (select top (select len(isnull(#str,'')) a) row_number() over (order by (select null)) from n n1,n n2,n n3,n n4)
-- Return the position of every value that follows the specified delimiter.
,s(s) as (select 1 union all select t+1 from t where substring(isnull(#str,''),t,1) = #delimiter)
-- Return the start and length of every value, to use in the SUBSTRING function.
-- ISNULL/NULLIF combo handles the last value where there is no delimiter at the end of the string.
,l(s,l) as (select s,isnull(nullif(charindex(#delimiter,isnull(#str,''),s),0)-s,4000) from s)
select rn
,item
from(select row_number() over(order by s) as rn
,substring(#str,s,l) as item
from l
) a
where rn = #num
or #num is null;

Creating permutation via recursive CTE in SQL server?

Looking at :
;WITH cte AS(
SELECT 1 AS x UNION
SELECT 2 AS x UNION
SELECT 3 AS x
)
I can create permutation table for all 3 values :
SELECT T1.x , y=T2.x , z=t3.x
FROM cte T1
JOIN cte T2
ON T1.x != T2.x
JOIN cte T3
ON T2.x != T3.x AND T1.x != T3.x
This uses the power of SQL's cartesian product plus eliminating equal values.
OK.
But is it possible to enhance this recursive pseudo CTE :
;WITH cte AS(
SELECT 1 AS x , 2 AS y , 3 AS z
UNION ALL
...
)
SELECT * FROM cte
So that it will yield same result as :
NB there are other solutions in SO that uses recursive CTE , but it is not spread to columns , but string representation of the permutations
I tried to do the lot in a CTE.
However trying to "redefine" a rowset dynamically is a little tricky. While the task is relatively easy using dynamic SQL doing it without poses some issues.
While this answer may not be the most efficient or straight forward, or even correct in the sense that it's not all CTE it may give others a basis to work from.
To best understand my approach read the comments, but it might be worthwhile looking at each CTE expression in turn with by altering the bit of code below in the main block, with commenting out the section below it.
SELECT * FROM <CTE NAME>
Good luck.
IF OBJECT_ID('tempdb..#cteSchema') IS NOT NULL
DROP Table #cteSchema
GO
-- BASE CTE
;WITH cte AS( SELECT 1 AS x, 2 AS y, 3 AS z),
-- So we know what columns we have from the CTE we extract it to XML
Xml_Schema AS ( SELECT CONVERT(XML,(SELECT * FROM cte FOR XML PATH(''))) AS MySchema ),
-- Next we need to get a list of the columns from the CTE, by querying the XML, getting the values and assigning a num to the column
MyColumns AS (SELECT D.ROWS.value('fn:local-name(.)','SYSNAME') AS ColumnName,
D.ROWS.value('.','SYSNAME') as Value,
ROW_NUMBER() OVER (ORDER BY D.ROWS.value('fn:local-name(.)','SYSNAME')) AS Num
FROM Xml_Schema
CROSS APPLY Xml_Schema.MySchema.nodes('/*') AS D(ROWS) ),
-- How many columns we have in the CTE, used a coupld of times below
ColumnStats AS (SELECT MAX(NUM) AS ColumnCount FROM MyColumns),
-- create a cartesian product of the column names and values, so now we get each column with it's possible values,
-- so {x=1, x =2, x=3, y=1, y=2, y=3, z=1, z=2, z=3} -- you get the idea.
PossibleValues AS (SELECT MyC.ColumnName, MyC.Num AS ColumnNum, MyColumns.Value, MyColumns.Num,
ROW_NUMBER() OVER (ORDER BY MyC.ColumnName, MyColumns.Value, MyColumns.Num ) AS ID
FROM MyColumns
CROSS APPLY MyColumns MyC
),
-- Now we have the possibly values of each "column" we now have to concat the values together using this recursive CTE.
AllRawXmlRows AS (SELECT CONVERT(VARCHAR(MAX),'<'+ISNULL((SELECT ColumnName FROM MyColumns WHERE MyColumns.Num = 1),'')+'>'+Value) as ConcatedValue, Value,ID, Counterer = 1 FROM PossibleValues
UNION ALL
SELECT CONVERT(VARCHAR(MAX),CONVERT(VARCHAR(MAX), AllRawXmlRows.ConcatedValue)+'</'+(SELECT ColumnName FROM MyColumns WHERE MyColumns.Num = Counterer)+'><'+(SELECT ColumnName FROM MyColumns WHERE MyColumns.Num = Counterer+1)+'>'+CONVERT(VARCHAR(MAX),PossibleValues.Value)) AS ConcatedValue, PossibleValues.Value, PossibleValues.ID,
Counterer = Counterer+1
FROM AllRawXmlRows
INNER JOIN PossibleValues ON AllRawXmlRows.ConcatedValue NOT LIKE '%'+PossibleValues.Value+'%' -- I hate this, there has to be a better way of making sure we don't duplicate values....
AND AllRawXmlRows.ID <> PossibleValues.ID
AND Counterer < (SELECT ColumnStats.ColumnCount FROM ColumnStats)
),
-- The above made a list but was missing the final closing XML element. so we add it.
-- we also restict the list to the items that contain all columns, the section above builds it up over many columns
XmlRows AS (SELECT DISTINCT
ConcatedValue +'</'+(SELECT ColumnName FROM MyColumns WHERE MyColumns.Num = Counterer)+'>'
AS ConcatedValue
FROM AllRawXmlRows WHERE Counterer = (SELECT ColumnStats.ColumnCount FROM ColumnStats)
),
-- Wrap the output in row and table tags to create the final XML
FinalXML AS (SELECT (SELECT CONVERT(XML,(SELECT CONVERT(XML,ConcatedValue) FROM XmlRows FOR XML PATH('row'))) FOR XML PATH('table') )as XMLData),
-- Prepare a CTE that represents the structure of the original CTE with
DataTable AS (SELECT cte.*, XmlData
FROM FinalXML, cte)
--SELECT * FROM <CTE NAME>
-- GETS destination columns with XML data.
SELECT *
INTO #cteSchema
FROM DataTable
DECLARE #XML VARCHAR(MAX) ='';
SELECT #Xml = XMLData FROM #cteSchema --Extract XML Data from the
ALTER TABLE #cteSchema DROP Column XMLData -- Removes the superflous column
DECLARE #h INT
EXECUTE sp_xml_preparedocument #h OUTPUT, #XML
SELECT *
FROM OPENXML(#h, '/table/row', 2)
WITH #cteSchema -- just use the #cteSchema to define the structure of the xml that has been constructed
EXECUTE sp_xml_removedocument #h
How about translating 1,2,3 into a column, which will look exactly like the example you started from, and use the same approach ?
;WITH origin (x,y,z) AS (
SELECT 1,2,3
), translated (x) AS (
SELECT col
FROM origin
UNPIVOT ( col FOR cols IN (x,y,z)) AS up
)
SELECT T1.x , y=T2.x , z=t3.x
FROM translated T1
JOIN translated T2
ON T1.x != T2.x
JOIN translated T3
ON T2.x != T3.x AND T1.x != T3.x
ORDER BY 1,2,3
If I understood correctly the request, this might just do the trick.
And to run it on more columns, just need to add them origin cte definition + unpivot column list.
Now, i dont know how you pass your 1 - n values for it to be dynamic, but if you tell me, i could try edit the script to be dynamic too.

SQL replace occurrances based on a table

Hi I've a SQL issue to solve; I've these tables:
Table A with varchar column tst
tst
'2','5','8'
'2','6'
'4','12'
Table B with int column rep
rep
2
6
I'm looking for a query (without cycle WHILE) to update Table A in the following way:
tst
'R','5','8'
'R','R'
'4','12'
using char 'R' to replace the occurrances of Table B in Table A
Thanks in advance
SQLFiddle Demo
UPDATE t1
SET tst = STUFF(z,1,1,'') --Remove leading comma from final result
FROM (
SELECT --Convert original string to xml
tst
,CAST('<a>'+REPLACE(tst ,',','</a><a>')+'</a>' AS XML) x
FROM tst
) t1
CROSS APPLY (
SELECT --Replace value with 'R' when matched in rep
','+CASE WHEN rep IS NULL THEN y.value('.','varchar(max)') ELSE '''R''' END
FROM x.nodes('a') t2(y) --Explode xml to separate values
LEFT JOIN rep t3 --Match value to rep
ON y.value('.','varchar(max)') = QUOTENAME(rep,CHAR(39))
FOR XML PATH('') --Recompact xml to comma-delimited string
) t4(z)
Got it working by using a recursive CTE:
;with numbers as (
SELECT
rep,
-- processing order
ROW_NUMBER() OVER (order by rep) working_order
FROM B
), worker as (
-- Anchor: the first substitution
SELECT
tst,
rep,
-- stores already done substitutions
replace(tst, '''' + cast(rep as varchar) + '''', '''R''') tmp_result,
1 lvl
FROM A JOIN numbers ON working_order=1
UNION ALL
-- run through all substitutions to be done
SELECT
w.tst,
n.rep,
-- use tmp_result to refer to already done substitutions
replace(w.tmp_result, '''' + cast(n.rep as varchar) + '''', '''R'''),
lvl + 1
FROM worker w JOIN numbers n ON working_order=lvl+1
), result as (
SELECT tst, tmp_result FROM worker where lvl = (SELECT MAX(working_order) FROM numbers)
)
UPDATE A SET tst=tmp_result FROM A JOIN result ON result.tst=A.tst
Explanation:
First I select all numbers from B and give them a processing number
In the recursive CTE worker, I do a sequential substitution with the
order given by numbers
In result I reduce the worker to the final
rows (those with the highest working_order)
Finally I update A using
the result.

delete duplicate tuples except for one specific tuple

I want to delete the duplicate(same from and same to values) tuples from a table but need to keep the tuple with minimum object id(object id is pk).
So here are columns:
from | to | time | distance | object_id
I can see the correct number of tuples that will be deleted by executing
select [from],[to],count(*)
FROM table
where [object_id] NOT IN(
SELECT min([object_id])
FROM table
group by [from],[to]
having count(*) > 1)
group by [from],[to]
having count(*) > 1
but I want to first see the object_id's which are counted on the SQL above.
You could try with this (untested)...
;WITH temp AS (
SELECT [from_id], [to_id], [object_id] = min([object_id])
FROM table
group by [from_id],[to_id]
having count(*) > 1)
SELECT
t2.[from_id],
t2.[to_id],
t.[object_id]
FROM
table t
join temp t2
on t2.[from_id] = t.[from_id]
AND t2.[to_id] = t.[to_id]
AND t2.[object_id] != t.[object_id]
EDIT:
CTE temp will yield all distinct from/to groupings with min object_id, one you would like to keep.
SELECT [from_id], [to_id], [object_id] = min([object_id])
FROM table
group by [from_id],[to_id]
having count(*) > 1
There are other pairs you would like to remove, and these are same from/to pairs, but with different object_id. Last select should output those records exactly.

SQL Server: Joining in rows via. comma separated field

I'm trying to extract some data from a third party system which uses an SQL Server database. The DB structure looks something like this:
Order
OrderID OrderNumber
1 OX101
2 OX102
OrderItem
OrderItemID OrderID OptionCodes
1 1 12,14,15
2 1 14
3 2 15
Option
OptionID Description
12 Batteries
14 Gift wrap
15 Case
[etc.]
What I want is one row per order item that includes a concatenated field with each option description. So something like this:
OrderItemID OrderNumber Options
1 OX101 Batteries\nGift Wrap\nCase
2 OX101 Gift Wrap
3 OX102 Case
Of course this is complicated by the fact that the options are a comma separated string field instead of a proper lookup table. So I need to split this up by comma in order to join in the options table, and then concat the result back into one field.
At first I tried creating a function which splits out the option data by comma and returns this as a table. Although I was able to join the result of this function with the options table, I wasn't able to pass the OptionCodes column to the function in the join, as it only seemed to work with declared variables or hard-coded values.
Can someone point me in the right direction?
I would use a splitting function (here's an example) to get individual values and keep them in a CTE. Then you can join the CTE to your table called "Option".
SELECT * INTO #Order
FROM (
SELECT 1 OrderID, 'OX101' OrderNumber UNION SELECT 2, 'OX102'
) X;
SELECT * INTO #OrderItem
FROM (
SELECT 1 OrderItemID, 1 OrderID, '12,14,15' OptionCodes
UNION
SELECT 2, 1, '14'
UNION
SELECT 3, 2, '15'
) X;
SELECT * INTO #Option
FROM (
SELECT 12 OptionID, 'Batteries' Description
UNION
SELECT 14, 'Gift Wrap'
UNION
SELECT 15, 'Case'
) X;
WITH N AS (
SELECT I.OrderID, I.OrderItemID, X.items OptionCode
FROM #OrderItem I CROSS APPLY dbo.Split(OptionCodes, ',') X
)
SELECT Q.OrderItemID, Q.OrderNumber,
CONVERT(NVarChar(1000), (
SELECT T.Description + ','
FROM N INNER JOIN #Option T ON N.OptionCode = T.OptionID
WHERE N.OrderItemID = Q.OrderItemID
FOR XML PATH(''))
) Options
FROM (
SELECT N.OrderItemID, O.OrderNumber
FROM #Order O INNER JOIN N ON O.OrderID = N.OrderID
GROUP BY N.OrderItemID, O.OrderNumber) Q
DROP TABLE #Order;
DROP TABLE #OrderItem;
DROP TABLE #Option;

Resources