Performance tuning on matching word using CHARINDEX and COLLATION - sql-server

I have two tables with following records:
Table 1: 100 rows
Table 2: 50 million rows
Example:
Table 1: tb100
-- Table 1: the short list of names whose individual words are searched for.
create table tb100
(
    name varchar(50)
);
-- Sample rows only; the real table holds 100 rows.
insert into tb100 (name) values ('Mak John'), ('Will Smith'), ('Luke W');
Table 2: tb50mil
-- Table 2: the large list of names that is searched.
create table tb50mil
(
    name varchar(50)
);
-- Sample rows only; the real table holds 50 million rows.
insert into tb50mil (name) values ('John A Mak'), ('K Smith Will'), ('James Henry');
-- Index on the searched column (named after the table it belongs to).
create nonclustered index nci_tb50mil_name on tb50mil (name);
Requirement: I want to match names between the two tables if any WORD (John, Smith, Will) of a name in one table is present in a name in the other table. For example, John is present in John A Mak.
My try: Used XML to split column name of the table tb100 and also adding collation with CHARINDEX.
-- Step 1: wrap every word of a tb100 name in <X>...</X> and cast the result to XML.
;WITH name_xml AS
(
    SELECT CAST('<X>' + REPLACE(src.name, ' ', '</X><X>') + '</X>' AS XML) AS xmlfilter
    FROM tb100 AS src
),
-- Step 2: shred the XML so that each word becomes its own row.
splitdata AS
(
    SELECT word.node.value('.', 'varchar(50)') AS splitname
    FROM name_xml AS nx
    CROSS APPLY nx.xmlfilter.nodes('X') AS word(node)
)
-- Step 3: list each tb50mil name once if at least one split word occurs
-- somewhere inside it (CHARINDEX under a binary collation).
SELECT DISTINCT big.name AS [Aadhar Names]
FROM tb50mil AS big
INNER JOIN splitdata AS s
    ON CHARINDEX(s.splitname COLLATE Latin1_General_BIN,
                 big.name COLLATE Latin1_General_BIN) > 0
Execution time: 00:01:34
Rows affected: (2251429 row(s) affected)
Execution Plan:

If you need the separate words within your name, then a table where the name is stored as one single string is conceptually not optimal. Also, separating the names now is painful since there is no recurring pattern to your middle names. Plus, string manipulation is really not a strength of SQL. I would instead extend your table into something like this:
-- Add a surrogate key plus one column per name part.
alter table tb100
add
    nameID int IDENTITY(1,1) NOT NULL,
    first_name varchar(50) null,
    middle_name varchar(50) null,
    last_name varchar(50) null;
GO
-- The batch separator above is required: the new columns must exist before a
-- batch that references them is compiled.

-- Sample rows only (the real table holds 100 rows). The column list is
-- mandatory now that the table has more than one insertable column.
insert into tb100 (name) values ('Mak John'), ('Will Smith'), ('Luke W');

-- Split every name in one set-based statement (the CASE expressions decide per row):
--   first_name  = text before the first space (the whole name if there is no space)
--   middle_name = text between the first and the last space ('' if fewer than two spaces)
--   last_name   = text after the last space ('' if there is no space)
-- Assumes names carry no leading or trailing spaces.
update tb100
set
    first_name = case
                     when charindex(' ', name) = 0 then name
                     else left(name, charindex(' ', name) - 1)
                 end,
    middle_name = case
                      when len(name) - len(replace(name, ' ', '')) > 1
                      then substring(name,
                                     charindex(' ', name) + 1,
                                     len(name) - charindex(' ', reverse(name)) - charindex(' ', name))
                      else ''
                  end,
    last_name = case
                    when charindex(' ', name) = 0 then ''
                    else right(name, charindex(' ', reverse(name)) - 1)
                end
where name is not null;
I hope it works I didn't have a chance to test it because I'm on the road.
If you have the chance of inserting the data into those columns without this entire modification then please do so.
You then do the same to your other table...
-- Add a surrogate key plus one column per name part.
alter table tb50mil
add
    nameID int IDENTITY(1,1) NOT NULL,
    first_name varchar(50) null,
    middle_name varchar(50) null,
    last_name varchar(50) null;
GO
-- The batch separator above is required: the new columns must exist before a
-- batch that references them is compiled.

-- Sample rows only (the real table holds 50 million rows). The column list is
-- mandatory now that the table has more than one insertable column.
insert into tb50mil (name) values ('John A Mak'), ('K Smith Will'), ('James Henry');

-- Split every name in one set-based statement (the CASE expressions decide per row):
--   first_name  = text before the first space (the whole name if there is no space)
--   middle_name = text between the first and the last space ('' if fewer than two spaces)
--   last_name   = text after the last space ('' if there is no space)
-- Assumes names carry no leading or trailing spaces.
update tb50mil
set
    first_name = case
                     when charindex(' ', name) = 0 then name
                     else left(name, charindex(' ', name) - 1)
                 end,
    middle_name = case
                      when len(name) - len(replace(name, ' ', '')) > 1
                      then substring(name,
                                     charindex(' ', name) + 1,
                                     len(name) - charindex(' ', reverse(name)) - charindex(' ', name))
                      else ''
                  end,
    last_name = case
                    when charindex(' ', name) = 0 then ''
                    else right(name, charindex(' ', reverse(name)) - 1)
                end
where name is not null;
and from here on it's a simple join really:
-- Pair up rows of the two tables that share a first, middle or last name.
select *
from tb100 hun
inner join tb50mil mil
    on hun.first_name = mil.first_name
    -- middle_name is stored as '' when a name has no middle part; without this
    -- guard every pair of names lacking a middle name would match.
    or (hun.middle_name = mil.middle_name and hun.middle_name <> '')
    or hun.last_name = mil.last_name
Hope this helps!

Related

Script Data Dynamically using TSQL

I came up with the following script to dynamically create INSERT statements for a given table.
-- Demo table with two sample rows.
CREATE TABLE Employee
(
    [Id]   int,
    [Name] NVARCHAR(25)
)
INSERT INTO Employee VALUES (1, 'Vyas')
INSERT INTO Employee VALUES (2, 'Nandhini')

-- Emit one INSERT statement per Employee row. The column list is read from
-- syscolumns; the VALUES list is concatenated from the row's own id and name.
SELECT
    'Insert into employee ('
    + STUFF((SELECT ', [' + col.name + ']'
             FROM syscolumns AS col
             WHERE col.id = OBJECT_ID('employee')
             FOR XML PATH('')), 1, 1, '')
    + ') values('
    + STUFF((SELECT ',' + CAST(src.[id] AS varchar(2)) + ',''' + src.[name] + ''''
             FROM Employee AS src
             WHERE src.id = emp.id
             FOR XML PATH('')), 1, 1, '')
    + ')'
FROM employee AS emp
The script creates insert statements as required.
Insert into employee ( [Id], [Name]) values(1,'Vyas')
Insert into employee ( [Id], [Name]) values(2,'Nandhini')
However, I need
The value SQL statement to be dynamic as the first
Only selected columns with common SQL Server datatypes to be handled & quoted correctly (int, [n]varchar, uniqueidentifier and bit)
What's the best way to accomplish this?

Split multiple column and store into temp table

I have the sample data:
Table: tblsampledata
-- Sample data: two text columns, each holding a two-word value.
CREATE TABLE tblsampledata
(
    column1 varchar(50),
    column2 varchar(50)
);

INSERT INTO tblsampledata (column1, column2)
VALUES ('Bob Frapples', 'Gail Forcewind');

INSERT INTO tblsampledata (column1, column2)
VALUES ('Paul Molive', 'Mario Speedwagon');
And I have column mapping table with table name:
Table: tblmapping
-- Mapping table: one row per table name with a pipe-separated list of its columns.
CREATE TABLE tblmapping
(
    tblname   varchar(100),
    columnmap varchar(max)
);

INSERT INTO tblmapping (tblname, columnmap)
VALUES ('tblsampledata', '[column1]|[column2]');
Note: I want to split the column data which are exists in tblmapping of table name in column tblname and store it into temp table.
Expected Result: #TempTable
column1 column2
---------------------
Bob Gail
Frapples Forcewind
Paul Mario
Molive Speedwagon
You need to use a dynamic query to achieve this.
You can try like following.
-- Table whose mapped columns should be selected.
DECLARE @tablename sysname = N'tblsampledata';
DECLARE @xml xml;

-- Turn the pipe-separated column map of that table into <X>col</X> elements.
SELECT @xml = CAST('<X>' + REPLACE(columnmap, '|', '</X><X>') + '</X>' AS xml)
FROM tblmapping
WHERE tblname = @tablename;

-- Shred the XML back into one row per column and glue the rows together into
-- the select list of a dynamic statement.
DECLARE @query AS NVARCHAR(max) =
    'select '
    + STUFF((SELECT DISTINCT ', ' + value
             FROM (
                 SELECT n.value('.', 'varchar(100)') AS value
                 FROM @xml.nodes('X') AS T(n)
             ) AS cols
             FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, '')
    + ' from ' + @tablename;

EXEC sp_executesql @query;
Online Demo
To split the column 1 and Column 2 you can use query like following.
-- Emit two rows per source row: n = 1 yields the text before the first space
-- of each column, n = 2 the text after the last space.
SELECT
    CASE part.n
        WHEN 1 THEN LEFT(src.column1, CHARINDEX(' ', src.column1) - 1)
        WHEN 2 THEN RIGHT(src.column1, CHARINDEX(' ', REVERSE(src.column1)) - 1)
    END AS column1,
    CASE part.n
        WHEN 1 THEN LEFT(src.column2, CHARINDEX(' ', src.column2) - 1)
        WHEN 2 THEN RIGHT(src.column2, CHARINDEX(' ', REVERSE(src.column2)) - 1)
    END AS column2
FROM tblsampledata AS src
CROSS JOIN (VALUES (1), (2)) AS part(n)
Full Demo using dynamic query

Fill with spaces a column value in update

How to automatically fill a column with spaces to a pre-determined length in update SQL sentence in SQL Server 2012?
I have a table with several columns like
Col1 NVARCHAR(10)
Col2 NVARCHAR(100)
Col3 NVARCHAR(200)
Col4 NVARCHAR(50)
and more.
If the value of a column is NULL or '', I want to fill the column with spaces up to a pre-determined length (the length of the column).
For Col3, if value is NULL or '', spaces to 200 blank space (' ')
If the value has any characters, e.g. 'abcd', pad it on the right with spaces up to 200 characters, so that the final value consists of 4 non-space characters followed by 196 spaces.
For example, for Col1 has length 10.
1) Value = NULL , Col1 value = '          ' (10 spaces)
2) Value = '' , Col1 value = '          ' (10 spaces)
3) Value = 'abc' , Col1 value = 'abc       ' (abc and 7 spaces)
How can I do that in the UPDATE SQL?
Maybe using
-- List name, type and maximum character length of every column of myTable.
SELECT
    c.column_name,
    c.data_type,
    c.character_maximum_length
FROM information_schema.columns AS c
WHERE c.table_name = 'myTable'
or
-- Returns the defined length of the given column, in bytes.
SELECT COL_LENGTH('Table', 'Column')
More in How to get the size of a varchar[n] field in one SQL statement?
Try the following, the LEFT is used to keep the length down to the column length, while the space ensures the field is filled with spaces:
-- Demo table: two text columns of different widths.
CREATE TABLE test
(
    col1 varchar(10),
    col2 varchar(15)
)
GO
-- Seed one empty-string row, one NULL row and one row with real content.
INSERT INTO test (col1, col2)
VALUES
    ('', ''),
    (NULL, NULL),
    ('abc', 'abc')

-- Append a full column width of spaces to each value (NULL is treated as ''),
-- then cut the result back to the column width reported by COL_LENGTH.
UPDATE t
SET
    t.col1 = LEFT(COALESCE(t.col1, '') + SPACE(COL_LENGTH('test', 'col1')),
                  COL_LENGTH('test', 'col1')),
    t.col2 = LEFT(COALESCE(t.col2, '') + SPACE(COL_LENGTH('test', 'col2')),
                  COL_LENGTH('test', 'col2'))
FROM test AS t

SELECT *
FROM test
I don't understand what you want exactly, but here is what I understand:
-- Demo table with four Unicode text columns.
CREATE TABLE MyTable
(
    Col1 NVARCHAR(200),
    Col2 NVARCHAR(100),
    Col3 NVARCHAR(200),
    Col4 NVARCHAR(50)
);

INSERT INTO MyTable (Col1, Col2, Col3, Col4)
VALUES
    (NULL, NULL, NULL, NULL),
    ('ABC', NULL, NULL, NULL);

-- COL_LENGTH reports the column size in bytes; NVARCHAR stores two bytes per
-- character, hence the division by 2 to get the number of spaces.
-- You can do the same for the other cols
UPDATE MyTABLE
SET Col1 = REPLICATE(' ', COL_LENGTH('MyTable', 'Col1') / 2)
WHERE Col1 IS NULL;

SELECT *
FROM MyTable;
Demo
Update:
Here is how to do it in one statement:
-- Right-pad every column with spaces up to its declared length; NULL and ''
-- become all spaces. The whole table is updated on purpose (no WHERE clause).
--
-- COL_LENGTH returns bytes, and NVARCHAR uses two bytes per character, hence
-- the division by 2. LEFT(value + <full width of spaces>, width) is used
-- instead of "width - LEN(value)" because LEN ignores trailing spaces, so a
-- value that already ends in spaces would be padded beyond the column width.
UPDATE MyTable
SET Col1 = LEFT(COALESCE(Col1, N'') + REPLICATE(N' ', COL_LENGTH('MyTable', 'Col1') / 2),
                COL_LENGTH('MyTable', 'Col1') / 2),
    Col2 = LEFT(COALESCE(Col2, N'') + REPLICATE(N' ', COL_LENGTH('MyTable', 'Col2') / 2),
                COL_LENGTH('MyTable', 'Col2') / 2),
    -- Col3 is sized from its own definition, not from Col1's.
    Col3 = LEFT(COALESCE(Col3, N'') + REPLICATE(N' ', COL_LENGTH('MyTable', 'Col3') / 2),
                COL_LENGTH('MyTable', 'Col3') / 2),
    Col4 = LEFT(COALESCE(Col4, N'') + REPLICATE(N' ', COL_LENGTH('MyTable', 'Col4') / 2),
                COL_LENGTH('MyTable', 'Col4') / 2);
Demo

SQL FOR XML PATH list and COUNT

I have a table such as:
|Date |Name|
--------------------
|'20-May-2011'|Bob |
|'20-May-2011'|Fred|
|'20-May-2011'|Jim |
|'21-May-2011'|Bob |
|'21-May-2011'|Ed |
|'22-May-2011'|Bill|
I need a query to return:
|Date |Count|Names |
--------------------------------------
|'20-May-2011'| 3|'Bob, Fred, Jim'|
|'21-May-2011'| 2|'Bob, Ed' |
|'22-May-2011'| 1|'Bill' |
In other words, I want a list and a count of the names by date.
The best I can come up with is:
-- [list]: one row per date with its names as a comma-separated string.
;WITH [list] AS
(
    SELECT grp.[Date],
           STUFF((SELECT ', ' + member.[Name]
                  FROM #table AS member
                  WHERE member.[Date] = grp.[Date]
                  ORDER BY member.[Name]
                  FOR XML PATH('')), 1, 2, '') AS [Names]
    FROM #table AS grp
    GROUP BY grp.[Date]
),
-- [count]: one row per date with its number of rows.
[count] AS
(
    SELECT [Date],
           COUNT(*) AS [Count]
    FROM #table
    GROUP BY [Date]
)
SELECT [list].[Date],
       [count].[Count],
       [list].[Names]
FROM [list]
INNER JOIN [count]
    ON [list].[Date] = [count].[Date]
ORDER BY [count].[Count] DESC, [list].[Date]
Is there a more elegant query?
-- One grouped pass: the count and the comma-separated name list are both
-- produced per date, so no self-join is needed.
SELECT
    grp.[Date],
    COUNT(*) AS [Count],
    STUFF((SELECT ', ' + member.[Name]
           FROM #table AS member
           WHERE member.[Date] = grp.[Date]
           ORDER BY member.[Name]
           FOR XML PATH('')), 1, 2, '') AS [Names]
FROM #table AS grp
GROUP BY grp.[Date]
If you think that the Name column might contain <>'"& you should do like this instead:
-- Same grouped query, but the XML is returned as a typed value and read back
-- with .value(), so characters such as < > & in a name are not left entity-encoded.
SELECT
    grp.[Date],
    COUNT(*) AS [Count],
    STUFF((SELECT ', ' + member.[Name]
           FROM #table AS member
           WHERE member.[Date] = grp.[Date]
           ORDER BY member.[Name]
           FOR XML PATH(''), TYPE).value('.', 'varchar(max)'), 1, 2, '') AS [Names]
FROM #table AS grp
GROUP BY grp.[Date]
Not a whole lot better - but maybe using a single CTE to "encapsulate" the XML-PATH-stuffing into a more presentable way would work??
-- Encapsulate the XML-PATH stuffing in a CTE. The CTE is grouped by date so
-- the name list is built once per date rather than once per source row, and
-- the count is computed in the same pass and given a column name.
;WITH ConsolidatedData AS
(
    SELECT
        t.[Date],
        COUNT(*) AS [Count],
        STUFF((
            SELECT ', ' + t2.[Name]
            FROM #table t2
            WHERE t2.[Date] = t.[Date]
            ORDER BY t2.[Name]
            FOR XML PATH('')
        ), 1, 2, '') AS [Names]
    FROM #table t
    GROUP BY t.[Date]
)
SELECT
    [Date], Names, [Count]
FROM
    ConsolidatedData
Not sure if you'd count this as one "compound" statement, or two.... :-)
One word of advice: try not to use SQL Server identifiers and reserved words (like Date or Order) as your own column and/or table names.... it's always rather messy....

TSQL to transform Address into a Mailing Address - SQL Server 2005

I would like to transform an Address (Line1, Line2, Line3, City, State, ZIP) into a Mailing Address (Addr1, Addr2, Addr3, Addr4) that has no blank lines and the City, State and ZIP are concatenated together on one line. Having a function do this would be very nice.
i.e.
Line1=
Line2=123 Somewhere
Line3=
City=Detroit
State=MI
Zip=48000
Here is the table structure for the incoming address:
-- Recreate the scratch table that holds the incoming address.
IF OBJECT_ID('tempdb..#Employee') IS NOT NULL
    DROP TABLE #Employee
CREATE TABLE #Employee
(
    Line1 VARCHAR(30),
    Line2 VARCHAR(30),
    Line3 VARCHAR(30),
    City  VARCHAR(17),
    State VARCHAR(2),
    ZIP   VARCHAR(10)
)
GO
-- Sample address whose first and third lines are blank.
INSERT #Employee (Line1, Line2, Line3, City, State, ZIP)
VALUES ('', '123 Somewhere', '', 'Detroit', 'MI', '48000')

SELECT *
FROM #Employee
The resulting Mailing Address
Addr1=123 Somewhere
Addr2=Detroit MI 48000
Addr3=
Addr4=
or one field with cr character
Addr=
123 Somewhere cr
Detroit MI 48000 cr
cr
cr
A function would be nice, returning either Addr1, Addr2, Addr3 and Addr4, or just a single Addr with cr separators.
SqueezeAddress(Line1, Line2, Line3, City, State, ZIP)
Then SqueezeAddress would return Addr1, Addr2, Addr3, Addr4
or
Addr with cr
All the Addr1-4 lines would be VARCHAR (40) or if one field is used Addr VARCHAR (200)
Per Phil's request in the comments below, here is the current logic that is being used (Many fields were removed to make it easier to read):
-- Excerpt of the current query: Line4 is assembled as "City, RegionCode PostalCode".
-- Fields were removed from this excerpt, so the select list is left unfinished here.
SELECT Line1, Line2, Line3,
ISNULL(LTRIM(RTRIM(ADDR.City)) + ', ','') + ISNULL(ADDR.RegionCode,'')
+ ' ' + ISNULL(ADDR.PostalCode,'') AS Line4,
-- Move Line2 up into Line1 when Line1 is blank and Line2 is not.
UPDATE #tmpBilling
SET Line1 = Line2, Line2 = NULL
WHERE ISNULL(Line1, '') = ''
AND ISNULL(Line2, '') <> ''
-- Move Line3 up into Line2 when Line2 is blank and Line3 is not.
UPDATE #tmpBilling
SET Line2 = Line3, Line3 = NULL
WHERE ISNULL(Line2, '') = ''
AND ISNULL(Line3, '') <> ''
-- Move Line4 up into Line2 when Line2 is still blank and Line4 is not.
UPDATE #tmpBilling
SET Line2 = Line4, Line4 = NULL
WHERE ISNULL(Line2, '') = ''
AND ISNULL(Line4, '') <> ''
-- Move Line4 up into Line3 when Line3 is blank.
-- NOTE(review): the second condition tests Line2, while the three updates above
-- test the column being moved; confirm whether Line4 was intended here.
UPDATE #tmpBilling
SET Line3 = Line4, Line4 = NULL
WHERE ISNULL(Line3, '') = ''
AND ISNULL(Line2, '') <> ''
I may be missing something here, but if this is just simple string concatenation, then this would work...
Set up testing data (I added a few more samples)
-- Recreate the scratch table and load three sample addresses with one, two
-- and three populated address lines respectively.
IF OBJECT_ID('tempdb..#Employee') IS NOT NULL
    DROP TABLE #Employee
CREATE TABLE #Employee
(
    Line1 VARCHAR(30),
    Line2 VARCHAR(30),
    Line3 VARCHAR(30),
    City  VARCHAR(17),
    State VARCHAR(2),
    ZIP   VARCHAR(10)
)
GO
INSERT #Employee (Line1, Line2, Line3, City, State, ZIP)
VALUES ('', '123 Somewhere', '', 'Detroit', 'MI', '48001')
INSERT #Employee (Line1, Line2, Line3, City, State, ZIP)
VALUES ('123 Somewhere', 'Suite 500', '', 'Detroit', 'MI', '48002')
INSERT #Employee (Line1, Line2, Line3, City, State, ZIP)
VALUES ('123 Somewhere', 'Suite 500', 'attn: JP', 'Detroit', 'MI', '48003')

SELECT *
FROM #Employee
From here, all you have to do is stitch the strings together. This version presumes that you have both nulls and empty strings to factor out.
-- Build one multi-line string per address. nullif() turns '' into NULL, so a
-- blank or NULL line drops out together with its CR/LF; the city line and a
-- separator row of dashes are always appended.
SELECT
      isnull(nullif(e.Line1, '') + char(13) + char(10), '')
    + isnull(nullif(e.Line2, '') + char(13) + char(10), '')
    + isnull(nullif(e.Line3, '') + char(13) + char(10), '')
    + e.City + ' ' + e.State + ' ' + e.ZIP
    + char(13) + char(10) + '------------------------------'
FROM #Employee AS e
Wrap that into a function:
-- Collapses an address into one CR/LF-separated string without blank lines.
--
-- Parameters:
--   @Line1, @Line2, @Line3  address lines; NULL or '' lines are skipped
--   @City, @State, @Zip     joined with single spaces on one line
-- Returns:
--   the populated lines, the "City State Zip" line, then a row of dashes;
--   NULL if @City, @State or @Zip is NULL (NULL propagates through the
--   concatenation).
CREATE FUNCTION dbo.SqueezeAddress
(
    @Line1 varchar(30)
   ,@Line2 varchar(30)
   ,@Line3 varchar(30)
   ,@City varchar(17)
   ,@State varchar(2)
   ,@Zip varchar(10)
)
RETURNS varchar(200)
AS
BEGIN
    -- nullif() maps '' to NULL so that an empty line and its CR/LF both vanish.
    RETURN isnull(nullif(@Line1, '') + char(13) + char(10), '')
         + isnull(nullif(@Line2, '') + char(13) + char(10), '')
         + isnull(nullif(@Line3, '') + char(13) + char(10), '')
         + @City + ' ' + @State + ' ' + @Zip
         + char(13) + char(10) + '------------------------------'
END
GO
Lastly, put the function in the query:
-- Format every stored address through the function.
SELECT dbo.SqueezeAddress(e.Line1, e.Line2, e.Line3, e.City, e.State, e.Zip)
FROM #Employee AS e
More straightforward (and easier to debug, IMHO):
-------------------------------------------------------------
-- assumptions:
--
-- * nullable fields never contain an empty (zero-length) string.
-- every nullable column will contain either a proper value
-- or NULL.
--
-- * zipcode is a 5- or 9-digit USPS zip code, without a dash.
-- Addresses lacking a zipcode will be NULL.
--------------------------------------------------------------
-- Drop the table only if it exists, so the script also runs the first time.
if object_id('dbo.address') is not null drop table dbo.address
go
-- Source table: up to three free-form lines plus city / state / zipcode.
create table dbo.address
(
  id      int          not null identity(1,1) primary key clustered ,
  line1   varchar(100) null ,
  line2   varchar(100) null ,
  line3   varchar(100) null ,
  city    varchar(100) null ,
  state   varchar(2)   null ,
  zipcode varchar(9)   null
)
go
-----------------------------------------------------------------------
-- create a work table and rotate the source table such that
-- the work table contains 1 row for each non-null row for each address
-----------------------------------------------------------------------
-- Drop the work table only if it exists, so the script also runs the first time.
if object_id('tempdb..#addr') is not null drop table #addr
go
-- Work table: one row per populated output line of an address.
create table #addr
(
  id      int          not null , -- pk.1: address id
  line_no int          not null , -- pk.2: 1-based position of the line in the output
  value   varchar(100) not null ,
  primary key clustered ( id , line_no )
)
go
-- Unpivot each address into its populated lines. seq fixes the source order
-- (line1, line2, line3, city line); row_number() then renumbers the lines that
-- are present so that line_no is gap-free per address.
-- UNION ALL is used because the branches are already disjoint (each carries a
-- different seq), so the de-duplication done by UNION is wasted work.
insert #addr ( id , line_no , value )
select addr.id , addr.line_no , addr.value
from ( select id      = t.id ,
              line_no = row_number() over ( partition by t.id order by t.seq ) ,
              value   = t.value
       from ( select id    = id ,
                     seq   = 1 ,
                     value = line1
              from dbo.address
              where line1 is not null
              UNION ALL
              select id    = id ,
                     seq   = 2 ,
                     value = line2
              from dbo.address
              where line2 is not null
              UNION ALL
              select id    = id ,
                     seq   = 3 ,
                     value = line3
              from dbo.address
              where line3 is not null
              UNION ALL
              -- city line: "city, state zip", with a 9-digit zip shown as 12345-6789
              select id    = id ,
                     seq   = 4 ,
                     value = ltrim(rtrim(
                               coalesce( city , '' )
                             + case when city is not null and state is not null then ', ' else '' end
                             + coalesce( state , '' )
                             + case when ( city is not null or state is not null ) and zipcode is not null then ' ' else '' end
                             + coalesce( left(zipcode,5) , '' )
                             + case when len(zipcode) = 9 then '-' + right(zipcode,4) else '' end
                             ))
              from dbo.address
              where city    is not null
                 OR state   is not null
                 OR zipcode is not null
            ) t
     ) addr
---------------------------------------------------------------------
-- finally, do another table rotation to build the desired result set
---------------------------------------------------------------------
-- Pivot the work table back to one row per address. The driving set is the
-- list of distinct ids; driving from #addr itself would repeat every address
-- once for each line it has.
select id    = addr.id ,
       line1 = line1.value ,
       line2 = line2.value ,
       line3 = line3.value ,
       line4 = line4.value
from ( select distinct id from #addr ) addr
left join #addr line1 on line1.id = addr.id and line1.line_no = 1
left join #addr line2 on line2.id = addr.id and line2.line_no = 2
left join #addr line3 on line3.id = addr.id and line3.line_no = 3
left join #addr line4 on line4.id = addr.id and line4.line_no = 4
order by addr.id
Assuming that empty values are actually NULL and not empty strings, and that City, State and Zip are required:
-- Unpivot the address columns into (PK, value, position) rows.
;With AddressValues As
    (
    Select PK, Line1 As LineValue, 1 As LinePos
    From AddressTable
    Union All
    Select PK, Line2, 2
    From AddressTable
    Union All
    Select PK, Line3, 3
    From AddressTable
    Union All
    Select PK, [City] + ', ' + [State] + ' ' + [Zip], 4
    From AddressTable
    )
-- Drop the NULL lines and renumber the remaining ones per address.
    , OrderedValues As
    (
    Select PK, LineValue
        , Row_Number() Over( Partition By PK Order By LinePos ) As Num
    From AddressValues
    Where LineValue Is Not Null
    )
-- Pivot back to one row per PK. Each Case yields a value on exactly one row
-- of the group, so Min() picks it; an aggregate is required here because the
-- query groups by PK only.
    , ValuesAsColumns As
    (
    Select PK
        , Min( Case When Num = 1 Then LineValue End ) As Line1
        , Min( Case When Num = 2 Then LineValue End ) As Line2
        , Min( Case When Num = 3 Then LineValue End ) As Line3
        , Min( Case When Num = 4 Then LineValue End ) As Line4
    From OrderedValues
    Group By PK
    )
Update B
Set Line1 = VC.Line1
    , Line2 = VC.Line2
    , Line3 = VC.Line3
    , Line4 = VC.Line4
From #tmpBilling As B
    Join ValuesAsColumns As VC
        On VC.PK = B.PK
EDIT
Here is the same result in the form of a function:
-- Returns one line of the squeezed mailing address.
--
-- Parameters:
--   @Line1, @Line2, @Line3  address lines; NULL lines are skipped
--   @City, @State, @Zip     combined as "City, State Zip" on the last line
--                           (the combined line is NULL if any of them is NULL)
--   @LineNumToReturn        1-based number of the non-NULL line to return
-- Returns:
--   the requested line, or NULL when the address has fewer lines than that.
CREATE FUNCTION dbo.SqueezeAddress
(
    @Line1 varchar(50)
    , @Line2 varchar(50)
    , @Line3 varchar(50)
    , @City varchar(50)
    , @State varchar(50)
    , @Zip varchar(50)
    , @LineNumToReturn int
)
RETURNS varchar(50)
AS
BEGIN
    Declare @Result varchar(50);

    -- Candidate lines in their natural order.
    With AddressValues As
        (
        Select @Line1 As LineValue, 1 As LinePos
        Union All
        Select @Line2, 2
        Union All
        Select @Line3, 3
        Union All
        Select @City + ', ' + @State + ' ' + @Zip, 4
        )
    -- Keep the non-NULL lines and renumber them without gaps.
        , OrderedValues As
        (
        Select LineValue
            , Row_Number() Over( Order By LinePos ) As Num
        From AddressValues
        Where LineValue Is Not Null
        )
    Select @Result = LineValue
    From OrderedValues
    Where Num = @LineNumToReturn;

    Return @Result
END
GO
-- Fetch the four mailing-address lines of one sample address; lines beyond
-- the populated ones come back as NULL.
Select dbo.SqueezeAddress(null, '123 Main St', null, 'Detroit', 'MI', '12345', 1) As Addr1
    , dbo.SqueezeAddress(null, '123 Main St', null, 'Detroit', 'MI', '12345', 2) As Addr2
    , dbo.SqueezeAddress(null, '123 Main St', null, 'Detroit', 'MI', '12345', 3) As Addr3
    , dbo.SqueezeAddress(null, '123 Main St', null, 'Detroit', 'MI', '12345', 4) As Addr4

Resources