Optimal search string in the where clause - sql-server

Want to search the string using PATINDEX and SOUNDEX within the WHERE clause or any optimal way.
I have the following table with some sample data to search the given string using PATINDEX and SOUNDEX.
create table tbl_pat_soundex
(
col_str varchar(max)
);
insert into tbl_pat_soundex values('Smith A Steve');
insert into tbl_pat_soundex values('Steve A Smyth');
insert into tbl_pat_soundex values('A Smeeth Stive');
insert into tbl_pat_soundex values('Steve Smith A');
insert into tbl_pat_soundex values('Smit Steve A');
Note: I have 100 Millions of records in the table to search for.
String to search:- 'Smith A Steve'
SELECT col_str
FROM tbl_pat_soundex
WHERE PATINDEX('%Smith%',col_str) >= 1 AND PATINDEX('%A%',col_str) >= 1 AND PATINDEX('%Steve%',col_str) >= 1
Getting Output:
col_str
--------------
Smith A Steve
Steve Smith A
Expected Output:
col_str
----------------
Smith A Steve
Steve A Smyth
A Smeeth Stive
Steve Smith A
Smit Steve A
Tried:
1:
SELECT col_str
FROM tbl_pat_soundex
WHERE PATINDEX('%Smith%',col_str) >= 1 AND
PATINDEX('%A%',col_str) >= 1 AND
PATINDEX('%Steve%',col_str) >= 1
2:
SELECT col_str
FROM tbl_pat_soundex
WHERE PATINDEX('%'+SOUNDEX('Smith')+'%',SOUNDEX(col_str)) >= 1 AND
PATINDEX('%'+SOUNDEX('A')+'%',SOUNDEX(col_str)) >= 1 AND
PATINDEX('%'+SOUNDEX('Steve')+'%',SOUNDEX(col_str)) >= 1
3:
SELECT col_str
FROM tbl_pat_soundex
WHERE DIFFERENCE('Smith',col_str) = 4 AND
DIFFERENCE('A',col_str) =4 AND
DIFFERENCE('Steve',col_str) = 4
4:
--Following was taking huge time(was kept running more than 20 minutes) to execute.
SELECT DISTINCT col_str
FROM tbl_pat_soundex [a]
CROSS APPLY SplitString([a].[col_str], ' ') [b]
WHERE DIFFERENCE([b].Item,'Smith') >= 1 AND
DIFFERENCE([b].Item,'A') >= 1 AND
DIFFERENCE([b].Item,'Steve') >= 1

With such a lot of rows the only hint I can give you is: Change the design. Each name part should live in a separate column...
The following will work, but I promise it will be slow...
--set up a test db
USE master;
GO
CREATE DATABASE shnugo;
GO
USE shnugo;
GO
--your table, I added an ID-column
create table tbl_pat_soundex
(
ID INT IDENTITY --needed to distinguish rows
,col_str varchar(max)
);
GO
--A function, which will return a blank-separated string as a alphabetically sorted list of distinct soundex values separated by /: "Smith A Steve" comes back as /A000/S310/S530/
CREATE FUNCTION dbo.ComputeSoundex(#str VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
DECLARE #tmpXML XML=CAST('<x>' + REPLACE((SELECT #str AS [*] FOR XML PATH('')),' ','</x><x>') + '</x>' AS XML);
RETURN (SELECT DISTINCT '/' + SOUNDEX(x.value('text()[1]','varchar(max)')) AS [se]
FROM #tmpXML.nodes('/x[text()]') A(x)
ORDER BY se
FOR XML PATH(''),TYPE).value('.','nvarchar(max)') + '/';
END
GO
--Add a column to store a computed soundex-chain permanently
ALTER TABLE tbl_pat_soundex ADD SortedSoundExPattern VARCHAR(MAX);
GO
--We need a trigger to maintain the computed soundex-chain on any insert or update
CREATE TRIGGER RefreshComputeSoundex ON tbl_pat_soundex
FOR INSERT,UPDATE
AS
BEGIN
UPDATE s SET SortedSoundExPattern=dbo.ComputeSoundex(i.col_str)
FROM tbl_pat_soundex s
INNER JOIN inserted i ON s.ID=i.ID;
END
GO
--test data
insert into tbl_pat_soundex(col_str) values
('Smith A Steve')
,('Steve A Smyth')
,('A Smeeth Stive')
,('Steve Smith A')
,('Smit Steve A')
,('Smit Steve') --no A
,('Smit A') --no Steve
,('Smit Smith Robert Peter A') --add noise
,('Shnugo'); --something else entirely
--check the intermediate result
SELECT *
FROM tbl_pat_soundex
/*
+----+---------------------------+-----------------------+
| ID | col_str | SortedSoundExPattern |
+----+---------------------------+-----------------------+
| 1 | Smith A Steve | /A000/S310/S530/ |
+----+---------------------------+-----------------------+
| 2 | Steve A Smyth | /A000/S310/S530/ |
+----+---------------------------+-----------------------+
| 3 | A Smeeth Stive | /A000/S310/S530/ |
+----+---------------------------+-----------------------+
| 4 | Steve Smith A | /A000/S310/S530/ |
+----+---------------------------+-----------------------+
| 5 | Smit Steve A | /A000/S310/S530/ |
+----+---------------------------+-----------------------+
| 6 | Smit Steve | /S310/S530/ |
+----+---------------------------+-----------------------+
| 7 | Smit A | /A000/S530/ |
+----+---------------------------+-----------------------+
| 8 | Smit Smith Robert Peter A | /A000/P360/R163/S530/ |
+----+---------------------------+-----------------------+
| 9 | Shnugo | /S520/ |
+----+---------------------------+-----------------------+
*/
--Now we can start to search:
DECLARE #StringToSearch VARCHAR(MAX)=' A Steve';
WITH SplittedSearchString AS
(
SELECT soundexCode.value('text()[1]','nvarchar(max)') AS SoundExCode
FROM (SELECT CAST('<x>' + REPLACE(dbo.ComputeSoundex(#StringToSearch),'/','</x><x>') + '</x>' AS XML)) A(x)
CROSS APPLY x.nodes('/x[text()]') B(soundexCode)
)
SELECT a.ID,col_str
FROM tbl_pat_soundex a
INNER JOIN SplittedSearchString s On SortedSoundExPattern LIKE '%/' + s.SoundExCode + '/%'
GROUP BY ID,col_str
HAVING COUNT(ID)=(SELECT COUNT(*) FROM SplittedSearchString)
ORDER BY ID
GO
--clean-up
USE master;
GO
DROP DATABASE shnugo;
Short explanation
This is how it works:
The cte will use the same function to return a soundex-chain of alle the input's fragments
The query will then INNER JOIN this with a LIKE test --this will be sloooooow...
The final check is, if the number of hits is the same as number of fragments.
And a final hint: If you want to search for an exact match, but you want to include different writings you can just directly compare the two strings. You might even place an index on the new column SortedSoundExPattern. Due to the way of creation all kinds of "Steven A Smith", "Steeven a Smit" and even in differing order like "Smith Steven A" will produce exactly the same pattern.

In my view, you should try to use dynamic SQL.
For example, you have a table:
create table tbl_pat_soundex
(
id int,
col_str varchar(max)
)
And you have an the following clustered index or any other index(table with over 100 million rows should have some index):
CREATE NONCLUSTERED INDEX myIndex ON dbo.tbl_pat_soundex(id) INCLUDE (col_str)*/
So try to create the following dynamic SQL query based on your logic and execute it. The wish result should look like this:
DECLARE #statement NVARCHAR(4000)
SET #statement = N'
SELECT col_str
FROM tbl_pat_soundex
WHERE col_str like '%Smith%' AND id > 0
UNION ALL
SELECT col_str
FROM tbl_pat_soundex
WHERE col_str like '%Steve%' AND id > 0
UNION ALL
SELECT col_str
FROM tbl_pat_soundex
WHERE
PATINDEX('%Smith%',col_str) >= 1 AND PATINDEX('%A%',col_str) >= 1 AND
PATINDEX('%Steve%',col_str) >= 1
AND id > 0'
Basically, what we do is creating single search queries which will have index seek and then combine all results.
This query will have index seek as we use predicate id > 0(assuming that all ids are greater than 0 or you can write your own negative number):
SELECT col_str
FROM tbl_pat_soundex
WHERE col_str like '%Smith%' AND id > 0

Related

TSQL: How to Flatten a table by combining data into a single XML column

How can I combine 2 tables into one flattened table (1 row per person) so that the columns retrieved from the first table appear as they were but the values in the second table are combined into a single XML column?
e.g. Table 1
Person_gid
Name
1
Mary
2
Barry
3
Liam
Table 2
Person_gid
Subjects
1
Physics
1
Chemistry
How would I write a query to result in something like this:
Person_gid
Name
Books
1
Mary
<Physics, Chemistry>
2
Barry
3
Liam
I understand that I'll need to do a LEFT JOIN here but it's the XML bit that I need help with please.
Note: The XML won't look exactly like the above but I was having trouble writing XML in the markup that I used for the tables.
Please try the following solution.
SQL
-- DDL and sample data population, start
DECLARE #person TABLE (Person_gid INT IDENTITY PRIMARY KEY, person_name VARCHAR(30));
INSERT INTO #person (person_name) VALUES
('Mary'),
('Barry'),
('Liam');
DECLARE #subject TABLE (Person_gid INT, [subject] VARCHAR(30));
INSERT INTO #subject (Person_gid, [subject]) VALUES
(1, 'Physics'),
(1, 'Chemistry');
-- DDL and sample data population, end
SELECT p.*, (
SELECT subject FROM #subject AS s
WHERE s.Person_gid = p.Person_gid
FOR XML PATH(''), TYPE, ROOT('root')
) AS books
FROM #person AS p;
Output
+------------+-------------+---------------------------------------------------------------------+
| Person_gid | person_name | books |
+------------+-------------+---------------------------------------------------------------------+
| 1 | Mary | <root><subject>Physics</subject><subject>Chemistry</subject></root> |
| 2 | Barry | NULL |
| 3 | Liam | NULL |
+------------+-------------+---------------------------------------------------------------------+

SQL SERVER update or insert after left join

I have a Table Animals
Id | Name | Count | -- (other columns not relevant)
1 | horse | 11
2 | giraffe | 20
I want to try to insert or update values from a CSV string
Is it possible to do something like the following in 1 query?
;with results as
(
select * from
(
values ('horse'), ('giraffe'), ('lion')
)
animal_csv(aName)
left join animals on
animals.[Name] = animal_csv.aName
)
update results
set
[Count] = 1 + animals.[Count]
-- various other columns are set here
where Id is not null
--else
--insert into results ([Name], [Count]) values (results.aName, 1)
-- (essentially Where id is null)
It looks like what you're looking for is a table variable or temporary table rather than a common table expression.
If I understand your problem correctly, you are building a result set based on data you're getting from a CSV, merging it by incrementing values, and then returning that result set.
As I read your code, it looks as if your results would look like this:
aName | Id | Name | Count
horse | 1 | horse | 12
giraffe | 2 | giraffe | 21
lion | | |
I think what you're looking for in your final result set is this:
Name | Count
horse | 12
giraffe | 21
lion | 1
First, you can get from your csv and table to a resultset in a single CTE statement:
;WITH animal_csv AS (SELECT * FROM (VALUES('horse'),('giraffe'), ('lion')) a(aName))
SELECT ISNULL(Name, aName) Name
, CASE WHEN [Count] IS NULL THEN 1 ELSE 1 + [Count] END [Count]
FROM animal_csv
LEFT JOIN animals
ON Name = animal_csv.aName
Or, if you want to build your resultset using a table variable:
DECLARE #Results TABLE
(
Name VARCHAR(30)
, Count INT
)
;WITH animal_csv AS (SELECT * FROM (VALUES('horse'),('giraffe'), ('lion')) a(aName))
INSERT #Results
SELECT ISNULL(Name, aName) Name
, CASE WHEN [Count] IS NULL THEN 1 ELSE 1 + [Count] END [Count]
FROM animal_csv
LEFT JOIN animals
ON Name = animal_csv.aName
SELECT * FROM #results
Or, if you just want to use a temporary table, you can build it like this (temp tables are deleted when the connection is released/closed or when they're explicitly dropped):
;WITH animal_csv AS (SELECT * FROM (VALUES('horse'),('giraffe'), ('lion')) a(aName))
SELECT ISNULL(Name, aName) Name
, CASE WHEN [Count] IS NULL THEN 1 ELSE 1 + [Count] END [Count]
INTO #results
FROM animal_csv
LEFT JOIN animals
ON Name = animal_csv.aName
SELECT * FROM #results

SQL Server : except with results from both datasets

I have the following tables:
Stores:
StoreID | Name
1 | Store1
2 | Store2
3 | Store3
EmID | StoreID
1 | 1
2 | 1
3 | 1
1 | 2
3 | 2
Employee:
EmID | Employee | Important
1 | Cashier | 1
2 | Manager | 1
3 | Guard | 0
I need a query to return StoreID and EmID where Employee is important (Important = 1) and the store and employee are not connected. Basically, the result should be:
StoreID | EmId
--------+-------
2 | 2
3 | 1
3 | 2
I have tried joins, outer joins / apply-es, except, cte, temporary tables, but still haven't found the answer.
Can someone help me with the code, or at least point me in the right direction?
Any idea will be very much appreciated.
Thanks.
You use a cross join to get the set of all possible employee/store combinations, and a left join to then remove the combinations that exist in the join table1:
declare #Stores table (StoreID int, Name char(6))
insert into #Stores (StoreID,Name) values
(1,'Store1'),
(2,'Store2'),
(3,'Store3')
declare #Employees table (EmID int, Employee varchar(8), Important bit)
insert into #Employees (EmID,Employee,Important) values
(1,'Cashier',1),
(2,'Manager',1),
(3,'Guard' ,0)
declare #Staffing table (EmID int, StoreID int)
insert into #Staffing (EmID,StoreID) values
(1,1),
(2,1),
(3,1),
(1,2),
(3,2)
select
*
from
#Stores s
cross join
#Employees e
left join
#Staffing st
on
s.StoreID = st.StoreID and
e.EmID = st.EmID
where
e.Important = 1 and
st.EmID is null
Results:
StoreID Name EmID Employee Important EmID StoreID
----------- ------ ----------- -------- --------- ----------- -----------
3 Store3 1 Cashier 1 NULL NULL
2 Store2 2 Manager 1 NULL NULL
3 Store3 2 Manager 1 NULL NULL
1The one I've named Staffing and you didn't name in the question. Note also (for future questions) that my presentation of the sample data takes up approximately as much space as yours in the question, provides the data types, and is a runnable script.
Please use Cross join followed by Left join and filter on IMP and StoreID null.
create table #Stores
(storeID int, Name varchar(100))
create table #ES
(empid int,storeID int)
create table #E
(eid int,employee varchar(100), imp int)
insert into #stores values(
1,'Store1'),
(2,'Store2'),
(3,'Store3')
insert into #ES values(
1,1),(2,1),(3,1),(1,2),(3,2)
insert into #E values
(1,'Cashier',1),
(2,'Manager', 1),
(3,'Guard',0)
select * from #Stores
select * from #ES
select * from #E
select #stores.storeid,#E.eid from #Stores
cross join #E
LEFT join #ES
on #ES.storeid = #Stores.storeid
and #E.eid = #ES.empid
where #E.imp = 1
and #ES.storeID is null
Try this query.
I assumed the table name of the "Employee" is dbo.Employee and table name of "Stores" is dbo.Stores and the intermediate table is "dbo.EmpStore"
SELECT S.StoreID, E.EmID
FROM dbo.Stores S
CROSS JOIN dbo.Employees E
LEFT JOIN dbo.EmpStore ES ON ES.EmID = E.EmID AND ES.StoreID = S.StoreID
WHERE E.Important=1 AND ES.EmID IS NULL

Reverse order of a XML Column in SQL Server

In a SQL Server table, I have a XML column where status are happened (first is oldest, last current status).
I have to write a stored procedure that returns the statuses: newest first, oldest last.
This is what I wrote:
ALTER PROCEDURE [dbo].[GetDeliveryStatus]
#invoiceID nvarchar(255)
AS
BEGIN
SET NOCOUNT ON;
DECLARE #xml xml
SET #xml = (SELECT statusXML
FROM Purchase
WHERE invoiceID = #invoiceID )
SELECT
t.n.value('text()[1]', 'nvarchar(50)') as DeliveryStatus
FROM
#xml.nodes('/statuses/status') as t(n)
ORDER BY
DeliveryStatus DESC
END
Example of value in the statusXML column:
<statuses>
<status>A</status>
<status>B</status>
<status>A</status>
<status>B</status>
<status>C</status>
</statuses>
I want the procedure to return:
C
B
A
B
A
with ORDER BY .... DESC it return ALPHABETIC reversed (C B B A A)
How should I correct my procedure ?
Create a sequence for the nodes based on the existing order then reverse it.
WITH [x] AS (
SELECT
t.n.value('text()[1]', 'nvarchar(50)') as DeliveryStatus
,ROW_NUMBER() OVER (ORDER BY t.n.value('..', 'NVARCHAR(100)')) AS [Order]
FROM
#xml.nodes('/statuses/status') as t(n)
)
SELECT
DeliveryStatus
FROM [x]
ORDER BY [x].[Order] DESC
... results ...
DeliveryStatus
C
B
A
B
A
There is no need to declare a variable first. You can (and you should!) read the needed values from your table column directly. Best was an inline table valued function (rather than a SP just to read something...)
Better performance
inlineable
You can query many InvoiceIDs at once
set-based
Try this (I drop the mock-table at the end - carefull with real data!):
CREATE TABLE Purchase(ID INT IDENTITY,statusXML XML, InvocieID INT, OtherValues VARCHAR(100));
INSERT INTO Purchase VALUES('<statuses>
<status>A</status>
<status>B</status>
<status>A</status>
<status>B</status>
<status>C</status>
</statuses>',100,'Other values of your row');
GO
WITH NumberedStatus AS
(
SELECT ID
,InvocieID
, ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS Nr
,stat.value('.','nvarchar(max)') AS [Status]
,OtherValues
FROM Purchase
CROSS APPLY statusXML.nodes('/statuses/status') AS A(stat)
WHERE InvocieID=100
)
SELECT *
FROM NumberedStatus
ORDER BY Nr DESC
GO
--Clean-Up
--DROP TABLE Purchase;
The result
+---+-----+---+---+--------------------------+
| 1 | 100 | 5 | C | Other values of your row |
+---+-----+---+---+--------------------------+
| 1 | 100 | 4 | B | Other values of your row |
+---+-----+---+---+--------------------------+
| 1 | 100 | 3 | A | Other values of your row |
+---+-----+---+---+--------------------------+
| 1 | 100 | 2 | B | Other values of your row |
+---+-----+---+---+--------------------------+
| 1 | 100 | 1 | A | Other values of your row |
+---+-----+---+---+--------------------------+

How to retrieve unique records having unique values in two columns from a table in SQL Server

I want to query a table where I need the result that contains unique values from two columns together. For e.g.
Table
EnquiryId | EquipmentId | Price
-----------+--------------+-------
1 | E20 | 10
1 | E50 | 40
1 | E60 | 20
2 | E30 | 90
2 | E20 | 10
2 | E90 | 10
3 | E90 | 10
3 | E60 | 10
For each EnquiryId, EquipmentId will be unique in the table. Now I want a result where I can get something like this
EnquiryId | EquipmentId | Price
-----------+--------------+-------
1 | E20 | 10
2 | E30 | 90
3 | E90 | 10
In the result each enquiryId present in the table should be displayed uniquely.
If suppose I have 3 EquipmentIds "E20,E50,E60" for EnquiryId "1".. Any random EquipmentId should be displayed from these three values only.
Any help would be appreciated. Thank you in advance.
QUERY
;WITH cte AS
(
SELECT *,
ROW_NUMBER() OVER
(PARTITION BY enquiryID
ORDER BY enquiryID ) AS RN
FROM tbl
)
SELECT enquiryID,equipmentID,Price
FROM cte
WHERE RN=1
FIND FIDDLE HERE
The following code must help you..
Sorry that I ended up in a lengthy solution only. Run it in your SSMS and see the result.
Declare #tab table (EnquiryId int, EquipmentId varchar(10),Price int)
Insert into #tab values
(1,'E20',10),
(1,'E50',40),
(1,'E60',20),
(2,'E30',90),
(2,'E20',10),
(2,'E90',10),
(3,'E90',10),
(3,'E60',10)
----------------------------------------------
Declare #s int = 1
Declare #e int,#z varchar(10)
Declare #Equipment table (EquipmentId varchar(10),ind int)
Insert into #Equipment (EquipmentId) Select Distinct EquipmentId From #tab
Declare #Enquiry table (id int identity(1,1),EnquiryId int,EquipmentId varchar(10))
Insert into #Enquiry (EnquiryId) Select Distinct EnquiryId From #tab
Set #e = ##ROWCOUNT
While #s <= #e
begin
Select Top 1 #z = T.EquipmentId
From #tab T
Join #Enquiry E On T.EnquiryId = E.EnquiryId
Join #Equipment Eq On Eq.EquipmentId = T.EquipmentId
Where E.id = #s
And Eq.ind is Null
Order by NEWID()
update #Enquiry
Set EquipmentId = #z
Where id = #s
update #Equipment
Set ind = 1
Where EquipmentId = #z
Set #s = #s + 1
End
Select T.EnquiryId,T.EquipmentId,T.Price
From #tab T
left join #Enquiry E on T.EnquiryId = E.EnquiryId
Where T.EquipmentId = E.EquipmentId
You can use GROUP BY (Typical way) to remove duplicate value.
Basic steps are:
Alter table & Add Identity Column.
Group by columns which can be dupicate.
Delete those record.
Check here Remove Duplicate Rows from a Table in SQL Server

Resources