Related
Following is the stored procedure
ALTER PROCEDURE [dbo].[get_data_Dyna]
{
#param1 varchar(max) = null,
#param2 varchar(max) = null,
#start varchar(max) = null,
#end varchar(max) = null
}
AS
SELECT * from table where
(#param1 IS NULL OR column1 IN (SELECT data FROM dbo.delimited_list_to_table(#param1,',')))
AND (#param2 IS NULL OR column2 IN (SELECT data FROM dbo.delimited_list_to_table(#param2,',')))
AND ....?????
How this is working :
All the parameters can be comma seperated
#param1 value can be 'Germany' or 'Germany,USA' or null. This is working as expected.
Same goes for #param2
I'm trying to include rest of the parameters which is expected to work as follows :
#start='0' and #end='100' : In this case, where clause will look like this
...AND val BETWEEN #start AND #end
#start='48,60' and #end='51,99' : In this case, where clause will look like this
...AND ((val Between 48 and 51) or (val Between 60 and 99))
#start='48,60,75' and #end='51,99,203' : In this case, where clause will look like this
...AND ((val Between 48 and 51) or (val Between 60 and 99) or (val Between 75 and 203))
I'm unable to include above 2nd/3rd point correctly. I tried to write it dynamically which is working for single values [Point 1], but how to write point 2/3 ?
Any help is greatly appreciated.
Ok, i think the best approach here would be to use temp tables or table variable.
Lets go with temp tables.
create table #StartEnd (start int not null, end int not null, primary key (start,end))
then we insert from #start and #end into it using dbo.delimited_list_to_table. Now i am not sure about your implementation of it, so i will assume the values are numbered
insert into #StartEnd
select starts.data, ends.data
from dbo.delimited_list_to_table(#start,',') as starts
join dbo.delimited_list_to_table(#end,',') as ends
on starts.index = ends.index
Now we have to filter the values. Two approaches. Join or Exists condition
...
join #StartEnd on val between start and end
...
and exists (select 1 from #StartEnd where val between start and end)
Hope this helps
there you go. The comments / explainations are within the query
-- create a sample table
declare #tbl table
(
val int
)
-- put in some sample data
insert into #tbl
values (48), (60), (51), (99), (75), (203)
-- these are the input parameter
declare #start varchar(100),
#end varchar(100)
-- and these are the input value
select #start = '48,60,75',
#end = '51,99,203'
-- the actual query
; with
start_end as
(
-- here i am using [DelimitedSplit8K][1]
select s = s.Item, e = e.Item
from dbo.[DelimitedSplit8K](#start, ',') s
inner join dbo.[DelimitedSplit8K](#end, ',') e
on s.ItemNumber = e.ItemNumber
)
select t.val
from #tbl t
where exists
(
select *
from start_end x
where t.val between x.s and x.e
)
you can get it here DelimitedSplit8K
Sample input (from our understanding, we guess the your data):
select
* into ##delimit
from (
values
(1 ,'Ger','Ind', 100 )
,(2 ,'Ind',Null, 10 )
,(3 ,'Ger',Null, 24 )
,(4 ,'Ind','Ger', 54 )
,(5 ,'USA','Ind', 56 )
,(6 ,Null,'USA', 75 )-- NULL. But USA is three time came.
,(7 ,'USA','USA', 60 )-- same country with diff val.
,(8 ,'USA','USA', 80 )-- same country with diff val.
) demilit(Id,FromPr,ToPr,Val)
select * from ##delimit
Procedure (you just use this instead of your procedure):
CREATE PROCEDURE [dbo].[get_data_Dyna]
(#param1 varchar(max) = NULL,
#param2 varchar(max) = NULL,
#start varchar(max) = NULL,
#end varchar(max) = NULL)
AS
BEGIN
SELECT *
FROM ##delimit d
JOIN
( --| Here we check the val btw #start and #end
SELECT DISTINCT
s.FinalColumn StartVal, --|
e.FinalColumn EndVal --|
FROM
dbo.WithoutDelimit (#start, ',') s --| S means 'Start'
JOIN
(SELECT *
FROM dbo.WithoutDelimit (#end, ',')) e ON s.id = e.id --| E means 'End'
) se --| se mean StartEnd
ON d.val BETWEEN se.StartVal AND se.EndVal --| Here YOUR CONDITION is accomplished
WHERE
( -- | checks whether
frompr IN -- | column1 in #param1 or not
(SELECT FinalColumn FROM dbo.WithoutDelimit (#param1,',') -- | frompr means, 'column1'
) OR #param1 is NULL -- |
)
and ( -- | checks whether
ToPr in ( -- | column2 in #param2 or not
select FinalColumn from dbo.WithoutDelimit (#param2,',') -- | frompr means, 'column2'
) or #param2 is null -- |
)
end
Call stored procedure:
[get_data_Dyna] null,'usa','75','100,' -- 6 rows
[get_data_Dyna] 'Ind,Ger',null,'1,15','20,30' --2 and 3 rows are selected.
[get_data_Dyna] 'usa','usa','50,60','55,79'
-- 7 and 8 has same country. But due to Val, 8 has been rejected.
[get_data_Dyna] NULL,'usa','70,60','80,79'
-- 6 and 7 and 8 has been selected. Due to val condition.
Function (called from the stored procedure):
alter function WithoutDelimit -- We use one function for all conditions.
(#Parameter varchar (max),
#delimit varchar (1))
returns #FinalTable table (
Id int identity (1,1) -- Auto increment
, FinalColumn varchar (max) -- It returns the values as a column.
) as
begin
;with cte as -- recursive cte.
(
select convert (varchar (255), #Parameter + #delimit) con
, convert (varchar (255), #Parameter + #delimit) want
union all
select convert (varchar (255), stuff (con, 1, CHARINDEX (#demilit,con),'') )
, substring (con, 1, CHARINDEX (#delimit, con) - 1)
from cte
where con <> ''
) insert into #FinalTable (FinalColumn)
select want from cte
where con <> want
return
end
Revert us, if query need update.
I have a table called #Tbl1, Each GROUP is 1 row and I have to extract the number of rows for each to #Tbl_Insert type.
Declare #Tbl1 Table (TableName NVARCHAR(250),ColumnName NVARCHAR(250),DataType NVARCHAR(250),DataValue NVARCHAR(250),InGroup NVARCHAR(250))
Declare #Tbl_Insert Table (ID INT, Name NVARCHAR(250), Age INT)
-- Sample Data
Insert Into #Tbl1 values ('#Tbl_Insert','ID','INT','1','Group1'),('#Tbl_Insert','Name','NVARCHAR(250)','John.Adam','Group1'),('#Tbl_Insert','Age','INT','10','Group1')
Insert Into #Tbl1 values ('#Tbl_Insert','ID','INT','2','Group2'),('#Tbl_Insert','Name','NVARCHAR(250)','Andy.Law','Group2'),('#Tbl_Insert','Age','INT','18','Group2')
I can convert #tbl1 to row by row into #Table_TEMP
Declare #Table_TEMP (Data nvarchar(max))
Insert Into #Table_TEMP
SELECT LEFT([DataValues] , LEN([DataValues] )-1)
FROM #Tbl1 AS extern
CROSS APPLY
(
SELECT Concat('''', Replace( ISNULL([DataValue],''), '''','' ) + ''',')
FROM #Tbl1 AS intern
WHERE extern.InGroup = intern.InGroup
Order By InGroup, ColumnName
FOR XML PATH('')
) pre_trimmed ( [DataValues])
GROUP BY InGroup, [DataValues]
I have to extract the number of rows in #Tbl1 ( Or #Table_TEMP) to #Tbl_Insert.
I don't want to use cursor to loop Insert row by row in #Table_TEMP, because, when you met with big data (example > 10000 rows). It's run to slow.
Please help.
I found sample in stackorverflow
Declare #tbl_Temp Table (Data NVARCHAR(MAX))
Declare #tbl2 Table (A NVARCHAR(MAX),B NVARCHAR(MAX),C NVARCHAR(MAX))
Insert Into #tbl_Temp values ('a1*b1*c1')
INSERT INTO #tbl2 (A,B,C)
SELECT PARSENAME(REPLACE(Data,'*','.'),3)
,PARSENAME(REPLACE(Data,'*','.'),2)
,PARSENAME(REPLACE(Data,'*','.'),1)
FROM #tbl_Temp
select * from #tbl2
It's nearly the same, but,
My data have "DOT", can not use PARSENAME
I must know numbers of DOT to Build Dynamics SQL??
PARSENAME only support 3 "DOT", It's null when More Dot.
EXAMPLE:
Declare #ObjectName nVarChar(1000)
Set #ObjectName = 'HeadOfficeSQL1.Northwind.dbo.Authors'
SELECT
PARSENAME(#ObjectName, 5) as Server4,
PARSENAME(#ObjectName, 4) as Server,
PARSENAME(#ObjectName, 3) as DB,
PARSENAME(#ObjectName, 2) as Owner,
PARSENAME(#ObjectName, 1) as Object
If, i understand correctly you will need to use apply in order to fetch the records & insert the data into other table
insert into #Tbl_Insert (ID, Name, Age)
select max(a.id) [id], max(a.Name) [Name], max(a.Age) [Age] from #Tbl1 t
cross apply
(values
(case when t.ColumnName = 'ID' then t.DataValue end,
case when t.ColumnName = 'Name' then t.DataValue end,
case when t.ColumnName = 'Age' then t.DataValue end, t.InGroup)
) as a(id, Name, Age, [Group])
group by a.[Group]
select * from #Tbl_Insert
I do both #Tbl_Insert & create 1 store to do like PARSENAME. It's improved performance.
create function dbo.fnGetCsvPart(#csv varchar(8000),#index tinyint, #last bit = 0)
returns varchar(4000)
as
/* function to retrieve 0 based "column" from csv string */
begin
declare #i int; set #i = 0
while 1 = 1
begin
if #index = 0
begin
if #last = 1 or charindex(',',#csv,#i+1) = 0
return substring(#csv,#i+1,len(#csv)-#i+1)
else
return substring(#csv,#i+1,charindex(',',#csv,#i+1)-#i-1)
end
select #index = #index-1, #i = charindex(',',#csv,#i+1)
if #i = 0 break
end
return null
end
GO
I've got dirty data in a column with variable alpha length. I just want to strip out anything that is not 0-9.
I do not want to run a function or proc. I have a script that is similar that just grabs the numeric value after text, it looks like this:
Update TableName
set ColumntoUpdate=cast(replace(Columnofdirtydata,'Alpha #','') as int)
where Columnofdirtydata like 'Alpha #%'
And ColumntoUpdate is Null
I thought it would work pretty good until I found that some of the data fields I thought would just be in the format Alpha # 12345789 are not.
Examples of data that needs to be stripped
AB ABCDE # 123
ABCDE# 123
AB: ABC# 123
I just want the 123. It is true that all data fields do have the # prior to the number.
I tried substring and PatIndex, but I'm not quite getting the syntax correct or something. Anyone have any advice on the best way to address this?
See this blog post on extracting numbers from strings in SQL Server. Below is a sample using a string in your example:
DECLARE #textval NVARCHAR(30)
SET #textval = 'AB ABCDE # 123'
SELECT LEFT(SUBSTRING(#textval, PATINDEX('%[0-9.-]%', #textval), 8000),
PATINDEX('%[^0-9.-]%', SUBSTRING(#textval, PATINDEX('%[0-9.-]%', #textval), 8000) + 'X') -1)
Here is an elegant solution if your server supports the TRANSLATE function (on sql server it's available on sql server 2017+ and also sql azure).
First, it replaces any non numeric characters with a # character.
Then, it removes all # characters.
You may need to add additional characters that you know may be present in the second parameter of the TRANSLATE call.
select REPLACE(TRANSLATE([Col], 'abcdefghijklmnopqrstuvwxyz+()- ,#+', '##################################'), '#', '')
You can use stuff and patindex.
stuff(Col, 1, patindex('%[0-9]%', Col)-1, '')
SQL Fiddle
This works well for me:
CREATE FUNCTION [dbo].[StripNonNumerics]
(
#Temp varchar(255)
)
RETURNS varchar(255)
AS
Begin
Declare #KeepValues as varchar(50)
Set #KeepValues = '%[^0-9]%'
While PatIndex(#KeepValues, #Temp) > 0
Set #Temp = Stuff(#Temp, PatIndex(#KeepValues, #Temp), 1, '')
Return #Temp
End
Then call the function like so to see the original something next to the sanitized something:
SELECT Something, dbo.StripNonNumerics(Something) FROM TableA
In case if there are some characters possible between digits (e.g. thousands separators), you may try following:
declare #table table (DirtyCol varchar(100))
insert into #table values
('AB ABCDE # 123')
,('ABCDE# 123')
,('AB: ABC# 123')
,('AB#')
,('AB # 1 000 000')
,('AB # 1`234`567')
,('AB # (9)(876)(543)')
;with tally as (select top (100) N=row_number() over (order by ##spid) from sys.all_columns),
data as (
select DirtyCol, Col
from #table
cross apply (
select (select C + ''
from (select N, substring(DirtyCol, N, 1) C from tally where N<=datalength(DirtyCol)) [1]
where C between '0' and '9'
order by N
for xml path(''))
) p (Col)
where p.Col is not NULL
)
select DirtyCol, cast(Col as int) IntCol
from data
Output is:
DirtyCol IntCol
--------------------- -------
AB ABCDE # 123 123
ABCDE# 123 123
AB: ABC# 123 123
AB # 1 000 000 1000000
AB # 1`234`567 1234567
AB # (9)(876)(543) 9876543
For update, add ColToUpdate to select list of the data cte:
;with num as (...),
data as (
select ColToUpdate, /*DirtyCol, */Col
from ...
)
update data
set ColToUpdate = cast(Col as int)
CREATE FUNCTION FN_RemoveNonNumeric (#Input NVARCHAR(512))
RETURNS NVARCHAR(512)
AS
BEGIN
DECLARE #Trimmed NVARCHAR(512)
SELECT #Trimmed = #Input
WHILE PATINDEX('%[^0-9]%', #Trimmed) > 0
SELECT #Trimmed = REPLACE(#Trimmed, SUBSTRING(#Trimmed, PATINDEX('%[^0-9]%', #Trimmed), 1), '')
RETURN #Trimmed
END
GO
SELECT dbo.FN_RemoveNonNumeric('ABCDE# 123')
Pretty late to the party, I found the following which I though worked brilliantialy.. if anyone is still looking
SELECT
(SELECT CAST(CAST((
SELECT SUBSTRING(FieldToStrip, Number, 1)
FROM master..spt_values
WHERE Type='p' AND Number <= LEN(FieldToStrip) AND
SUBSTRING(FieldToStrip, Number, 1) LIKE '[0-9]' FOR XML Path(''))
AS xml) AS varchar(MAX)))
FROM
SourceTable
Here's a version which pulls all digits from a string; i.e. given I'm 35 years old; I was born in 1982. The average family has 2.4 children. this would return 35198224. i.e. it's good where you've got numeric data which may have been formatted as a code (e.g. #123,456,789 / 123-00005), but isn't appropriate if you're looking to pull out specific numbers (i.e. as opposed to digits / just the numeric characters) from the text. Also it only handles digits; so won't return negative signs (-) or periods .).
declare #table table (id bigint not null identity (1,1), data nvarchar(max))
insert #table (data)
values ('hello 123 its 45613 then') --outputs: 12345613
,('1 some other string 98 example 4') --outputs: 1984
,('AB ABCDE # 123') --outputs: 123
,('ABCDE# 123') --outputs: 123
,('AB: ABC# 123') --outputs: 123
; with NonNumerics as (
select id
, data original
--the below line replaces all digits with blanks
, replace(replace(replace(replace(replace(replace(replace(replace(replace(replace(data,'0',''),'1',''),'2',''),'3',''),'4',''),'5',''),'6',''),'7',''),'8',''),'9','') nonNumeric
from #table
)
--each iteration of the below CTE removes another non-numeric character from the original string, putting the result into the numerics column
, Numerics as (
select id
, replace(original, substring(nonNumeric,1,1), '') numerics
, replace(nonNumeric, substring(nonNumeric,1,1), '') charsToreplace
, len(replace(nonNumeric, substring(nonNumeric,1,1), '')) charsRemaining
from NonNumerics
union all
select id
, replace(numerics, substring(charsToreplace,1,1), '') numerics
, replace(charsToreplace, substring(charsToreplace,1,1), '') charsToreplace
, len(replace(charsToreplace, substring(charsToreplace,1,1), '')) charsRemaining
from Numerics
where charsRemaining > 0
)
--we select only those strings with `charsRemaining=0`; i.e. the rows for which all non-numeric characters have been removed; there should be 1 row returned for every 1 row in the original data set.
select * from Numerics where charsRemaining = 0
This code works by removing all the digits (i.e. the characters we want) from a the given strings by replacing them with blanks. Then it goes through the original string (which includes the digits) removing all of the characters that were left (i.e. the non-numeric characters), thus leaving only the digits.
The reason we do this in 2 steps, rather than just removing all non-numeric characters in the first place is there are only 10 digits, whilst there are a huge number of possible characters; so replacing that small list is relatively fast; then gives us a list of those non-numeric characters which actually exist in the string, so we can then replace that small set.
The method makes use of recursive SQL, using common table expressions (CTEs).
To add on to Ken's answer, this handles commas and spaces and parentheses
--Handles parentheses, commas, spaces, hyphens..
declare #table table (c varchar(256))
insert into #table
values
('This is a test 111-222-3344'),
('Some Sample Text (111)-222-3344'),
('Hello there 111222 3344 / How are you?'),
('Hello there 111 222 3344 ? How are you?'),
('Hello there 111 222 3344. How are you?')
select
replace(LEFT(SUBSTRING(replace(replace(replace(replace(replace(c,'(',''),')',''),'-',''),' ',''),',',''), PATINDEX('%[0-9.-]%', replace(replace(replace(replace(replace(c,'(',''),')',''),'-',''),' ',''),',','')), 8000),
PATINDEX('%[^0-9.-]%', SUBSTRING(replace(replace(replace(replace(replace(c,'(',''),')',''),'-',''),' ',''),',',''), PATINDEX('%[0-9.-]%', replace(replace(replace(replace(replace(c,'(',''),')',''),'-',''),' ',''),',','')), 8000) + 'X') -1),'.','')
from #table
Create function fn_GetNumbersOnly(#pn varchar(100))
Returns varchar(max)
AS
BEGIN
Declare #r varchar(max) ='', #len int ,#c char(1), #x int = 0
Select #len = len(#pn)
while #x <= #len
begin
Select #c = SUBSTRING(#pn,#x,1)
if ISNUMERIC(#c) = 1 and #c <> '-'
Select #r = #r + #c
Select #x = #x +1
end
return #r
End
In your case It seems like the # will always be after teh # symbol so using CHARINDEX() with LTRIM() and RTRIM() would probably perform the best. But here is an interesting method of getting rid of ANY non digit. It utilizes a tally table and table of digits to limit which characters are accepted then XML technique to concatenate back to a single string without the non-numeric characters. The neat thing about this technique is it could be expanded to included ANY Allowed characters and strip out anything that is not allowed.
DECLARE #ExampleData AS TABLE (Col VARCHAR(100))
INSERT INTO #ExampleData (Col) VALUES ('AB ABCDE # 123'),('ABCDE# 123'),('AB: ABC# 123')
DECLARE #Digits AS TABLE (D CHAR(1))
INSERT INTO #Digits (D) VALUES ('0'),('1'),('2'),('3'),('4'),('5'),('6'),('7'),('8'),('9')
;WITH cteTally AS (
SELECT
I = ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM
#Digits d10
CROSS APPLY #Digits d100
--add more cross applies to cover longer fields this handles 100
)
SELECT *
FROM
#ExampleData e
OUTER APPLY (
SELECT CleansedPhone = CAST((
SELECT TOP 100
SUBSTRING(e.Col,t.I,1)
FROM
cteTally t
INNER JOIN #Digits d
ON SUBSTRING(e.Col,t.I,1) = d.D
WHERE
I <= LEN(e.Col)
ORDER BY
t.I
FOR XML PATH('')) AS VARCHAR(100))) o
Declare #MainTable table(id int identity(1,1),TextField varchar(100))
INSERT INTO #MainTable (TextField)
VALUES
('6B32E')
declare #i int=1
Declare #originalWord varchar(100)=''
WHile #i<=(Select count(*) from #MainTable)
BEGIN
Select #originalWord=TextField from #MainTable where id=#i
Declare #r varchar(max) ='', #len int ,#c char(1), #x int = 0
Select #len = len(#originalWord)
declare #pn varchar(100)=#originalWord
while #x <= #len
begin
Select #c = SUBSTRING(#pn,#x,1)
if(#c!='')
BEGIN
if ISNUMERIC(#c) = 0 and #c <> '-'
BEGIN
Select #r = cast(#r as varchar) + cast(replace((SELECT ASCII(#c)-64),'-','') as varchar)
end
ELSE
BEGIN
Select #r = #r + #c
END
END
Select #x = #x +1
END
Select #r
Set #i=#i+1
END
I have created a function for this
Create FUNCTION RemoveCharacters (#text varchar(30))
RETURNS VARCHAR(30)
AS
BEGIN
declare #index as int
declare #newtexval as varchar(30)
set #index = (select PATINDEX('%[A-Z.-/?]%', #text))
if (#index =0)
begin
return #text
end
else
begin
set #newtexval = (select STUFF ( #text , #index , 1 , '' ))
return dbo.RemoveCharacters(#newtexval)
end
return 0
END
GO
Here is the answer:
DECLARE #t TABLE (tVal VARCHAR(100))
INSERT INTO #t VALUES('123')
INSERT INTO #t VALUES('123S')
INSERT INTO #t VALUES('A123,123')
INSERT INTO #t VALUES('a123..A123')
;WITH cte (original, tVal, n)
AS
(
SELECT t.tVal AS original,
LOWER(t.tVal) AS tVal,
65 AS n
FROM #t AS t
UNION ALL
SELECT tVal AS original,
CAST(REPLACE(LOWER(tVal), LOWER(CHAR(n)), '') AS VARCHAR(100)),
n + 1
FROM cte
WHERE n <= 90
)
SELECT t1.tVal AS OldVal,
t.tval AS NewVal
FROM (
SELECT original,
tVal,
ROW_NUMBER() OVER(PARTITION BY tVal + original ORDER BY original) AS Sl
FROM cte
WHERE PATINDEX('%[a-z]%', tVal) = 0
) t
INNER JOIN #t t1
ON t.original = t1.tVal
WHERE t.sl = 1
You can create SQL CLR scalar function in order to be able to use regular expressions like replace patterns.
Here you can find example of how to create such function.
Having such function will solve the issue with just the following lines:
SELECT [dbo].[fn_Utils_RegexReplace] ('AB ABCDE # 123', '[^0-9]', '');
SELECT [dbo].[fn_Utils_RegexReplace] ('ABCDE# 123', '[^0-9]', '');
SELECT [dbo].[fn_Utils_RegexReplace] ('AB: ABC# 123', '[^0-9]', '');
More important, you will be able to solve more complex issues as the regular expressions will bring a whole new world of options directly in your T-SQL statements.
Use this:
REPLACE(TRANSLATE(SomeString, REPLACE(TRANSLATE(SomeString, '0123456789', '##########'), '#', ''), REPLICATE('#', LEN(REPLACE(TRANSLATE(SomeString, '0123456789', '##########'), '#', '') + 'x') - 1)), '#', '')
Demo:
DROP TABLE IF EXISTS #MyTempTable;
CREATE TABLE #MyTempTable (SomeString VARCHAR(255));
INSERT INTO #MyTempTable
VALUES ('ssss123ssg99d362sdg')
, ('hey 62q&*^(n43')
, (NULL)
, ('')
, ('hi')
, ('123');
SELECT SomeString
, REPLACE(TRANSLATE(SomeString, REPLACE(TRANSLATE(SomeString, '0123456789', '##########'), '#', ''), REPLICATE('#', LEN(REPLACE(TRANSLATE(SomeString, '0123456789', '##########'), '#', '') + 'x') - 1)), '#', '')
FROM #MyTempTable;
DROP TABLE IF EXISTS #MyTempTable;
Results:
SomeString
(No column name)
ssss123ssg99d362sdg
12399362
hey62q&*^(n43
6243
NULL
NULL
hi
123
123
While the OP wanted to "strip out anything that is not 0-9", the post is also tagged with "substring" and "patindex", and the OP mentioned the concern "not quite getting the syntax correct or something". To address that the requirements note that "all data fields do have the # prior to the number" and to provide an answer that addresses the challenges with substring/patindex, consider the following:
/* A sample select */
;WITH SampleValues AS
( SELECT 'AB ABCDE # 123' [Columnofdirtydata]
UNION ALL SELECT 'AB2: ABC# 123')
SELECT
s.Columnofdirtydata,
f1.pos1,
'['+ f2.substr +']' [InspectOutput]
FROM
SampleValues s
CROSS APPLY (SELECT PATINDEX('%# %',s.Columnofdirtydata) [pos1]) f1
CROSS APPLY (SELECT SUBSTRING(s.Columnofdirtydata, f1.pos1 + LEN('#-'),LEN(s.Columnofdirtydata)) [substr]) f2
/* Using update scenario from OP */
UPDATE t1
SET t1.Columntoupdate = CAST(f2.substr AS INT)
FROM
TableName t1
CROSS APPLY (SELECT PATINDEX('%# %',t1.Columnofdirtydata) [pos1]) f1
CROSS APPLY (SELECT SUBSTRING(t1.Columnofdirtydata, f1.pos1 + LEN('#-'),LEN(t1.Columnofdirtydata)) [substr]) f2
Note that my syntax advice for patindex/substring, is to:
consider using APPLY as a way to temporarily alias results from one function for use as parameters in the next. It's not uncommon to (in ETL, for example) need to parse out parameter/position-based substrings in an updatable column of a staging table. If you need to "debug" and potentially fix some parsing logic, this style will help.
consider using LEN('PatternSample') in your substring logic, to account for reusing this pattern or adjusting it when your source data changes (instead of "+ 1"
SUBSTRING() requires a length parameter, but it can be greater than the length of the string. Therefore, if you are getting "the rest of the string" after the pattern, you can just use "The source length"
DECLARE #STR VARCHAR(400)
DECLARE #specialchars VARCHAR(50) = '%[~,#,#,$,%,&,*,(,),!^?:]%'
SET #STR = '1, 45 4,3 68.00-'
WHILE PATINDEX( #specialchars, #STR ) > 0
---Remove special characters using Replace function
SET #STR = Replace(Replace(REPLACE( #STR, SUBSTRING( #STR, PATINDEX( #specialchars, #STR ), 1 ),''),'-',''), ' ','')
SELECT #STR
SELECT REGEXP_REPLACE( col, '[^[:digit:]]', '' ) AS new_col FROM my_table
I am trying to get information for products that have an ID that is contained in a list. The problem is that the list contains some single values and some range values:
PX03 - PX069, PX20, PX202, PX25 - PX270, PX250 - PX2509, PX251, PX2511 -
PX2513
Basically what I am looking for is some way to take a list or string containing both values and ranges and the end output is a table or list that has all of the values within the ranges individually so that I can loop through them.
I have a stored procedure that loops through all the ID's in the main products table that use the 'PX' prefix, but the table has all ids (i.e. PX 1 - 9999, LX 00001 - 99999) and I only want to search through those contained in the above list. I could write it out all the id's individually but some of the ranges contain many values, which would be time consuming to go through.
My idea was to create a separate table containing this list, in which there would be three columns: an identity column, and then one column each for the beginning and end of the range. Any items that do not have a range would just have the same value for beginning and end range, i.e.:
----------------------------------
rownum | range_start | range_end|
----------------------------------
1 PX03 PX069
2 PX20 PX20
3 PX202 PX202
4 PX25 PX25
5 PX250 PX2509
and then populating a table using something like:
SELECT id from product_table
WHERE id BETWEEN listtable.range_start AND listtable.range_end
where product_table is my original table with the product id's and their information and listtable is the new table I just created. This would give me:
id|
---
PX03
PX030
PX031
PX032
PX033
.
.
.
PX067
PX068
PX069
PX20
PX202
PX25
PX250
PX251
etc.
but I am thinking I would need to iterate through the list and I am not sure how to do that. Any ideas, hints or suggestions?
UPDATE
After creating the table using the solution given by #asantaballa, it was as simple as using an inner join:
SELECT d.id
FROM product_table d
INNER JOIN #RangeTable r
ON d.id BETWEEN r.RangeFrom AND r.RangeTo
See if this works for you for the part about converting the string to a table.
Declare #StrList Varchar(1000) = 'PX03 - PX069, PX20, PX202, PX25 - PX270, PX250 - PX2509, PX251, PX2511 - PX2513'
Declare #RangeTable Table (RangeFrom VarChar(32), RangeTo VarChar(32))
Select #StrList = Replace(#StrList,' ', '') + ','
Declare #StrListItem Varchar(32)
While CHARINDEX(',', #StrList) > 0
Begin
Select #StrListItem = SUBSTRING(#StrList,1,CHARINDEX(',', #StrList) - 1)
Declare
#RangeFrom VarChar(32)
, #RangeTo VarChar(32)
If CHARINDEX('-', #StrListItem) = 0
Begin
Select
#RangeFrom = #StrListItem
, #RangeTo = #StrListItem
End
Else
Begin
Select
#RangeFrom = SUBSTRING(#StrListItem, 1, CHARINDEX('-', #StrListItem) - 1)
, #RangeTo = SUBSTRING(#StrListItem, CHARINDEX('-', #StrListItem) + 1, LEN(#StrListItem) - CHARINDEX('-', #StrListItem))
End
Insert Into #RangeTable (RangeFrom, RangeTo) Values (#RangeFrom, #RangeTo)
Select #StrList = SUBSTRING(#StrList, CHARINDEX(',', #StrList) + 1, LEN(#StrList) - CHARINDEX(',', #StrList))
End
Select * From #RangeTable
Here is your string and product_table
DECLARE #STR VARCHAR(100) = 'PX03 - PX069, PX20, PX202, PX25 - PX270, PX250 - PX2509, PX251, PX2511 - PX2513'
SELECT * INTO #product_table
FROM
(
SELECT 'PX4' PRODID
UNION ALL
SELECT 'PX26'
UNION ALL
SELECT 'PX75'
UNION ALL
SELECT 'PX77'
)TAB
Now create a table to hold the value
CREATE TABLE #listtable(ROWNUM int IDENTITY(1,1),range_start VARCHAR(100),range_end VARCHAR(100))
Now insert the splitted value to the table.
INSERT INTO #listtable
SELECT
ISNULL(PARSENAME(REPLACE(Split.a.value('.', 'VARCHAR(100)'),'-','.'),2),Split.a.value('.', 'VARCHAR(100)')) 'range_start' ,
PARSENAME(REPLACE(Split.a.value('.', 'VARCHAR(100)'),'-','.'),1) 'range_end'
FROM
(
SELECT CAST ('<M>' + REPLACE(#STR, ',', '</M><M>') + '</M>' AS XML) AS Data
) AS A
CROSS APPLY Data.nodes ('/M') AS Split(a)
Since Id is string, you need a function to extract numbers from Id(function created by God of SQL Server - Pinal Dave)
CREATE FUNCTION dbo.udf_GetNumeric
(#strAlphaNumeric VARCHAR(256))
RETURNS VARCHAR(256)
AS
BEGIN
DECLARE #intAlpha INT
SET #intAlpha = PATINDEX('%[^0-9]%', #strAlphaNumeric)
BEGIN
WHILE #intAlpha > 0
BEGIN
SET #strAlphaNumeric = STUFF(#strAlphaNumeric, #intAlpha, 1, '' )
SET #intAlpha = PATINDEX('%[^0-9]%', #strAlphaNumeric )
END
END
RETURN ISNULL(#strAlphaNumeric,0)
END
First of all keep in mind that we will not get PX1,PX2,PX3,PX4 if you give id BETWEEN listtable.range_start AND listtable.range_end because those are of varchar type and not numbers. So we need to extract numbers from each PX and get the values between them and append PX.
Here is the query which filters the IDs in product_table which are in the range between listtable
;WITH CTE AS
(
SELECT ROWNUM,CAST(dbo.udf_GetNumeric(range_start)AS INT) NUMBERS,
CAST(dbo.udf_GetNumeric(range_end)AS INT) RTO1
FROM #listtable
UNION ALL
SELECT T.ROWNUM,NUMBERS+1,RTO1
FROM #listtable T
JOIN CTE ON CTE.ROWNUM = T.ROWNUM
WHERE NUMBERS < RTO1
)
SELECT PRODID IDS--,ROWNUM,NUMBERS NUMS,'PX'+CAST(NUMBERS AS VARCHAR(10)) IDS2
FROM CTE
JOIN #product_table ON PRODID='PX'+CAST(NUMBERS AS VARCHAR(10))
ORDER BY NUMBERS
option (MaxRecursion 0)
SQL FIDDLE
I have written the following function that takes in two strings (comma-separated), splits them into two different temp tables and then uses those temp tables to find what percentage of words match in those two temp tables. The problem is that when I am using it per row basis on a data set of about 200k rows, the query times out!
Are there any optimizations that you can see that can be done?
ALTER FUNCTION [GetWordSimilarity](#String varchar(8000),
#String2 varchar(8000),#Delimiter char(1))
returns decimal(16,2)
as
begin
declare #result as decimal (16,2)
declare #temptable table (items varchar(8000))
declare #temptable2 table (items varchar(8000))
declare #numberOfCommonWords decimal(16,2)
declare #countTable1 decimal(16,2)
declare #countTable2 decimal(16,2)
declare #denominator decimal(16,2)
set #result = 0.0 --dummy value
declare #idx int
declare #slice varchar(8000)
select #idx = 1
if len(#String)<1 or #String is null or len(#String2) = 0 or #String2 is null return 0.0
--populating #temptable
while #idx!= 0
begin
set #idx = charindex(#Delimiter,#String)
if #idx!=0
set #slice = left(#String,#idx - 1)
else
set #slice = #String
if(len(#slice)>0)
insert into #temptable(Items) values(ltrim(rtrim(#slice)))
set #String = right(#String,len(#String) - #idx)
if len(#String) = 0 break
end
select #idx = 1
----populating #temptable2
while #idx!= 0
begin
set #idx = charindex(#Delimiter,#String2)
if #idx!=0
set #slice = left(#String2,#idx - 1)
else
set #slice = #String2
if(len(#slice)>0)
insert into #temptable2(Items) values(ltrim(rtrim(#slice)))
set #String2 = right(#String2,len(#String2) - #idx)
if len(#String2) = 0 break
end
--calculating percentage of words match
if (((select COUNT(*) from #temptable) = 0) or ((select COUNT(*) from #temptable2) = 0))
return 0.0
select #numberOfCommonWords = COUNT(*) from
(
select distinct items from #temptable
intersect
select distinct items from #temptable2
) a
select #countTable1 = COUNT (*) from #temptable
select #countTable2 = COUNT (*) from #temptable2
if(#countTable1 > #countTable2) set #denominator = #countTable1
else set #denominator = #countTable2
set #result = #numberOfCommonWords/#denominator
return #result
end
Thanks a bunch !
There is no way to write a T SQL UDF with heavy string manipulation inside that will behave OK on large number of rows. You will get some gain if you use the Numbers table, though:
declare
#col_list varchar(1000),
#sep char(1)
set #col_list = 'TransactionID, ProductID, ReferenceOrderID, ReferenceOrderLineID, TransactionDate, TransactionType, Quantity, ActualCost, ModifiedDate'
set #sep = ','
select substring(#col_list, n, charindex(#sep, #col_list + #sep, n) - n)
from numbers where substring(#sep + #col_list, n, 1) = #sep
and n < len(#col_list) + 1
Your best course of action would be to write the whole thing in SQLCLR.
The problem of course is with the design. You shouldn't be storing comma-separated data in a SQL database to start with.
But, I guess we're stuck with it for now.
One thing to consider is converting the function to SQLCLR; SQL by itself is not very good with string operations. (Well, in fact, no language is good with string operations IMHO but SQL really is bad at it =)
The splitter you use to fill #Temptables 1 & 2 can be optimized by using the code from Jeff Moden who wrote several fantastic articles of which the last one can be found here : http://www.sqlservercentral.com/articles/Tally+Table/72993/
Taking his splitter + optimizing the rest of the code a bit I managed to get from 771 seconds to 305 seconds on a 200K random data sample.
Some things to note: the results aren't quite the same. I checked some manually and I actually think the new results are more accurate but don't really have time to go bughunting on both versions.
I tried to convert this to a more set-based approach where I first load all the words in a table that has all words for all row_id's and then join them back together. Although the joining is quite fast, it simply takes too long to create the initial tables so it even loses out on the original function.
Maybe I'll try to figure out another way to make it faster but for now I hope this will help you out a bit.
ALTER FUNCTION [GetWordSimilarity2](#String1 varchar(8000),
#String2 varchar(8000),#Delimiter char(1))
returns decimal(16,2)
as
begin
declare #temptable1 table (items varchar(8000), row_id int IDENTITY(1, 1), PRIMARY KEY (items, row_id))
declare #temptable2 table (items varchar(8000), row_id int IDENTITY(1, 1), PRIMARY KEY (items, row_id))
declare #numberOfCommonWords decimal(16,2)
declare #countTable1 decimal(16,2)
declare #countTable2 decimal(16,2)
-- based on code from Jeff Moden (http://www.sqlservercentral.com/articles/Tally+Table/72993/)
--populating #temptable1
;WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
-- for both a performance gain and prevention of accidental "overruns"
SELECT TOP (ISNULL(DATALENGTH(#String1),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
SELECT 1 UNION ALL
SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(#String1,t.N,1) = #Delimiter
),
cteLen(N1,L1) AS(--==== Return start and length (for use in substring)
SELECT s.N1,
ISNULL(NULLIF(CHARINDEX(#Delimiter,#String1,s.N1),0)-s.N1,8000)
FROM cteStart s
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
INSERT #temptable1 (items)
SELECT Item = SUBSTRING(#String1, l.N1, l.L1)
FROM cteLen l
SELECT #countTable1 = ##ROWCOUNT
----populating #temptable2
;WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
-- for both a performance gain and prevention of accidental "overruns"
SELECT TOP (ISNULL(DATALENGTH(#String2),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
SELECT 1 UNION ALL
SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(#String2,t.N,1) = #Delimiter
),
cteLen(N1,L1) AS(--==== Return start and length (for use in substring)
SELECT s.N1,
ISNULL(NULLIF(CHARINDEX(#Delimiter,#String2,s.N1),0)-s.N1,8000)
FROM cteStart s
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
INSERT #temptable2 (items)
SELECT Item = SUBSTRING(#String2, l.N1, l.L1)
FROM cteLen l
SELECT #countTable2 = ##ROWCOUNT
--calculating percentage of words match
if #countTable1 = 0 OR #countTable2 = 0
return 0.0
select #numberOfCommonWords = COUNT(DISTINCT t1.items)
from #temptable1 t1
JOIN #temptable2 t2
ON t1.items = t2.items
RETURN #numberOfCommonWords / (CASE WHEN (#countTable1 > #countTable2) THEN #countTable1 ELSE #countTable2 END)
end