Port STRING_AGG to FOR XML - sql-server

I'm currently working on Data Discovery and Classification and I have a query that allows me to see a preview of the data while doing the classification.
This is an example of how it works against AdventureWorks:
DECLARE @TableName VARCHAR(100) = 'Product'
DROP TABLE IF EXISTS #ColumnsToDisplay
SELECT ROW_NUMBER () OVER (ORDER BY tab.name) AS Iteration,
SCHEMA_NAME (tab.schema_id) AS schema_name,
tab.name AS table_name,
--col.column_id,
col.name AS column_name,
--t.name AS data_type,
--col.max_length,
--col.precision,
CAST(NULL AS VARCHAR(MAX)) AS DataSample
INTO #ColumnsToDisplay
FROM sys.tables AS tab
JOIN sys.columns AS col
ON col.object_id = tab.object_id
--LEFT JOIN sys.types AS t
-- ON col.user_type_id = t.user_type_id
WHERE tab.name = @TableName
DECLARE @Iterations INT = 0,
@CurrentIteration INT = 1;
SELECT @Iterations = MAX (Iteration)
FROM #ColumnsToDisplay
WHILE @CurrentIteration <= @Iterations
BEGIN
DECLARE @CurrentTableName VARCHAR(100) = '',
@CurrentColumnName VARCHAR(100) = '',
@DynamicQuery NVARCHAR(1000) = N''
DECLARE @Sample VARCHAR(MAX)
SET @CurrentTableName = '';
SET @DynamicQuery = N'';
SELECT @CurrentTableName = CONCAT (ttq.schema_name, '.', ttq.table_name),
@CurrentColumnName = ttq.column_name
FROM #ColumnsToDisplay AS ttq
WHERE ttq.Iteration = @CurrentIteration
IF (@CurrentTableName = '')
BEGIN
SET @CurrentIteration += 1
CONTINUE
END
SET @DynamicQuery = CONCAT (N'
SELECT @Sample = STRING_AGG(t.ColumnData, '', '')
FROM (
SELECT TOP 5 CAST(x.', @CurrentColumnName, ' AS VARCHAR(MAX)) AS ColumnData
FROM ', @CurrentTableName, ' AS x
WHERE x.', @CurrentColumnName, ' IS NOT NULL
) t')
EXECUTE sys.sp_executesql @DynamicQuery,
N'@Sample VARCHAR(MAX) OUTPUT',
@Sample = @Sample OUTPUT
UPDATE #ColumnsToDisplay
SET DataSample = @Sample
WHERE Iteration = @CurrentIteration
SET @CurrentIteration += 1
END
SELECT ctd.Iteration,
ctd.schema_name,
ctd.table_name,
--ctd.column_id,
ctd.column_name,
--ctd.data_type,
--ctd.max_length,
--ctd.precision,
ctd.DataSample
FROM #ColumnsToDisplay AS ctd
Here is the result:
Iteration  schema_name  table_name  column_name        DataSample
---------  -----------  ----------  -----------------  ---------------------------------------------------------------------------------
1          Production   Product     ProductID          980, 365, 771, 404, 977
2          Production   Product     Name               Adjustable Race, All-Purpose Bike Stand, AWC Logo Cap, BB Ball Bearing, Bearing Ball
3          Production   Product     ProductNumber      AR-5381, BA-8327, BB-7421, BB-8107, BB-9108
4          Production   Product     MakeFlag           0, 0, 1, 0, 1
5          Production   Product     FinishedGoodsFlag  0, 0, 0, 0, 0
6          Production   Product     Color              Black, Black, Black, Silver, Silver
The problem is that this query only works on SQL Server 2017 and above because it uses STRING_AGG. For SQL Server 2016 and below I'm supposed to use STUFF instead.
I followed this example but I couldn't really fix it.
The only thing I know is that the part of the code that I need to port is this:
SET @DynamicQuery = CONCAT (N'
SELECT @Sample = STRING_AGG(t.ColumnData, '', '')
FROM (
SELECT TOP 5 CAST(x.', @CurrentColumnName, ' AS VARCHAR(MAX)) AS ColumnData
FROM ', @CurrentTableName, ' AS x
WHERE x.', @CurrentColumnName, ' IS NOT NULL
) t')
Can anyone help me port STRING_AGG to STUFF?
Thank you

First, let's clarify that STUFF is, as Stu mentions, just used to remove the first delimiter.
So, if you had
'a','b','c'
and you wanted to use a comma delimiter, FOR XML PATH would give you:
,a,b,c
and you use stuff to cut off the first comma:
a,b,c
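For instance, here is a self-contained illustration of that mechanic (the table variable and values are just for the demo):

DECLARE @v TABLE (val varchar(20));
INSERT INTO @v VALUES ('a'), ('b'), ('c');
-- FOR XML PATH('') glues the rows together: ,a,b,c
-- STUFF(..., 1, 1, '') then removes the leading comma: a,b,c
SELECT STUFF((SELECT ',' + val FROM @v FOR XML PATH('')), 1, 1, '') AS joined;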
Your query seems to have no delimiters at all, so you won't use STUFF. With that out of the picture, here's my untested guess; it's tricky, what with the dynamic query added on top of it:
SET @DynamicQuery = CONCAT (N'
set @Sample =
(
select t.ColumnData
from (
select top 5 cast(x.', @CurrentColumnName, ' as varchar(max)) as ColumnData
from ', @CurrentTableName, ' AS x
where x.', @CurrentColumnName, ' IS NOT NULL
) t
for xml path(''''), type
).value(''.'',''nvarchar(max)'')')

Thank you for your help.
The right query was:
SET @DynamicQuery = CONCAT (N'
SELECT @Sample = STUFF((SELECT '', ''+ t.ColumnData
FROM (
SELECT TOP 5 CAST(x.[', @CurrentColumnName, '] AS VARCHAR(MAX)) AS ColumnData
FROM ', @CurrentTableName, ' AS x
WHERE x.[', @CurrentColumnName, '] IS NOT NULL
) AS t
FOR XML PATH('''')),1,1,'''')')
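One side note (not part of the original answer): plain FOR XML PATH('') entity-encodes XML-special characters, so sampled values containing & or < would come back as &amp; or &lt;. Adding TYPE and .value(), as in the earlier guess, avoids that, and STUFF(..., 1, 2, '') also trims the leading space of the ', ' delimiter. A sketch of that adjusted dynamic string:

SET @DynamicQuery = CONCAT (N'
SELECT @Sample = STUFF((SELECT '', '' + t.ColumnData
FROM (
SELECT TOP 5 CAST(x.[', @CurrentColumnName, '] AS VARCHAR(MAX)) AS ColumnData
FROM ', @CurrentTableName, ' AS x
WHERE x.[', @CurrentColumnName, '] IS NOT NULL
) AS t
FOR XML PATH(''''), TYPE).value(''.'',''VARCHAR(MAX)''),1,2,'''')')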

Related

How to Show the Correct Column Name and Data Type in T-SQL?

I produced DDLs to create tables on Oracle using T-SQL, since my source tables are in SQL Server. While I was testing with only 4 tables, it produced 4 DDLs, but each one repeated the name and data type of the table's last column, even though the number of columns is correct.
For example, TABLE_01 has 6 columns and it repeats the last column name and data type 6 times.
CREATE TABLE TABLE_01(
ISKEY INT
ISKEY INT
ISKEY INT
ISKEY INT
ISKEY INT
ISKEY INT
);
Here is my code. Does anyone see the reason why it repeats them? I couldn't find the cause.
Please help me solve this problem. I would appreciate you taking the time to share your knowledge.
DECLARE @MyList TABLE (Value NVARCHAR(50))
INSERT INTO @MyList VALUES ('TABLE_01')
INSERT INTO @MyList VALUES ('TABLE_02')
INSERT INTO @MyList VALUES ('TABLE_03')
INSERT INTO @MyList VALUES ('TABLE_04')
DECLARE @VALUE VARCHAR(50)
DECLARE @COLNAME VARCHAR(50) = ''
DECLARE @COLTYPE VARCHAR(50) = ''
DECLARE @COLNUM INT = 0
DECLARE @COL_COUNTER INT = 0
DECLARE @COUNTER INT = 0;
DECLARE @MAX INT = (SELECT COUNT(*) FROM @MyList)
-- Loop for Multiple Tables
WHILE @COUNTER < @MAX
BEGIN
SET @VALUE = (SELECT VALUE FROM
(SELECT (ROW_NUMBER() OVER (ORDER BY (SELECT NULL))) [index] , Value from @MyList) R
ORDER BY R.[index] OFFSET @COUNTER
ROWS FETCH NEXT 1 ROWS ONLY);
SELECT CONCAT('CREATE TABLE ' , REPLACE(UPPER(@VALUE), '_',''), '(')
PRINT 'CREATE TABLE ' + REPLACE(UPPER(@VALUE), '_','') + '('
SET @COLNUM = 0
SET @COL_COUNTER = 0
;WITH numcol AS
(
select schema_name(tab.schema_id) as schema_name,
tab.name as table_name,
col.column_id,
col.name as column_name,
t.name as data_type,
col.max_length,
col.precision
from sys.tables as tab
inner join sys.columns as col
on tab.object_id = col.object_id
left join sys.types as t
on col.user_type_id = t.user_type_id
where schema_name(tab.schema_id) = 'dbo' AND tab.name = @VALUE
)
SELECT @COLNUM = COUNT(*) OVER (PARTITION BY schema_name, table_name) FROM numcol
-- Loop for Multiple Columns
WHILE @COL_COUNTER < @COLNUM
BEGIN
SET @COLNAME = ''
SET @COLTYPE = ''
SELECT @COLNAME = REPLACE(UPPER(COL.name), '_',''), @COLTYPE = CASE WHEN UPPER(col_type.name) = 'MONEY' THEN ' ' +' NUMBER(19,4)'
WHEN UPPER(col_type.name) = 'REAL' THEN ' ' +' FLOAT(23)'
WHEN UPPER(col_type.name) = 'FLOAT' THEN ' ' +' FLOAT(49)'
WHEN UPPER(col_type.name) = 'NVARCHAR' THEN ' ' +' NCHAR'
ELSE ' ' + UPPER(col_type.name)
END
FROM sys.columns COL
INNER JOIN sys.tables TAB
On COL.object_id = TAB.object_id
left join sys.types as col_type
on col.user_type_id = col_type.user_type_id
WHERE OBJECT_NAME(TAB.object_id) = @VALUE
PRINT @COLNAME + @COLTYPE
SET @COL_COUNTER = @COL_COUNTER + 1
END
PRINT ');'
SET @COUNTER = @COUNTER + 1
END
Thank you.
That's a really horrible way to create a SQL script. It's effectively a cursor, only more difficult to write, and it's also completely unnecessary.
You can just build the whole thing in one go using STRING_AGG.
I make no comment on the validity of the result for Oracle, as I don't know Oracle well enough.
DECLARE @table sysname = 'YourTable';
DECLARE @schema sysname = 'dbo';
DECLARE @columns nvarchar(max);
SELECT @columns = STRING_AGG(CONCAT(
' ',
REPLACE(UPPER(col.name), '_', ''),
' ',
CASE UPPER(col_type.name)
WHEN 'MONEY' THEN ' NUMBER(19,4)'
WHEN 'REAL' THEN ' FLOAT(23)'
WHEN 'FLOAT' THEN ' FLOAT(49)'
WHEN 'NVARCHAR' THEN ' NCHAR'
ELSE UPPER(col_type.name)
END
), ',
')
FROM sys.columns col
INNER JOIN sys.tables tab ON col.object_id = tab.object_id
JOIN sys.types as col_type ON col.user_type_id = col_type.user_type_id
JOIN sys.schemas sch ON sch.schema_id = tab.schema_id
WHERE tab.name = @table
AND sch.name = @schema;
SELECT
CONCAT(
'CREATE TABLE ',
REPLACE(UPPER(@table), '_',''),
' (
',
@columns,
'
)'
);
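Since the first thread on this page covers exactly this port, here is an untested sketch of the same @columns build for SQL Server 2016 and below, swapping STRING_AGG for STUFF + FOR XML PATH (same assumed column/type mapping, with ', ' as the separator instead of a line break):

SELECT @columns = STUFF((
    SELECT ', ' + REPLACE(UPPER(col.name), '_', '') + ' ' +
           CASE UPPER(col_type.name)
               WHEN 'MONEY'    THEN 'NUMBER(19,4)'
               WHEN 'REAL'     THEN 'FLOAT(23)'
               WHEN 'FLOAT'    THEN 'FLOAT(49)'
               WHEN 'NVARCHAR' THEN 'NCHAR'
               ELSE UPPER(col_type.name)
           END
    FROM sys.columns col
    INNER JOIN sys.tables tab ON col.object_id = tab.object_id
    JOIN sys.types col_type ON col.user_type_id = col_type.user_type_id
    JOIN sys.schemas sch ON sch.schema_id = tab.schema_id
    WHERE tab.name = @table
    AND sch.name = @schema
    ORDER BY col.column_id
    FOR XML PATH(''), TYPE).value('.', 'nvarchar(max)'), 1, 2, '');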

Update with dynamic tables

I have to write an update using dynamic SQL because I only know the name of the column that I want to update and the names of the columns which I will use to join the tables in my update. But I don't know the number of tables or their names. I will get the table names in a parameter of my procedure in this way:
declare @Tables nvarchar(max) = N'Customer,Employee,Owner'
So I want to have update like this:
update t
set [Status] = 100
from
TemporaryTable t
left join Customer t1 on t1.RecordId = t.RecordId
left join Employee t2 on t2.RecordId = t.RecordId
left join Owner t3 on t3.RecordId =t.RecordId
where
t1.RecordId is null
and t2.RecordId is NULL
and t3.RecordId is null
I know that each table will have a RecordId column and I want to left join these tables to my TemporaryTable on this column, but I don't know the names and number of tables. For example I could have one, two, or ten tables with different names. I know that the table names will be saved in the parameter @Tables in this way:
@Tables = N'Customer,Employee,Owner'
Is it possible to write this update in a dynamic way?
This is an answer, which helps ... to write update using dynamic sql ... and only shows how to generate a dynamic statement. It's based on string splitting. From SQL Server 2016+ you may use STRING_SPLIT() (because here the order of the substrings is not important). For previous versions you need to find a string splitting function.
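For the pre-2016 case, a commonly used splitting pattern (shown here only as a rough sketch; it assumes the list contains no XML-special characters such as & or <) converts the CSV into XML and shreds it:

DECLARE @Tables nvarchar(max) = N'Customer,Employee,Owner';
DECLARE @xml xml = CAST(N'<t>' + REPLACE(@Tables, N',', N'</t><t>') + N'</t>' AS xml);
-- each <t> node becomes one row, mimicking STRING_SPLIT's [value] column
SELECT n.value(N'.', N'nvarchar(128)') AS [value]
FROM @xml.nodes(N'/t') AS x(n);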
T-SQL:
DECLARE @Tables nvarchar(max) = N'Customer,Employee,Owner'
DECLARE @join nvarchar(max) = N''
DECLARE @where nvarchar(max) = N''
DECLARE @stm nvarchar(max) = N''
SELECT
@join = @join + CONCAT(
N' LEFT JOIN ',
QUOTENAME(s.[value]),
N' t',
ROW_NUMBER() OVER (ORDER BY (SELECT 1)),
N' ON t',
ROW_NUMBER() OVER (ORDER BY (SELECT 1)),
N'.RecordId = t.RecordId'
),
@where = @where + CONCAT(
N' AND t',
ROW_NUMBER() OVER (ORDER BY (SELECT 1)),
N'.RecordId is NULL'
)
FROM STRING_SPLIT(@Tables, N',') s
SET @stm = CONCAT(
N'UPDATE t SET [Status] = 100 ',
N'FROM TemporaryTable t',
@join,
N' WHERE ',
STUFF(@where, 1, 5, N'')
)
PRINT @stm
EXEC sp_executesql @stm
Notes:
One note that I think is important: consider passing the table names using a table-valued parameter, not as comma-separated text.
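A minimal sketch of what that table-valued-parameter approach could look like (the type and procedure names here are made up for illustration):

-- hypothetical table type and procedure signature
CREATE TYPE dbo.TableNameList AS TABLE (TableName sysname NOT NULL);
GO
CREATE PROCEDURE dbo.UpdateStatusForMissing
    @Tables dbo.TableNameList READONLY
AS
BEGIN
    -- build @join/@where from @Tables here instead of splitting a CSV string
    SELECT TableName FROM @Tables;
END
GO
-- caller side
DECLARE @t dbo.TableNameList;
INSERT INTO @t VALUES (N'Customer'), (N'Employee'), (N'Owner');
EXEC dbo.UpdateStatusForMissing @Tables = @t;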
It seems like this will suit your needs, though I don't fully understand what you're trying to do. Here we're constructing the final SQL in two pieces (@s and @where) and then concatenating them into the final SQL at the end.
declare @Tables varchar(100) = N'Customer,Employee,Owner'
declare @tablenames table (tablename nvarchar(100))
insert @tablenames (tablename)
select value
from string_split(@Tables, ',');
declare @where varchar(max) = ''
declare @s varchar(max) = '
update t
set [Status] = 100
from TemporaryTable t'
select @s += '
left join ' + tablename + ' on ' + tablename + '.RecordId = t.RecordId'
, @where += case when @where = '' then '' else ' and ' end + tablename + '.RecordId is null
'
from @tablenames
print @s + char(13) + ' where ' + @where
exec( @s + char(13) + ' where ' + @where)

A generic stored procedure that can take any source table name as a parameter and merge to a target table

This is my code.
Create Procedure Merge_tables
@tablename varchar(20)
As
create table temp1 ( column_name varchar(20) )
insert into temp1 (column_name)
select Column_Name
from INFORMATION_SCHEMA.COLUMNS
where TABLE_NAME = 'result'
intersect
select Column_Name
from INFORMATION_SCHEMA.COLUMNS
where TABLE_NAME = '@tablename'
Declare @name varchar(max)
Declare @concat varchar(max)
set @concat = ''
while (select COUNT(*) from temp1)>0
Begin
set @name = (select top 1 * from temp1)
set @concat = @concat + @name + ','
select @concat as combined
delete temp1 where temp1.column_name = @name
End
Merge result as T
using @tablename as S on T.TXN_KEY = S.TXN_KEY
when not matched then
insert ('+@concat+') values ('+@concat+')
when matched then
update set T.TXN_KEY = S.TXN_KEY(?)
Table temp1 stores the common column names. The only specific thing is the key to be matched on, which is TXN_KEY. Everything else is generic. Towards the end of the while loop @concat holds the combined column names separated by commas.
The error I get in the merge statement is:
Msg 207, Level 16, State 1, Line 17
Invalid column name '+@concat+'
Also, for the update statement to work, the @concat string needs to be split to set values for the individual columns. I have been trying to crack this for a while now.
Thanks in advance.
OK, you have to pass both your target table and your source table so the primary keys can be determined. This works, I have tested it a lot.
CREATE PROCEDURE Merge_Tables
(
@tablenameTarget VARCHAR(128),
@tablenameSource VARCHAR(128)
)
AS
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED
SET NOCOUNT ON
--variables
DECLARE @targetPK VARCHAR(128),
@sourcePK VARCHAR(128),
@columns VARCHAR(MAX),
@sql VARCHAR(8000)
--temp table for the primary keys
CREATE TABLE #tableMapping
(
TargetPK VARCHAR(128),
SourcePK VARCHAR(128),
Columns VARCHAR(MAX)
)
--temp table for the comma delimted columns
CREATE TABLE #Columns
(
ColumnsUpdate VARCHAR(MAX)
)
--get the primary keys for both target and source tables. so we make sure we dont update or insert them
INSERT INTO #tableMapping
SELECT cu.COLUMN_NAME,
sourcePK.COLUMN_NAME,
data.columns
FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS ta
INNER JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE cu
ON cu.Constraint_name = ta.CONSTRAINT_NAME
OUTER APPLY
(
SELECT cus.COLUMN_NAME
FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS tas
INNER JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE cus
ON cus.Constraint_name = ta.CONSTRAINT_NAME
WHERE tas.Table_Name = @tablenameSource
AND ta.CONSTRAINT_TYPE = 'Primary Key'
) AS sourcePK
OUTER APPLY
(
SELECT STUFF(
(
SELECT ',' + Column_Name
FROM INFORMATION_SCHEMA.Columns Columns
WHERE ta.Table_Name = Columns.Table_Name
AND Columns.Column_Name <> cu.COLUMN_NAME --dont get the primary key
ORDER BY Column_Name
FOR XML PATH ('')
), 1, 1, '') columns
) AS data
WHERE ta.Table_Name = @tablenameTarget
AND ta.CONSTRAINT_TYPE = 'Primary Key'
--populate the variables so we can use it in our dynamic merge statement
SELECT @targetPK = TargetPK,
@sourcePK = SourcePK,
@columns = Columns
FROM #tableMapping
--make sure the rows match from the source and target tables, and make it in a comma delimted string
INSERT INTO #Columns
SELECT
STUFF(
(
SELECT ',' + 'TRGT.' + Column_Name + ' = SRCE.' + COLUMN_NAME
FROM INFORMATION_SCHEMA.Columns Columns
WHERE t.Table_Name = Columns.Table_Name
AND Column_Name <> @targetPK
ORDER BY Column_Name
FOR XML PATH ('')
), 1, 1, ''
)Columns
FROM INFORMATION_SCHEMA.Columns t
INNER JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS ta
ON ta.TABLE_NAME = t.TABLE_NAME
INNER JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE ccu
ON ccu.Constraint_name = ta.CONSTRAINT_NAME
WHERE t.Table_Name = '' + @tablenameTarget + ''
INTERSECT
SELECT
STUFF(
(
SELECT ',' + 'TRGT.' + Column_Name + ' = SRCE.' + COLUMN_NAME
FROM INFORMATION_SCHEMA.Columns Columns
WHERE t.Table_Name = Columns.Table_Name
AND Column_Name <> @sourcePK
ORDER BY Column_Name
FOR XML PATH ('')
), 1, 1, ''
)Columns
FROM INFORMATION_SCHEMA.Columns t
WHERE t.Table_Name = '' + @tablenameSource + ''
--use dynamic sql for our merge statement
SET @sql = 'MERGE ' + @tablenameTarget + ' AS TRGT
USING ' + @tablenameSource + ' AS SRCE
ON SRCE.' + @sourcePK + ' = TRGT.' + @targetPK + '
WHEN MATCHED THEN UPDATE SET ' + (SELECT ColumnsUpdate FROM #Columns) + '
WHEN NOT MATCHED BY TARGET THEN
INSERT (' + (SELECT @columns) + ')
VALUES (' + (SELECT 'SRCE.' + REPLACE(@columns, ',', ',SRCE.')) + ')
WHEN NOT MATCHED BY SOURCE THEN
DELETE;'
EXEC (@sql)
DROP TABLE #Columns
DROP TABLE #tableMapping
1) Firstly, why do you need the INTERSECT?
2) Secondly, nothing will be inserted into temp1, because you say WHERE TABLE_NAME = '@tablename'. There will never be a table named @tablename. Change it to WHERE TABLE_NAME = '' + @tablename + ''.
3) Also, '+@concat+' needs to be '' + @concat + ''.
4) I really think the merge statement will need to be in dynamic SQL for you to split the @concat columns (see the sketch below).
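To illustrate point 4, here is a rough sketch, not a drop-in fix; it builds on the question's @tablename and @concat and assumes @concat ends up holding a clean comma-separated column list with no trailing comma (e.g. 'COL_A,COL_B'). The WHEN MATCHED branch would need a column-by-column SET list built the same way.

DECLARE @merge nvarchar(max) = N'
MERGE result AS T
USING ' + QUOTENAME(@tablename) + N' AS S ON T.TXN_KEY = S.TXN_KEY
WHEN NOT MATCHED THEN
    INSERT (' + @concat + N')
    VALUES (S.' + REPLACE(@concat, N',', N', S.') + N');';
EXEC (@merge);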

How do I validate a variable against multiple database tables

Does anyone know how to check a variable against all database tables with columns storing the same type of information? I have a poorly designed database that stores SSNs in over 60 tables within one database. Some of the variations of the columns in the various tables include:
app_ssn
ca_ssn
cand_ssn
crl_ssn
cu_ssn
emtaddr_ssn
re_ssn
sfcart_ssn
sfordr_ssn
socsecno
ssn
Ssn
SSN
I want to create a stored procedure that will accept a value and check it against every table that has 'ssn' in the name. Does anyone have an idea as to how to do this?
-- I assume that table/column names don't need to be surrounded by square braces. You may want to save matches in a table - I just select them. I also assume ssn is a char.
alter proc proc1
@search1 varchar(500)
as
begin
set nocount on
declare @strsql varchar(500)
declare @curtable sysname
declare @prevtable sysname
declare @column sysname
select top 1 @curtable = table_schema+'.'+table_name, @column = column_name
from INFORMATION_SCHEMA.COLUMNS
where CHARINDEX('ssn',column_name) > 0
order by table_schema+'.'+table_name + column_name
-- make sure that at least one column has ssn in the column name
if @curtable is not null
begin
while (1=1)
begin
set @strsql = 'select * from ' + @curtable + ' where ' + '''' + @search1 + '''' + ' = ' + @column
print @strsql
-- any matches for passed in ssn will match here...
exec (@strsql)
set @prevtable = @curtable + @column
select top 1 @curtable = table_schema+'.'+table_name, @column = column_name
from INFORMATION_SCHEMA.COLUMNS
where CHARINDEX('ssn',column_name) > 0
and table_schema+'.'+table_name + column_name > @prevtable
order by table_schema+'.'+table_name + column_name
-- when we run out of columns that contain ssn we are done...
if @@ROWCOUNT = 0
break
end
end
end
What you will need to do is some research. But here is where you can start:
SELECT tbl.NAME AS TableName
,cl.NAME AS ColumnName
,IDENTITY(INT, 1, 1) AS ID
INTO #ColumnsToLoop
FROM sys.tables tbl
JOIN sys.columns cl ON cl.object_id = tbl.object_id
This will give you the table/column relation; then you can simply build a dynamic SQL string based on each row in the query above (basically loop it) and use EXEC or sp_executesql. So basically:
DECLARE @Loop int = (select min(ID) From #ColumnsToLoop), @MX int = (Select MAX(ID) From #ColumnsToLoop)
WHILE (@Loop <= @MX)
BEGIN
DECLARE @SQL nvarchar(MAX) = 'SQL String'
-- Construct the dynamic SQL string
EXEC(@SQL);
SET @Loop += 1
END
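One way (purely illustrative, not the answerer's exact code; @SearchSsn is an assumed input) to construct that string inside the loop from the row captured in #ColumnsToLoop:

DECLARE @SearchSsn varchar(11) = '123456789';  -- assumed value being validated
DECLARE @Tbl sysname, @Col sysname;
SELECT @Tbl = TableName, @Col = ColumnName
FROM #ColumnsToLoop
WHERE ID = @Loop;

DECLARE @DynSQL nvarchar(max) =
    N'SELECT ''' + @Tbl + N''' AS TableName, * FROM ' + QUOTENAME(@Tbl) +
    N' WHERE ' + QUOTENAME(@Col) + N' = @ssn';
-- parameterize the searched value instead of concatenating it into the string
EXEC sys.sp_executesql @DynSQL, N'@ssn varchar(11)', @ssn = @SearchSsn;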
Perhaps I went a little too crazy with this one, but let me know. I thought it would be best to return the primary key of the search results along with the table name so you could join it back to your tables. I also managed to do it without a single cursor or loop.
DECLARE @SSN VARCHAR(25) = '%99%',
@SQL VARCHAR(MAX);
WITH CTE_PrimaryKeys
AS
(
SELECT TABLE_CATALOG,
TABLE_SCHEMA,
TABLE_NAME,
column_name
FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE D
WHERE OBJECTPROPERTY(OBJECT_ID(constraint_name), 'IsPrimaryKey') = 1
),
CTE_Columns
AS
(
SELECT A.*,
CONCAT(A.TABLE_CATALOG,'.',A.TABLE_SCHEMA,'.',A.TABLE_NAME) AS FullTableName,
CASE WHEN B.COLUMN_NAME IS NOT NULL THEN 1 ELSE 0 END AS IsPrimaryKey
FROM INFORMATION_SCHEMA.COLUMNS A
LEFT JOIN CTE_PrimaryKeys B
ON A.TABLE_CATALOG = B.TABLE_CATALOG
AND A.TABLE_SCHEMA = B.TABLE_SCHEMA
AND A.TABLE_NAME = B.TABLE_NAME
AND A.COLUMN_NAME = B.COLUMN_NAME
),
CTE_Select
AS
(
SELECT
'SELECT ' +
--This returns the pk_col casted as Varchar and the table name in another columns
STUFF((SELECT ',CAST(' + COLUMN_NAME + ' AS VARCHAR(MAX)) AS pk_col,''' + B.TABLE_NAME + ''' AS Table_Name'
FROM CTE_Columns B
WHERE A.Table_Name = B.TABLE_NAME
AND B.IsPrimaryKey = 1
FOR XML PATH ('')),1,1,'')
+ ' FROM ' + fullTableName +
--This is where I list the columns where LIKE desired SSN
' WHERE ' +
STUFF((SELECT COLUMN_NAME + ' LIKE ''' + @SSN + ''' OR '
FROM CTE_Columns B
WHERE A.Table_Name = B.TABLE_NAME
--This is where I filter so I only get desired columns
AND (
--Uncomment the Collate if your database is case sensitive
COLUMN_NAME /*COLLATE SQL_Latin1_General_CP1_CI_AS*/ LIKE '%ssn%'
--list your column Names that don't have ssn in them
--OR COLUMN_NAME IN ('col1','col2')
)
FOR XML PATH ('')),1,0,'') AS Selects
FROM CTE_Columns A
GROUP BY A.FullTableName,A.TABLE_NAME
)
--Unioning them all together and getting rid of last trailing "OR "
SELECT @SQL = COALESCE(@SQL,'') + SUBSTRING(selects,1,LEN(selects) - 3) + ' UNION ALL ' + CHAR(13) --new line for easier debugging
FROM CTE_Select
WHERE selects IS NOT NULL
--Look at your code
SELECT SUBSTRING(@SQL,1,LEN(@SQL) - 11)

How can I inexpensively determine if a column contains only NULL records?

I have a large table with 500 columns and 100M rows. Based on a small sample, I believe only about 50 of the columns contain any values, and the other 450 contain only NULL values. I want to list the columns that contain no data.
On my current hardware, it would take about 24 hours to query every column (select count(1) from tab where col_n is not null)
Is there a less expensive way to determine that a column is completely empty/NULL?
What about this:
SELECT
SUM(CASE WHEN column_1 IS NOT NULL THEN 1 ELSE 0 END) column_1_count,
SUM(CASE WHEN column_2 IS NOT NULL THEN 1 ELSE 0 END) column_2_count,
...
FROM table_name
?
You can easily create this query if you use INFORMATION_SCHEMA.COLUMNS table.
EDIT:
Another idea:
SELECT MAX(column_1), MAX(column_2),..... FROM table_name
If the result contains a value, the column is populated. It should require only one table scan.
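A sketch of how that generation could look (untested; it assumes a dbo table literally named table_name and builds the SUM(CASE ...) form from the first suggestion):

DECLARE @sql nvarchar(max);
SELECT @sql = N'SELECT ' + STUFF((
    SELECT N', SUM(CASE WHEN ' + QUOTENAME(COLUMN_NAME) +
           N' IS NOT NULL THEN 1 ELSE 0 END) AS ' + QUOTENAME(COLUMN_NAME + N'_count')
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_SCHEMA = N'dbo' AND TABLE_NAME = N'table_name'
    FOR XML PATH(''), TYPE).value('.', 'nvarchar(max)'), 1, 2, N'')
    + N' FROM dbo.table_name';
EXEC sys.sp_executesql @sql;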
Try this one -
DDL:
IF OBJECT_ID ('dbo.test2') IS NOT NULL
DROP TABLE dbo.test2
CREATE TABLE dbo.test2
(
ID BIGINT IDENTITY(1,1) PRIMARY KEY
, Name VARCHAR(10) NOT NULL
, IsCitizen BIT NULL
, Age INT NULL
)
INSERT INTO dbo.test2 (Name, IsCitizen, Age)
VALUES
('1', 1, NULL),
('2', 0, NULL),
('3', NULL, NULL)
Query 1:
DECLARE
@TableName SYSNAME
, @ObjectID INT
, @SQL NVARCHAR(MAX)
SELECT
@TableName = 'dbo.test2'
, @ObjectID = OBJECT_ID(@TableName)
SELECT @SQL = 'SELECT' + CHAR(13) + STUFF((
SELECT CHAR(13) + ', [' + c.name + '] = ' +
CASE WHEN c.is_nullable = 0
THEN '0'
ELSE 'CASE WHEN ' + totalrows +
' = SUM(CASE WHEN [' + c.name + '] IS NULL THEN 1 ELSE 0 END) THEN 1 ELSE 0 END'
END
FROM sys.columns c WITH (NOWAIT)
CROSS JOIN (
SELECT totalrows = CAST(MIN(p.[rows]) AS VARCHAR(50))
FROM sys.partitions p
WHERE p.[object_id] = @ObjectID
AND p.index_id IN (0, 1)
) r
WHERE c.[object_id] = @ObjectID
FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'), 1, 2, ' ') + CHAR(13) + 'FROM ' + @TableName
PRINT @SQL
EXEC sys.sp_executesql @SQL
Output 1:
SELECT
[ID] = 0
, [Name] = 0
, [IsCitizen] = CASE WHEN 3 = SUM(CASE WHEN [IsCitizen] IS NULL THEN 1 ELSE 0 END) THEN 1 ELSE 0 END
, [Age] = CASE WHEN 3 = SUM(CASE WHEN [Age] IS NULL THEN 1 ELSE 0 END) THEN 1 ELSE 0 END
FROM dbo.test2
Query 2:
DECLARE
@TableName SYSNAME
, @SQL NVARCHAR(MAX)
SELECT @TableName = 'dbo.test2'
SELECT @SQL = 'SELECT' + CHAR(13) + STUFF((
SELECT CHAR(13) + ', [' + c.name + '] = ' +
CASE WHEN c.is_nullable = 0
THEN '0'
ELSE 'CASE WHEN '+
'MAX(CAST([' + c.name + '] AS CHAR(1))) IS NULL THEN 1 ELSE 0 END'
END
FROM sys.columns c WITH (NOWAIT)
WHERE c.[object_id] = OBJECT_ID(@TableName)
FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'), 1, 2, ' ') + CHAR(13) + 'FROM ' + @TableName
PRINT @SQL
EXEC sys.sp_executesql @SQL
Output 2:
SELECT
[ID] = 0
, [Name] = 0
, [IsCitizen] = CASE WHEN MAX(CAST([IsCitizen] AS CHAR(1))) IS NULL THEN 1 ELSE 0 END
, [Age] = CASE WHEN MAX(CAST([Age] AS CHAR(1))) IS NULL THEN 1 ELSE 0 END
FROM dbo.test2
Results:
ID Name IsCitizen Age
----------- ----------- ----------- -----------
0 0 0 1
You could check whether indexing the columns will help you reach some performance improvement:
CREATE UNIQUE NONCLUSTERED INDEX IndexName ON dbo.TableName(ColumnName)
WHERE ColumnName IS NOT NULL;
GO
SQL server query to get the list of columns in a table along with Data types, NOT NULL, and PRIMARY KEY constraints
Run the SQL in the best answer of the above question and generate a new query like the one below.
Select ISNULL(column1,1), ISNULL(column2,1), ISNULL(column3,1) from table
You would not need to 'count' all of the 100M records. Simply backing out of the query with a TOP 1 as soon as you hit a column with a not-null value would save a lot of time while providing the same information.
500 Columns?!
Ok, the right answer to your question is: normalize your table.
Here's what's happening for the time being:
You don't have an index on that column, so SQL Server has to do a full scan of your humongous table.
SQL Server will certainly fully read every row (which means every column, even if you're only interested in one).
And since your rows are most likely over 8 KB... http://msdn.microsoft.com/en-us/library/ms186981%28v=sql.105%29.aspx
Seriously, normalize your table and if needed split it vertically (put "theme grouped" columns inside a separate table, so you only read them when you need them).
EDIT: You can rewrite your query like this
select count(col_n) from tab
and if you want to get all columns at once (better):
SELECT
COUNT(column_1) column_1_count,
COUNT(column_2) column_2_count,
...
FROM table_name
If most records are not null, maybe you can mix some of the approaches suggested (for example, check only the nullable fields) with this:
if exists (select * from table where field is not null)
This should speed up the search because EXISTS stops the search as soon as the condition is met; in this example a single not-null record is enough to decide the status of the field.
If the field has an index this should be almost instant.
Normally adding top 1 to this query is not needed because the query optimizer knows that you do not need to retrieve all the matching records.
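Mixing that with the earlier INFORMATION_SCHEMA idea, here is a sketch (untested; the table name dbo.tab is assumed) that generates one EXISTS probe per nullable column and reports the ones that are entirely NULL:

DECLARE @sql nvarchar(max) = N'';
SELECT @sql = @sql +
    N'IF NOT EXISTS (SELECT * FROM dbo.tab WHERE ' + QUOTENAME(c.name) + N' IS NOT NULL) ' +
    N'PRINT ''' + c.name + N' contains only NULLs'';' + NCHAR(13)
FROM sys.columns c
WHERE c.object_id = OBJECT_ID(N'dbo.tab')
AND c.is_nullable = 1;  -- NOT NULL columns cannot be entirely NULL anyway
EXEC sys.sp_executesql @sql;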
You can use this stored procedure to do the trick. You need to provide the table name you wish to query; note that if you pass the @exec parameter = 1 to the procedure, it will execute the select query.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE PROCEDURE [dbo].[SP_SELECT_NON_NULL_COLUMNS] ( @tablename varchar (100)=null, @exec int =0)
AS BEGIN
SET NOCOUNT ON
IF @tablename IS NULL
RAISERROR('CANT EXECUTE THE PROC, TABLE NAME IS MISSING',16 ,1)
ELSE
BEGIN
IF OBJECT_ID('tempdb..#table') IS NOT NULL DROP TABLE #table
DECLARE @i VARCHAR (max)=''
DECLARE @sentence VARCHAR (max)=''
DECLARE @SELECT VARCHAR (max)
DECLARE @LocalTableName VARCHAR(50) = '['+@tablename+']'
CREATE TABLE #table (ColumnName VARCHAR (max))
SELECT @i+=
' IF EXISTS ( SELECT TOP 1 '+column_name+' FROM ' +@LocalTableName+' WHERE ' +column_name+
' '+'IS NOT NULL) INSERT INTO #table VALUES ('''+column_name+''');'
FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name=@tablename
INSERT INTO #table
EXEC (@i)
SELECT @sentence = @sentence+' '+columnname+' ,' FROM #table
DROP TABLE #table
IF @exec=0
BEGIN
SELECT 'SELECT '+ LTRIM (left (@sentence,NULLIF(LEN (@sentence)-1,-1)))+
+' FROM ' +@LocalTableName
END
ELSE
BEGIN
SELECT @SELECT= 'SELECT '+ LTRIM (left (@sentence,NULLIF(LEN (@sentence)-1,-1)))+
+' FROM '+@LocalTableName
EXEC (@SELECT)
END
END
END
Use it like this:
EXEC [dbo].[SP_SELECT_NON_NULL_COLUMNS] 'YourTableName' , 1
