Select subsequent numbers in the table with result in ranges - sql-server

I need to deliver a result in the following format: [12-14][20][34-35]
Is there any more intelligent or elegant approach than iterating cursor on the select of this column?
The select might be quite large.
Table:
|---------------------|
| col1 |
|---------------------|
| 12 |
|---------------------|
| 13 |
|---------------------|
| 14 |
|---------------------|
| 20 |
|---------------------|
| 34 |
|---------------------|
| 35 |
|---------------------|

As others have said, this is a gaps and islands problem so you need to work out which numbers are the start of a sequence and which numbers are the end of a sequence.
I did this using two subqueries and joining them together:
-- Collapse consecutive integers in Nums.Col1 into bracketed ranges, one row
-- per island, e.g. '[12-14]', '[20]', '[34-35]'.
--
-- island_start: values with no predecessor (Col1 - 1 is absent).
-- island_end:   values with no successor   (Col1 + 1 is absent).
-- Numbering both sets in ascending order pairs the n-th start with the n-th
-- end, which are the two bounds of the same island.
--
-- VARCHAR(20) is wide enough for any INT or BIGINT value; a two-character
-- target cannot render values above 99.
SELECT CASE
           WHEN island_start.Col1 = island_end.Col1
               THEN '[' + CONVERT(VARCHAR(20), island_start.Col1) + ']'
           ELSE '[' + CONVERT(VARCHAR(20), island_start.Col1)
                + '-' + CONVERT(VARCHAR(20), island_end.Col1) + ']'
       END AS RangeText
FROM (
    SELECT a.Col1,
           ROW_NUMBER() OVER (ORDER BY a.Col1) AS RowN
    FROM Nums a
    WHERE NOT EXISTS (
        SELECT 1
        FROM Nums b
        WHERE b.Col1 = a.Col1 - 1
    )
) island_start
INNER JOIN (
    SELECT a.Col1,
           ROW_NUMBER() OVER (ORDER BY a.Col1) AS RowN
    FROM Nums a
    WHERE NOT EXISTS (
        SELECT 1
        FROM Nums b
        WHERE b.Col1 = a.Col1 + 1
    )
) island_end
    ON island_start.RowN = island_end.RowN
-- Emit the ranges in ascending order so they can be concatenated as-is.
ORDER BY island_start.Col1;

Related

Snowflake count nulls in all columns

I've seen a few questions like this - Count NULL Values from multiple columns with SQL
But is there really not a way to count nulls in a table with say, over 30 columns? Like I don't want to specify them all by name?
But is there really not a way to count nulls in a table with say, over 30 columns? Like I don't want to specify them all by name?
yes exactly that. I don't understand why it's so difficult - it's like 1 line in pandas?
Keypoint here is if something is not provided as "batteries included" then you need to write your own version. It is not so hard as it may look.
Let's say the input table is as follow:
-- Sample table: col1, col2 and col3 contain three, two and one NULL
-- respectively; col4 contains none.
CREATE OR REPLACE TABLE t AS
SELECT
    $1 AS col1,
    $2 AS col2,
    $3 AS col3,
    $4 AS col4
FROM VALUES
    (1,    2,    3,    10),
    (NULL, 2,    3,    10),
    (NULL, NULL, 4,    10),
    (NULL, NULL, NULL, 10);

-- Show the fixture rows.
SELECT col1, col2, col3, col4
FROM t;
/*
+------+------+------+------+
| COL1 | COL2 | COL3 | COL4 |
+------+------+------+------+
| 1 | 2 | 3 | 10 |
| NULL | 2 | 3 | 10 |
| NULL | NULL | 4 | 10 |
| NULL | NULL | NULL | 10 |
+------+------+------+------+
*/
You probably know how to write the query that gives the desired output, but as it was not provided in the question I will use my own version:
-- NULL count per column of t (one output row per column) plus the total
-- across all listed columns.
WITH null_counts AS (
    SELECT
        COUNT(*) AS total_rows,
        -- COUNT(col) skips NULLs, so total rows minus COUNT(col) is the
        -- number of NULLs in that column.
        total_rows - COUNT(col1) AS col1,
        total_rows - COUNT(col2) AS col2,
        total_rows - COUNT(col3) AS col3,
        total_rows - COUNT(col4) AS col4
    FROM t
)
SELECT
    COLUMN_NAME,
    NULLS_COLUMN_COUNT,
    SUM(NULLS_COLUMN_COUNT) OVER () AS NULLS_TOTAL_COUNT
FROM null_counts
-- Turn the single wide row into one (COLUMN_NAME, NULLS_COLUMN_COUNT) row per column.
UNPIVOT (NULLS_COLUMN_COUNT FOR COLUMN_NAME IN (col1, col2, col3, col4))
ORDER BY COLUMN_NAME;
/*
+-------------+--------------------+-------------------+
| COLUMN_NAME | NULLS_COLUMN_COUNT | NULLS_TOTAL_COUNT |
+-------------+--------------------+-------------------+
| COL1 | 3 | 6 |
| COL2 | 2 | 6 |
| COL3 | 1 | 6 |
| COL4 | 0 | 6 |
+-------------+--------------------+-------------------+
*/
Here we could see that the query is "static" in nature with few moving parts(column_count_list/table_name/column_list):
WITH cte AS (
SELECT
COUNT(*) AS total_rows
<column_count_list>
FROM <table_name>
)
SELECT COLUMN_NAME, NULLS_COLUMN_COUNT,SUM(NULLS_COLUMN_COUNT) OVER() AS NULLS_TOTAL_COUNT
FROM cte
UNPIVOT (NULLS_COLUMN_COUNT FOR COLUMN_NAME IN (<column_list>))
ORDER BY COLUMN_NAME;
Now using the metadata and variables:
-- input
-- Session variables naming the schema and table to inspect.
SET sch_name = 'my_schema';
SET tab_name = 't';
-- Builds, in a single result row, the three template pieces and the final
-- query text from the column metadata of the chosen table.
-- NOTE(review): column names are spliced in unquoted and the filters apply
-- UPPER() to the variables; presumably this assumes ordinary upper-case
-- identifiers -- confirm before using on quoted/mixed-case names.
SELECT
-- column_list: comma-separated column names for the UNPIVOT ... IN (...) list
LISTAGG(c.COLUMN_NAME, ', ') WITHIN GROUP(ORDER BY c.COLUMN_NAME) AS column_list
-- full_table_name: schema-qualified name; identical on every filtered row, hence ANY_VALUE
,ANY_VALUE(c.TABLE_SCHEMA || '.' || c.TABLE_NAME) AS full_table_name
-- column_count_list: one ",total_rows - COUNT(col) AS col" line per column
,LISTAGG(REPLACE(SPACE(6) || ',total_rows - COUNT(<col_name>) AS <col_name>'
|| CHAR(13)
, '<col_name>', c.COLUMN_NAME), '')
WITHIN GROUP(ORDER BY COLUMN_NAME) AS column_count_list
-- query_to_run: the template below with its three placeholders substituted
,REPLACE(REPLACE(REPLACE(
'WITH cte AS (
SELECT
COUNT(*) AS total_rows
<column_count_list>
FROM <table_name>
)
SELECT COLUMN_NAME, NULLS_COLUMN_COUNT,SUM(NULLS_COLUMN_COUNT) OVER() AS NULLS_TOTAL_COUNT
FROM cte
UNPIVOT (NULLS_COLUMN_COUNT FOR COLUMN_NAME IN (<column_list>))
ORDER BY COLUMN_NAME;'
,'<column_count_list>', column_count_list)
,'<table_name>', full_table_name)
,'<column_list>', column_list) AS query_to_run
FROM INFORMATION_SCHEMA.COLUMNS c
WHERE TABLE_SCHEMA = UPPER($sch_name)
AND TABLE_NAME = UPPER($tab_name);
Running the code will generate the query to be run:
Copying the output and running it will give the output. This template could be further refined and wrapped with stored procedure if needed(but I will left it as an exercise).
#chris you should note that the metadata in Snowflake is similar to SQL Server. So anything you want to know at metadata level, would have already been solved by SQL Server practitioners.
See this link - Count number of NULL values in each column in SQL
This is different in Oracle where the metadata table gives the number of nulls in each column as well as density.

mssql - retrieve unique values of a column based on another column

I have a table with two columns: ColumnA, ColumnB, with rows:
| A | 1 |
| B | 1 |
| B | 2 |
| C | 1 |
| C | 1 |
| C | 1 |
| A | 2 |
| B | 1 |
| A | 2 |
| A | 1 |
I would like to write a query that would return all unique values for ColumnB, for each unique value of ColumnA, where ColumnA has more than 1 value in ColumnB i.e.
| A | 1 |
| A | 2 |
| B | 1 |
| B | 2 |
C 1 should be omitted because there is only one distinct value for ColumnA = 'C'
There might be a simpler approach but this works:
-- For every ColumnA that has more than one distinct ColumnB, list each of
-- its distinct ColumnB values.
SELECT multi.ColumnA,
       vals.ColumnB
FROM (
    -- ColumnA values associated with at least two different ColumnB values
    SELECT src.ColumnA
    FROM dbo.TableName AS src
    GROUP BY src.ColumnA
    HAVING COUNT(DISTINCT src.ColumnB) > 1
) AS multi
CROSS APPLY (
    -- the distinct ColumnB values belonging to the current ColumnA
    SELECT DISTINCT d.ColumnB
    FROM dbo.TableName AS d
    WHERE d.ColumnA = multi.ColumnA
) AS vals
The first subquery returns all unique ColumnA values that have multiple (different) ColumnB values. The 2nd subquery returns all distinct ColumnB values of those ColumnA-values with CROSS APPLY.
-- Distinct rows of x whose ColumnA appears in more than one distinct
-- (ColumnA, ColumnB) pair.
SELECT DISTINCT *
FROM x
WHERE ColumnA IN (
    SELECT pairs.ColumnA
    FROM (
        SELECT DISTINCT ColumnA, ColumnB
        FROM x
    ) AS pairs
    GROUP BY pairs.ColumnA
    HAVING COUNT(*) > 1
)
-- Collapse x to distinct (ColumnA, ColumnB) pairs, count the pairs per
-- ColumnA with a window function, and keep pairs whose ColumnA has several.
SELECT pairs.ColumnA,
       pairs.ColumnB
FROM (
    SELECT ColumnA,
           ColumnB,
           COUNT(*) OVER (PARTITION BY ColumnA) AS pair_count
    FROM x
    GROUP BY ColumnA, ColumnB
) AS pairs
WHERE pairs.pair_count > 1

How to find distinct values of string columns in hive? [duplicate]

I have a comma-separated column(string) with duplicate values. I want to remove duplicates:
e.g.
column_name
-----------------
gun,gun,man,gun,man
shuttle,enemy,enemy,run
hit,chase
I want result like:
column_name
----------------
gun,man
shuttle,enemy,run
hit,chase
I am using hive database.
Option 1: keep last occurrence
This will keep the last occurrence of every word.
E.g. 'hello,world,hello,world,hello' will result in 'world,hello'
-- Delete every word that occurs again further right in the comma-separated
-- list, so only the last occurrence of each word remains.
-- Pattern, piece by piece:
--   (?<=^|,)                       position at the start of the string or right after a comma
--   (?<word>.*?),                  lazily capture text as "word", followed by a comma
--   (?=.*(?<=,)\\k<word>(?=,|$))   lookahead: the same text appears later, preceded by a
--                                  comma and followed by a comma or the end of the string
-- The match (word plus its comma) is replaced with the empty string.
select regexp_replace
(
column_name
,'(?<=^|,)(?<word>.*?),(?=.*(?<=,)\\k<word>(?=,|$))'
,''
)
from mytable
;
+-------------------+
| gun,man |
| shuttle,enemy,run |
| hit,chase |
+-------------------+
Option 2: keep first occurrence
This will keep the first occurrence of every word.
E.g. 'hello,world,hello,world,hello' will result in 'hello,world'
-- Same replacement as the keep-last variant, applied to the reversed string
-- and reversed back afterwards; the occurrence that survives is therefore the
-- first one in the original left-to-right order.
-- The pattern deletes a word (and its trailing comma) when the same text
-- appears again later in the reversed string.
select reverse
(
regexp_replace
(
reverse(column_name)
,'(?<=^|,)(?<word>.*?),(?=.*(?<=,)\\k<word>(?=,|$))'
,''
)
)
from mytable
;
Option 3: sorted
E.g. 'Cherry,Apple,Cherry,Cherry,Cherry,Banana,Apple' will result in 'Apple,Banana,Cherry'
-- Split the list, sort the items, join them back with commas, then collapse
-- each run of adjacent identical words into a single word.
-- Pattern, piece by piece:
--   (?<=^|,)(?<word>.*?)       a word starting at the string start or after a comma
--   (,\\k<word>(?=,|$))+       followed by one or more ",<same word>" repeats
-- The whole run is replaced by ${word}, i.e. one copy of the captured word.
select regexp_replace
(
concat_ws(',',sort_array(split(column_name,',')))
,'(?<=^|,)(?<word>.*?)(,\\k<word>(?=,|$))+'
,'${word}'
)
from mytable
;
If value sort is not a concern:
-- De-duplicate the items of each comma-separated list; the order of the
-- items in the result is not controlled.
with mytable as (
    -- test data
    select 'gun,gun,man,gun,man'     as column_name union
    select 'shuttle,enemy,enemy,run' as column_name union
    select 'hit,chase'               as column_name
)
select
    column_name,
    concat_ws(',', collect_set(item))
from (
    -- one row per distinct (list, item) pair
    select distinct
        column_name,
        words.item
    from mytable
    lateral view explode(split(column_name, ',')) words as item
) pairs
group by column_name
;
+--------------------------+--------------------+--+
| column_name | _c1 |
+--------------------------+--------------------+--+
| gun,gun,man,gun,man | gun,man |
| hit,chase | chase,hit |
| shuttle,enemy,enemy,run | enemy,run,shuttle |
+--------------------------+--------------------+--+
If want to keep the value sorted:
-- De-duplicate each list while ordering the items by where they first
-- appeared in the list.
with mytable as (
select 'gun,gun,man,gun,man' as column_name union
select 'shuttle,enemy,enemy,run' as column_name union
select 'hit,chase' as column_name
) -- test data
-- NOTE(review): the result order relies on collect_set seeing rows in the
-- order produced by the inner ORDER BY; presumably not guaranteed -- confirm
-- on the target Hive version.
select column_name,concat_ws(',',collect_set(item)) as column_name_distincted
from (
-- one row per (list, item), tagged with the item's smallest position in the list
select column_name,item, min(pos) as pos
from (
-- posexplode emits every item together with its index in the split array
select column_name,pos,item
from mytable
lateral view posexplode(split(column_name,',')) s as pos,item
) t
group by column_name,item
order by column_name,pos
) t
group by column_name
;
+--------------------------+-------------------------+--+
| column_name | column_name_distincted |
+--------------------------+-------------------------+--+
| gun,gun,man,gun,man | gun,man |
| hit,chase | hit,chase |
| shuttle,enemy,enemy,run | shuttle,enemy,run |
+--------------------------+-------------------------+--+

SQL Server to combine multi rows into single row where col0=col1

my table :
previousid|CurrentID|Data
| 1 | 2 | Data 1
| 2 | 3 | Data 2
| 3 | 4 | Data 3
| 4 | 5 | Data 4
Result i look for :
Select .... where PreviousID=1 :
|Col0|Col1|Col2 |Col3|Col 4|Col5| Col6 | Col7 | Col8
|1 |2 |Data 1|3 |Data 2| 4 | data 3| 5 | data 4
Select .....where PreviousID=2
|Col0|Col1|Col2 |Col3|Col 4|Col5| Col6 |
|2 |3 |Data 2|4 |Data 3| 5 | data 4|
i tried to create some SQL server query to get result with no luck, please help me guys
We can do this in a few steps:
Declare and set a variable to use for our root node, and create a temporary table to store the results from our recusive query:
Insert the results from the recusive query into the temporary table
Generate and execute dynamic sql to pivot() the temporary table.
(alternate) Generate and execute dynamic sql to use conditional aggregation instead of pivot():
rextester demo: http://rextester.com/MRFZC75180
test setup:
-- Fixture: each row links PreviousID to CurrentID and carries a Data payload.
CREATE TABLE t (
    PreviousID int,
    CurrentID  int,
    Data       varchar(32)
);

INSERT INTO t (PreviousID, CurrentID, Data)
VALUES
    (1, 2, 'Data 1'),
    (2, 3, 'Data 2'),
    (3, 4, 'Data 3'),
    (4, 5, 'Data 4');
Declare and set a variable to use for our root node, and create a temporary table to store the results from our recusive query:
-- Root node whose chain is flattened. T-SQL local variables use the @ sigil;
-- the # prefix is only for temporary objects such as #temp.
declare @PreviousId int = 2;

-- One row per output cell: the chain level it came from, which attribute it
-- is (Col), its value, and rn = position of the cell in the final wide row.
create table #temp (PreviousID int
, Level int
, Col varchar(32)
, Value varchar(32)
, rn int
);

-- Insert the results from the recursive query into the temporary table.
;with cte as (
    -- anchor: the link(s) leaving the root node
    select PreviousID, CurrentID, Data, level = 0
    from t
    where PreviousID = @PreviousId
    union all
    -- recursive step: the next link starts where the previous one ended
    select c.PreviousID, c.CurrentID, c.Data, level = p.level + 1
    from t c
    inner join cte as p
        on c.PreviousID = p.CurrentID
)
insert into #temp (PreviousID, Level, Col, Value, rn)
-- Every row carries the root id; the anchor filters on @PreviousId, so the
-- variable is used directly instead of re-reading the CTE for its first row.
select @PreviousId, t.level, x.col, x.value
    , rn = row_number() over (order by t.level, x.col)
from cte t
-- Split each link into two cells: its CurrentId (as text) and its Data.
cross apply (
    values ('CurrentId', convert(varchar(32), CurrentId)), ('Data', Data)
) as x (col, value);
results so far:
+------------+-------+-----------+--------+----+
| PreviousID | Level | Col | Value | rn |
+------------+-------+-----------+--------+----+
| 2 | 0 | CurrentId | 3 | 1 |
| 2 | 0 | Data | Data 2 | 2 |
| 2 | 1 | CurrentId | 4 | 3 |
| 2 | 1 | Data | Data 3 | 4 |
| 2 | 2 | CurrentId | 5 | 5 |
| 2 | 2 | Data | Data 4 | 6 |
+------------+-------+-----------+--------+----+
Generate and execute dynamic sql to pivot() the temporary table.
/* pivot */
-- T-SQL local variables use the @ sigil (# denotes temporary objects).
declare @cols nvarchar(max);
declare @sql nvarchar(max);

-- Build the column list ' Col1, Col2, ...' from the cell positions in #temp.
-- Ordering by rn (the number, not the generated text) keeps Col10 after Col9.
-- stuff(..., 1, 1, '') removes the leading comma.
select @cols = stuff((
    select
        ', Col' + convert(nvarchar(10), rn)
    from #temp
    order by rn
    for xml path (''), type).value('.','nvarchar(max)')
    ,1,1,'');

-- The column list is used twice: in the SELECT list and in the PIVOT ... IN (...) list.
select @sql ='
select Col0=PreviousID, ' + @cols +'
from (
select PreviousID, Value, rn= ''Col''+convert(nvarchar(10),rn)
from #temp
) as t
pivot (max([Value]) for [rn] in (' + @cols +')) p';

-- Show the generated statement, then run it.
select @sql as CodeGenerated;
exec sp_executesql @sql;
code generated:
-- Text of the generated pivot statement for the six cells stored in #temp:
-- rn is turned into the labels Col1..Col6 and each label becomes a column.
select Col0=PreviousID, Col1, Col2, Col3, Col4, Col5, Col6
from (
select PreviousID, Value, rn= 'Col'+convert(nvarchar(10),rn)
from #temp
) as t
pivot (max([Value]) for [rn] in ( Col1, Col2, Col3, Col4, Col5, Col6)) p
returns:
+------+------+--------+------+--------+------+--------+
| Col0 | Col1 | Col2 | Col3 | Col4 | Col5 | Col6 |
+------+------+--------+------+--------+------+--------+
| 2 | 3 | Data 2 | 4 | Data 3 | 5 | Data 4 |
+------+------+--------+------+--------+------+--------+
(alternate) Generate and execute dynamic sql to use conditional aggregation instead of pivot():
/* conditional aggregation */
-- The two variables are already declared when this runs in the same batch as
-- the pivot example; uncomment the declarations to run this part on its own.
-- T-SQL local variables use the @ sigil (# denotes temporary objects).
--declare @cols nvarchar(max);
--declare @sql nvarchar(max);

-- Build one ", ColN = max(case when rn = N then Value end)" line per cell.
-- Ordering by rn (the number, not the generated text) keeps Col10 after Col9.
select @cols = stuff((
    select
        char(10)+' , '
        + 'Col'+convert(nvarchar(10),rn)
        +' = max(case when rn = '+convert(nvarchar(10),rn)+' then Value end)'
    from #temp
    order by rn
    for xml path (''), type).value('.','nvarchar(max)')
    ,1,0,'');

select @sql ='
select Col0 = PreviousID'+@cols+'
from #temp
group by PreviousID';

-- Show the generated statement, then run it.
select @sql as CodeGenerated;
exec sp_executesql @sql;
code generated:
-- Text of the generated statement for the six cells stored in #temp: each
-- ColN picks the Value of the row whose rn equals N.
select Col0 = PreviousID
, Col1 = max(case when rn = 1 then Value end)
, Col2 = max(case when rn = 2 then Value end)
, Col3 = max(case when rn = 3 then Value end)
, Col4 = max(case when rn = 4 then Value end)
, Col5 = max(case when rn = 5 then Value end)
, Col6 = max(case when rn = 6 then Value end)
from #temp
group by PreviousID
returns:
+------+------+--------+------+--------+------+--------+
| Col0 | Col1 | Col2 | Col3 | Col4 | Col5 | Col6 |
+------+------+--------+------+--------+------+--------+
| 2 | 3 | Data 2 | 4 | Data 3 | 5 | Data 4 |
+------+------+--------+------+--------+------+--------+
I think try with concatenating column and give them alias.
For better understanding go through this link https://www.mssqltips.com/sqlservertip/2985/concatenate-sql-server-columns-into-a-string-with-concat/

Update several columns with latest values from another table

Here's the data:
[ TABLE_1 ]
id | prod1 | date1 | prod2 | date2 | prod3 | date3 |
---|--------|--------|--------|--------|--------|-------|
1 | null | null | null | null | null | null |
2 | null | null | null | null | null | null |
3 | null | null | null | null | null | null |
[ TABLE_2 ]
id | date | product |
-----|-------------|-----------|
1 | 20140101 | X |
1 | 20140102 | Y |
1 | 20140103 | Z |
2 | 20141201 | data |
2 | 20141201 | Y |
2 | 20141201 | Z |
3 | 20150101 | data2 |
3 | 20150101 | data3 |
3 | 20160101 | X |
Both tables have other columns not listed here.
date is formatted: yyyymmdd and datatype is int.
[ TABLE_2 ] doesn't have empty rows, just tried to make sample above more readable.
Here's the Goal:
I need to update [ TABLE_1 ] prod1,date1,prod2,date2,prod3,date3
with product collected from [ TABLE_2 ] with corresponding date values.
Data must be sorted so that "latest" product becomes prod1,
2nd latest product will be prod2 and 3rd is prod3.
Latest product = biggest date (int).
If dates are equal, order doesn't matter. (see id=2 and id=3).
Updated [ TABLE_1 ] should be:
id | prod1 | date1 | prod2 | date2 | prod3 | date3 |
---|--------|----------|--------|----------|--------|----------|
1 | Z | 20140103 | Y | 20140102 | X | 20140101 |
2 | data | 20141201 | Y | 20141201 | Z | 20141201 |
3 | X | 20160101 | data2 | 20150101 | data3 | 20150101 |
Ultimate goal is to get the following :
[ TABLE_3 ]
id | order1 | order2 | order3 | + Columns from [ TABLE_1 ]
---|--------------------|----------------------|------------|--------------------------
1 | 20140103:Z | 20140102:Y | 20140103:Z |
2 | 20141201:data:Y:Z | NULL | NULL |
3 | 20160101:X | 20150101:data2:data3 | NULL |
I have to admit this exceeds my knowledge and I haven't tried anything.
Should I do it with JOIN or SELECT subquery?
Should I try to make it in one SQL -clause or perhaps in 3 steps,
each prod&date -pair at the time ?
What about creating [ TABLE_3 ] ?
It has to have columns from [ TABLE_1 ].
Is it easiest to create it from [ TABLE_2 ] -data or Updated [ TABLE_1 ] ?
Any help would be highly appreciated.
Thanks in advance.
I'll post some of my own shots on comments.
After looking into it (after my comment), a stored procedure would be best, that you can call to view the data as a pivot, and do away with TABLE_1. Obviously if you need to make this dynamic, you'll need to look into dynamic pivots, it's a bit of a hack with CTEs:
-- Returns one row per ID with the three most recent TABLE_2 entries side by
-- side: PROD1/DATE1 = latest, PROD2/DATE2 = second latest, PROD3/DATE3 = third
-- latest. Slots without a matching row are NULL.
CREATE PROCEDURE DBO.VIEW_AS_PIVOTED_DATA
AS
WITH RANKED AS (
    -- Rank each ID's rows once, newest first, so a product and its date always
    -- come from the same row. PRODUCT breaks ties between equal dates so the
    -- output is repeatable from run to run.
    SELECT ID,
           [DATE],
           PRODUCT,
           ROW_NUMBER() OVER (PARTITION BY ID ORDER BY [DATE] DESC, PRODUCT) AS RN
    FROM TABLE_2
)
SELECT ID,
       MAX(CASE WHEN RN = 1 THEN PRODUCT END) AS [PROD1],
       MAX(CASE WHEN RN = 1 THEN [DATE]  END) AS [DATE1],
       MAX(CASE WHEN RN = 2 THEN PRODUCT END) AS [PROD2],
       MAX(CASE WHEN RN = 2 THEN [DATE]  END) AS [DATE2],
       MAX(CASE WHEN RN = 3 THEN PRODUCT END) AS [PROD3],
       MAX(CASE WHEN RN = 3 THEN [DATE]  END) AS [DATE3]
FROM RANKED
-- Only the top three rows per ID feed the output columns.
WHERE RN <= 3
GROUP BY ID;
Construction:
-- Per id, lay out the three most recent (product, date) pairs as
-- prod1/date1 .. prod3/date3.
WITH ranked AS (
    -- rn = 1 is the row with the largest date for that id
    SELECT [id]
        ,[date]
        ,[product]
        ,row_number() over (partition by [id] order by [date] desc) rn
    FROM [sistemy].[dbo].[TABLE_2]
)
SELECT id, [prod1],[date1],[prod2],[date2],[prod3],[date3]
FROM
(
    -- One row per cell: col is 'date1', 'prod1', 'date2', ...
    -- rn is cast to varchar(20): a one-character target cannot hold rn >= 10.
    SELECT id, ca.type + cast(rn as varchar(20)) col, ca.value
    FROM ranked
    CROSS APPLY
    (
        SELECT 'date', CAST([date] AS varchar(8))
        UNION ALL
        SELECT 'prod', product
    ) ca(type, value)
    -- Only ranks 1..3 have a target column in the PIVOT list below.
    WHERE rn <= 3
) unpivoted
PIVOT
(
    max(value)
    for col IN ([prod1],[date1],[prod2],[date2],[prod3],[date3])
) pivoted
You need to take a few steps to achive the aim.
Rank your products by date:
-- Number each id's rows from the largest date to the smallest (rn = 1 is the latest).
SELECT
    src.[id],
    src.[date],
    src.[product],
    ROW_NUMBER() OVER (PARTITION BY src.[id] ORDER BY src.[date] DESC) AS rn
FROM [sistemy].[dbo].[TABLE_2] AS src
Unpivot your date and product columns into one column. You can use UNPIVOT OR CROSS APPLY statements. I prefer CROSS APPLY
-- Turn each ranked row into two cells, 'date<rn>' and 'prod<rn>'.
-- `ranked` is the CTE wrapping the ranking query.
-- rn is cast to varchar(20): a one-character target cannot hold rn >= 10.
SELECT id, ca.type + cast(rn as varchar(20)) col, ca.value
FROM ranked
CROSS APPLY
(
    -- the date is rendered as text so both cells share one value column
    SELECT 'date', CAST([date] AS varchar(8))
    UNION ALL
    SELECT 'prod', product
) ca(type, value)
or the same result using UNPIVOT
-- Same cells as the CROSS APPLY variant, produced with UNPIVOT.
-- `ranked` is the CTE wrapping the ranking query.
-- rn is cast to varchar(20): a one-character target cannot hold rn >= 10.
SELECT id, type + cast(rn as varchar(20)) col, value
FROM (
    -- Both source columns are cast to the same type and length, and aliased
    -- to the labels that should appear in `type` ('date' / 'prod').
    SELECT [id],
        rn,
        CAST([date] AS varchar(500)) date,
        CAST([product] AS varchar(500)) prod
    FROM ranked) t
UNPIVOT
(
    -- The IN list must name the derived table's columns: date and prod
    -- (the derived table does not expose a column called product).
    value FOR type IN (date, prod)
) unpvt
and at last you use PIVOTE and get a result.

Resources