How to find distinct values of string columns in hive? [duplicate] - arrays

I have a comma-separated column(string) with duplicate values. I want to remove duplicates:
e.g.
column_name
-----------------
gun,gun,man,gun,man
shuttle,enemy,enemy,run
hit,chase
I want result like:
column_name
----------------
gun,man
shuttle,enemy,run
hit,chase
I am using hive database.

Option 1: keep last occurrence
This will keep the last occurrence of every word.
E.g. 'hello,world,hello,world,hello' will result in 'world,hello'
select regexp_replace
(
column_name
,'(?<=^|,)(?<word>.*?),(?=.*(?<=,)\\k<word>(?=,|$))'
,''
)
from mytable
;
+-------------------+
| gun,man |
| shuttle,enemy,run |
| hit,chase |
+-------------------+
Option 2: keep first occurrence
This will keep the first occurrence of every word.
E.g. 'hello,world,hello,world,hello' will result in 'hello,world'
select reverse
(
regexp_replace
(
reverse(column_name)
,'(?<=^|,)(?<word>.*?),(?=.*(?<=,)\\k<word>(?=,|$))'
,''
)
)
from mytable
;
Option 3: sorted
E.g. 'Cherry,Apple,Cherry,Cherry,Cherry,Banana,Apple' will result in 'Apple,Banana,Cherry'
select regexp_replace
(
concat_ws(',',sort_array(split(column_name,',')))
,'(?<=^|,)(?<word>.*?)(,\\k<word>(?=,|$))+'
,'${word}'
)
from mytable
;

If value sort is not a concern:
with mytable as (
select 'gun,gun,man,gun,man' as column_name union
select 'shuttle,enemy,enemy,run' as column_name union
select 'hit,chase' as column_name
) -- test data
SELECT column_name, concat_ws(',',collect_set(item)) from (
select distinct column_name, s.item from mytable
lateral view explode(split(column_name,',')) s as item
) t
group by column_name
;
+--------------------------+--------------------+--+
| column_name | _c1 |
+--------------------------+--------------------+--+
| gun,gun,man,gun,man | gun,man |
| hit,chase | chase,hit |
| shuttle,enemy,enemy,run | enemy,run,shuttle |
+--------------------------+--------------------+--+
If want to keep the value sorted:
with mytable as (
select 'gun,gun,man,gun,man' as column_name union
select 'shuttle,enemy,enemy,run' as column_name union
select 'hit,chase' as column_name
) -- test data
select column_name,concat_ws(',',collect_set(item)) as column_name_distincted
from (
select column_name,item, min(pos) as pos
from (
select column_name,pos,item
from mytable
lateral view posexplode(split(column_name,',')) s as pos,item
) t
group by column_name,item
order by column_name,pos
) t
group by column_name
;
+--------------------------+-------------------------+--+
| column_name | column_name_distincted |
+--------------------------+-------------------------+--+
| gun,gun,man,gun,man | gun,man |
| hit,chase | hit,chase |
| shuttle,enemy,enemy,run | shuttle,enemy,run |
+--------------------------+-------------------------+--+

Related

Snowflake count nulls in all columns

I've seen a few questions like this - Count NULL Values from multiple columns with SQL
But is there really not a way to count nulls in a table with say, over 30 columns? Like I don't want to specify them all by name?
But is there really not a way to count nulls in a table with say, over 30 columns? Like I don't want to specify them all by name?
yes exactly that. I don't understand why it's so difficult - it's like 1 line in pandas?
Keypoint here is if something is not provided as "batteries included" then you need to write your own version. It is not so hard as it may look.
Let's say the input table is as follow:
CREATE OR REPLACE TABLE t AS SELECT $1 AS col1, $2 AS col2, $3 AS col3, $4 AS col4
FROM VALUES (1,2,3,10),(NULL,2,3,10),(NULL,NULL,4,10),(NULL,NULL,NULL,10);
SELECT * FROM t;
/*
+------+------+------+------+
| COL1 | COL2 | COL3 | COL4 |
+------+------+------+------+
| 1 | 2 | 3 | 10 |
| NULL | 2 | 3 | 10 |
| NULL | NULL | 4 | 10 |
| NULL | NULL | NULL | 10 |
+------+------+------+------+
*/
You probably know how to write the query that gives the desired output, but as it was not provided in the question I will use my own version:
WITH cte AS (
SELECT
COUNT(*) AS total_rows
,total_rows - COUNT(col1) AS col1
,total_rows - COUNT(col2) AS col2
,total_rows - COUNT(col3) AS col3
,total_rows - COUNT(col4) AS col4
FROM t
)
SELECT COLUMN_NAME, NULLS_COLUMN_COUNT,SUM(NULLS_COLUMN_COUNT) OVER() AS NULLS_TOTAL_COUNT
FROM cte
UNPIVOT (NULLS_COLUMN_COUNT FOR COLUMN_NAME IN (col1,col2,col3, col4))
ORDER BY COLUMN_NAME;
/*
+-------------+--------------------+-------------------+
| COLUMN_NAME | NULLS_COLUMN_COUNT | NULLS_TOTAL_COUNT |
+-------------+--------------------+-------------------+
| COL1 | 3 | 6 |
| COL2 | 2 | 6 |
| COL3 | 1 | 6 |
| COL4 | 0 | 6 |
+-------------+--------------------+-------------------+
*/
Here we could see that the query is "static" in nature with few moving parts(column_count_list/table_name/column_list):
WITH cte AS (
SELECT
COUNT(*) AS total_rows
<column_count_list>
FROM <table_name>
)
SELECT COLUMN_NAME, NULLS_COLUMN_COUNT,SUM(NULLS_COLUMN_COUNT) OVER() AS NULLS_TOTAL_COUNT
FROM cte
UNPIVOT (NULLS_COLUMN_COUNT FOR COLUMN_NAME IN (<column_list>))
ORDER BY COLUMN_NAME;
Now using the metadata and variables:
-- input
SET sch_name = 'my_schema';
SET tab_name = 't';
SELECT
LISTAGG(c.COLUMN_NAME, ', ') WITHIN GROUP(ORDER BY c.COLUMN_NAME) AS column_list
,ANY_VALUE(c.TABLE_SCHEMA || '.' || c.TABLE_NAME) AS full_table_name
,LISTAGG(REPLACE(SPACE(6) || ',total_rows - COUNT(<col_name>) AS <col_name>'
|| CHAR(13)
, '<col_name>', c.COLUMN_NAME), '')
WITHIN GROUP(ORDER BY COLUMN_NAME) AS column_count_list
,REPLACE(REPLACE(REPLACE(
'WITH cte AS (
SELECT
COUNT(*) AS total_rows
<column_count_list>
FROM <table_name>
)
SELECT COLUMN_NAME, NULLS_COLUMN_COUNT,SUM(NULLS_COLUMN_COUNT) OVER() AS NULLS_TOTAL_COUNT
FROM cte
UNPIVOT (NULLS_COLUMN_COUNT FOR COLUMN_NAME IN (<column_list>))
ORDER BY COLUMN_NAME;'
,'<column_count_list>', column_count_list)
,'<table_name>', full_table_name)
,'<column_list>', column_list) AS query_to_run
FROM INFORMATION_SCHEMA.COLUMNS c
WHERE TABLE_SCHEMA = UPPER($sch_name)
AND TABLE_NAME = UPPER($tab_name);
Running the code will generate the query to be run:
Copying the output and running it will give the output. This template could be further refined and wrapped with stored procedure if needed(but I will left it as an exercise).
#chris you should note that the metadata in Snowflake is similar to SQL Server. So anything you want to know at metadata level, would have already been solved by SQL Server practitioners.
See this link - Count number of NULL values in each column in SQL
This is different in Oracle where the metadata table gives the number of nulls in each column as well as density.

How to SQL PIVOT on two columns and with dynamic column names?

I have a key value pair set of rows to associate to a unique identifier (ApplicationId).
The data would look something like this:
| ApplicationId | Key | Value | Date |
| 123 | A | abc | 2020-3-1 14:00:01.000 |
| 123 | B | abd | 2020-3-1 14:00:02.000 |
| 123 | C | abe | 2020-3-1 14:00:03.000 |
| 124 | A | abf | 2020-3-1 14:01:00.000 |
| 124 | D | abg | 2020-3-1 14:01:01.000 |
The end result i'm looking for would be this:
| ApplicationId | A | A_Date | B | B_Date | C | C_Date | D | D_Date |
| 123 | abc | 2020-3-1 14:00:01.000 | abd | 2020-3-1 14:00:02.000 | abe | 2020-3-1 14:00:03.000 | NULL | NULL |
| 124 | abf | 2020-3-1 14:01:00.000 | NULL | NULL | NULL | NULL | abg | 2020-3-1 14:01:01.000 |
The Keys A,B,C,D are unknown so hard coding the column names isn't possible.
Here is something that works with one PIVOT
IF OBJECT_ID('tempdb.dbo.#_BLAH') IS NOT NULL DROP TABLE #_BLAH
SELECT et.[ApplicationId] et.[Key], et.[Value], et.[Date]
INTO #_BLAH
FROM ExampleTbl et
WHERE et.[Date] > DATEADD(dd, -1, GetDate())
DECLARE #_cols AS NVARCHAR(MAX)
DECLARE #_sql AS NVARCHAR(MAX)
SELECT
#_cols += QUOTENAME([Key]) + ','
FROM
#_BLAH
GROUP BY
[Key];
SET #_cols = STUFF((SELECT ',' + QUOTENAME(T.[Key])
FROM #_BLAH AS T
GROUP BY T.[Key]
FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'),1,1,'')
set #_sql = 'SELECT [ApplicationId], ' + #_cols + '
FROM ( SELECT * FROM #_BLAH) AS SRC
PIVOT ( MAX([Value]) FOR [Key] IN (' + #_cols + ') ) AS p';
EXEC(#_sql)
I've so far been unable to find an example or an article attempting to make a second dynamic column and adding in the value that relates the specific Key in my example.
My SQL above will accomplish creating the row i want except for the #_Date column i need.
Try this:
DROP TABLE IF EXISTS #DataSource;
DROP TABLE IF EXISTS #DataSourcePrepared;
CREATE TABLE #DataSource
(
[ApplicationId] INT
,[Key] CHAR(1)
,[Value] VARCHAR(12)
,[Date] DATETIME2(0)
);
INSERT INTO #DataSource ([ApplicationId], [Key], [Value], [Date])
VALUES (123, 'A', 'abc', '2020-3-1 14:00:01.000')
,(123, 'B', 'abd', '2020-3-1 14:00:02.000')
,(123, 'C', 'abe', '2020-3-1 14:00:03.000')
,(124, 'A', 'abf', '2020-3-1 14:01:00.000')
,(124, 'D', 'abg', '2020-3-1 14:01:01.000');
CREATE TABLE #DataSourcePrepared
(
[ApplicationId] INT
,[ColumnName] VARCHAR(32)
,[Value] VARCHAR(32)
)
INSERT INTO #DataSourcePrepared ([ApplicationId], [ColumnName], [Value])
SELECT [ApplicationId]
,[Key]
,[value]
FROM #DataSource
UNION ALL
SELECT [ApplicationId]
,[Key] + '_Date'
,CONVERT(VARCHAR(19), [Date], 121)
FROM #DataSource;
DECLARE #DymanimcTSQLSatement NVARCHAR(MAX)
,#DynamicColumns NVARCHAR(MAX);
SET #DynamicColumns = STUFF
(
(
SELECT ',' + QUOTENAME([ColumnName])
FROM #DataSourcePrepared
GROUP BY [ColumnName]
ORDER BY [ColumnName]
FOR XML PATH(''), TYPE
).value('.', 'NVARCHAR(MAX)')
,1
,1
,''
);
SET #DymanimcTSQLSatement = N'
SELECT *
FROM #DataSourcePrepared
PIVOT
(
MAX([value]) FOR [ColumnName] IN (' + #DynamicColumns +')
) PVT;';
EXECUTE sp_executesql #DymanimcTSQLSatement;
You just need to prepare the data before the actual PIVOT. Also, note that I am ordering the columns when I am building the dynamic part by name. In your real case, you may want to change this to something complex.
you can try this
DECLARE #_cols AS NVARCHAR(MAX) =''
DECLARE #_sql AS NVARCHAR(MAX)
SELECT
#_cols +=','+ QUOTENAME([Key]) + ',' + QUOTENAME([Key]+'_Date')
FROM
(SELECT DISTINCT [Key] FROM ExampleTbl) T
SET #_cols = STUFF(#_cols,1,1,'')
set #_sql = 'SELECT * FROM (
SELECT ApplicationId, [Key], Value FROM ExampleTbl
UNION ALL
SELECT ApplicationId, [Key] + ''_Date'' AS [Key], CONVERT(VARCHAR(30), [Date],121 ) AS Value FROM ExampleTbl
) SRC
PIVOT (MAX(Value) FOR [Key] IN ('+#_cols +' )) AS PVT';
EXEC(#_sql)
Result:
ApplicationId A A_Date B B_Date C C_Date D D_Date
------------- ------- --------------------------- ---------- -------------------------- ------------ ------------------------- ------- -------------------------
123 abc 2020-03-01 14:00:01.000 abd 2020-03-01 14:00:02.000 abe 2020-03-01 14:00:03.000 NULL NULL
124 abf 2020-03-01 14:01:00.000 NULL NULL NULL NULL abg 2020-03-01 14:01:01.000

Select subsequent numbers in the table with result in ranges

I need to deliver a result in the following format: [12-14][20][34-35]
Is there any more intelligent or elegant approach than iterating cursor on the select of this column?
The select might be quite large.
Table:
|---------------------|
| col1 |
|---------------------|
| 12 |
|---------------------|
| 13 |
|---------------------|
| 14 |
|---------------------|
| 20 |
|---------------------|
| 34 |
|---------------------|
| 35 |
|---------------------|
As others have said, this is a gaps and islands problem so you need to work out which numbers are the start of a sequence and which numbers are the end of a sequence.
I did this using two subqueries and joining them together:
SELECT CASE
WHEN startSeq.Col1 = endSeq.Col1
THEN '[' + CONVERT(VARCHAR(2),startSeq.Col1) +']'
ELSE '[' + CONVERT(VARCHAR(2),startSeq.Col1) + '-' + CONVERT(VARCHAR(2),endSeq.Col1) + ']'
END
FROM (
SELECT Col1,
ROW_NUMBER() OVER(ORDER BY Col1) AS RowN
FROM Nums a
WHERE NOT EXISTS (
SELECT Col1
FROM Nums b
WHERE b.Col1 = a.Col1 - 1
)
) startSeq
JOIN (
SELECT Col1,
ROW_NUMBER() OVER(ORDER BY Col1) AS RowN
FROM Nums a
WHERE NOT EXISTS (
SELECT Col1
FROM Nums b
WHERE b.Col1 = a.Col1 + 1
)
) endSeq
ON startSeq.RowN = endSeq.RowN

mssql - retrieve unique values of a column based on another column

I have a table with two columns: ColumnA, ColumnB, with rows:
| A | 1 |
| B | 1 |
| B | 2 |
| C | 1 |
| C | 1 |
| C | 1 |
| A | 2 |
| B | 1 |
| A | 2 |
| A | 1 |
I would like to write a query that would return all unique values for ColumnB, for each unique value of ColumnA, where ColumnA has more than 1 value in ColumnB i.e.
| A | 1 |
| A | 2 |
| B | 1 |
| B | 2 |
C 1 should be omitted because there is only one distinct value for ColumnA = 'C'
There might be a simpler approach but this works:
SELECT t.ColumnA, t2.ColumnB
FROM ( select ColumnA
from dbo.TableName t
group by t.ColumnA
having count(distinct t.ColumnB) > 1) t
CROSS APPLY ( select distinct t2.ColumnB
from dbo.TableName t2
where t.ColumnA=t2.ColumnA ) t2
The first subquery returns all unique ColumnA values that have multiple (different) ColumnB values. The 2nd subquery returns all distinct ColumnB values of those ColumnA-values with CROSS APPLY.
SELECT DISTINCT * FROM x WHERE ColumnA IN(
SELECT xd.ColumnA
FROM (
SELECT DISTINCT ColumnA, ColumnB FROM x
) xd
GROUP BY xd.ColumnA HAVING COUNT(*) > 1
)
SELECT y.ColumnA, y.ColumnB
FROM (
SELECT ColumnA, ColumnB, COUNT(*) OVER (PARTITION BY ColumnA) m
FROM x
GROUP BY ColumnA, ColumnB
) y
WHERE m > 1

SQL Server Transpose Rows into Columns using IDs as Column Names

I have a large file that has the following fields:
Table 1:
+---------+--------+-----------+
| User_Id | Key_Id | Value |
+---------+--------+-----------+
| 100 | 74 | 37 |
| 100 | 65 | Male |
| 100 | 279 | G235467 |
+---------+--------+-----------+
and I have another file that tells what each 'Key_Id' is called (they are column names) e.g.
Table 2:
+--------+------------------+
| Key_Id | Key |
+--------+------------------+
| 65 | Gender |
| 66 | Height |
| 74 | Age |
| 279 | ReferenceNo |
I want to create a table using the Key_Id names found in the Key column of table 2, transpose all of the values from table 1 into table 2, but also include the User_Id from table 1 as this relates to an individual.
PS. Table 2 has nearly 300 keys that would need turning into individual fields
So ultimately I would like a table that looks like this:
+---------+---------+--------+-------+--------------+--------+
| User_Id | Gender | Height | Age | ReferenceNo | etc |
+---------+---------+--------+-------+--------------+--------+
| 100 | Male | | 37 | G235467 | |
So that each User_Id is a row and that all the Keys are columns with their respective values
You can use a dynamic sql query as below.
Query
declare #sql as varchar(max);
select #sql = 'select t1.[User_Id], ' + stuff((select +
', max(case t2.[Key_Id] when ' + cast([Key_Id] as varchar(100)) +
' then t1.[Value] end) as [' + [Key] + '] '
from Table2
for xml path('')
), 1, 2, '') +
'from Table1 t1 left join Table2 t2 on t1.[Key_Id] = t2.[Key_Id] group by t1.[User_Id];'
exec(#sql);
Find a demo here
You need to get a coma-separated list of those 300 key names to be used in PIVOT/UNPIVOT operators in T-SQL like described here
https://learn.microsoft.com/en-us/sql/t-sql/queries/from-using-pivot-and-unpivot
you can use pivot as below:
Select * from (
Select u.UserId, k.[key], u.[Value] from table1 u
join table2 k on u.keyid = k.keyid ) a
pivot ( max([Value]) for [key] in ([Gender], [Height], [Age], [ReferenceNo]) ) p
For dynamic list of keys you can use dynamic sql as below:
Declare #cols1 varchar(max)
Declare #query nvarchar(max)
Select #cols1 = stuff((select ','+QuoteName([Key]) from table2 group by [Key] for xml path('')),1,1,'')
Set #Query = 'Select * from (
Select u.UserId, k.[key], u.[Value] from table1 u
join table2 k on u.keyid = k.keyid ) a
pivot ( max([Value]) for [key] in (' + #cols1 + ') ) p '
Select #Query --Check the generated query and execute by uncommenting below query
--exec sp_executesql #Query

Resources