Delete duplicated row and update the row using the duplicated row id - sql-server

This is the scenario: I have duplicate rows in my table with the same Id, Name, and so on.
1) I have to find the duplicate rows matching all the criteria (this is done).
2) Delete them only if the criteria match.
3) Use the Id of the deleted record to update the existing row in the table.
For this I have created two temp tables: Temp1 is the table with all the records, and Temp2 consists of the duplicated rows.
-- Reset the scratch tables so the script can be re-run in the same session.
IF OBJECT_ID('tempdb..#Temp1') IS NOT NULL DROP TABLE #Temp1;
IF OBJECT_ID('tempdb..#Temp2') IS NOT NULL DROP TABLE #Temp2;
IF OBJECT_ID('tempdb..#Temp3') IS NOT NULL DROP TABLE #Temp3;

-- #Temp1 holds every student row, including the duplicated ones.
CREATE TABLE #Temp1
(
    Id        INT,
    Name      NVARCHAR(64),
    StudentNo INT         NULL,
    ClassCode NVARCHAR(8) NULL,
    Section   NVARCHAR(8) NULL
);

-- Sample data: Joe and Harry are exact duplicates; the two Elle rows share
-- Id and StudentNo but differ in Section ('E1' vs 'E').
INSERT INTO #Temp1 (Id, Name, StudentNo, ClassCode, Section)
VALUES
    (1, 'Joe',   123, 'A1', 'I'),
    (1, 'Joe',   123, 'A1', 'I'),
    (2, 'Harry', 113, 'X2', 'H'),
    (2, 'Harry', 113, 'X2', 'H'),
    (3, 'Elle',  121, 'J1', 'E1'),
    (3, 'Elle',  121, 'J1', 'E'),
    (8, 'Jane',  191, 'A1', 'E'),
    (5, 'Silva', 811, 'S1', 'SE'),
    (6, 'Juan',  411, 'S2', 'SE'),
    (7, 'Carla', 431, 'S2', 'SE');
-- Collect the rows regarded as duplicates from #Temp1 into #Temp2.
-- The row number restarts for every (Id, StudentNo) pair; Name, ClassCode and
-- Section are not part of the partition, so rows differing only in those
-- columns are still numbered as duplicates of each other.
;WITH CTE AS (
select
ROW_NUMBER() over (partition by Id
, StudentNo
order by Id, StudentNo)as Duplicate_RowNumber
, * from #Temp1 )
select t1.Id,t1.Name,t1.StudentNo,t1.Section,t1.ClassCode
INTO #Temp2
from CTE as c INNER JOIN #Temp1 as t1 ON t1.Id = c.Id
-- NOTE(review): the next predicate compares t1.StudentNo with itself, so it does
-- not relate t1 to c; the join is effectively on Id alone.
and t1.StudentNo = t1.StudentNo
and c.Duplicate_RowNumber >1
-- With the sample data this yields 6 rows: each CTE row numbered 2 (Ids 1, 2, 3)
-- joins to both #Temp1 rows sharing its Id, so both Elle rows are included.
--select * from #Temp2
-- Capture table for the OUTPUT clause.
-- A #-prefixed name is a temp table and must be created with CREATE TABLE;
-- DECLARE ... TABLE only accepts @-prefixed table variables, so
-- "DECLARE #inserted Table" does not parse.
IF OBJECT_ID('tempdb..#inserted') IS NOT NULL
    DROP TABLE #inserted;

CREATE TABLE #inserted
(
    Id        INT,
    Name      NVARCHAR(64),
    StudentNo INT         NULL,
    ClassCode NVARCHAR(8) NULL,
    Section   NVARCHAR(8) NULL
);

-- Delete every #Temp1 row that has a match on all five columns in #Temp2 and
-- record each removed row in #inserted. Equality never matches NULLs, so rows
-- with a NULL in any compared column are left alone.
DELETE FROM #Temp1
OUTPUT deleted.Id, deleted.Name, deleted.StudentNo, deleted.ClassCode, deleted.Section
    INTO #inserted (Id, Name, StudentNo, ClassCode, Section)
WHERE EXISTS (
    SELECT *
    FROM #Temp2 AS t2
    WHERE #Temp1.Id        = t2.Id
      AND #Temp1.Name      = t2.Name
      AND #Temp1.StudentNo = t2.StudentNo
      AND #Temp1.ClassCode = t2.ClassCode
      AND #Temp1.Section   = t2.Section
);

-- Show what was deleted, so it can be joined back to update #Temp1.
SELECT Id, Name, StudentNo, ClassCode, Section
FROM #inserted;
As you can see below, the query should not delete the last two highlighted rows, because their Section values do not match. It should delete only the rows that match on all criteria between Temp1 and Temp2.
Scenario 2: Delete the duplicate records in Temp1 and use the key to set Section and ClassCode to NULL on the remaining row. This is what I expect, with the highlighted values set to NULL.
You can run this query yourself - just copy and paste.

Yes, for scenario #1 it is going to delete the rows, because the problem is in this section.
I added this table for reference.
I added this #Temp2 table definition so that it can be used later.
-- #Temp2 receives the rows removed as duplicates (filled through OUTPUT ... INTO).
-- Drop any earlier #Temp2 first: the question's script already creates one with
-- SELECT ... INTO, and CREATE TABLE fails if the name exists in the session.
IF OBJECT_ID('tempdb..#Temp2') IS NOT NULL
    DROP TABLE #Temp2;

-- Column types mirror #Temp1 (NVARCHAR) so Unicode values are stored unchanged.
CREATE TABLE #Temp2
(
    Id        INT,
    Name      NVARCHAR(64),
    StudentNo INT         NULL,
    ClassCode NVARCHAR(8) NULL,
    Section   NVARCHAR(8) NULL
);
IF OBJECT_ID('tempdb..#tmp4') IS NOT NULL
    DROP TABLE #tmp4;
IF OBJECT_ID('tempdb..#Duplicatedata') IS NOT NULL
    DROP TABLE #Duplicatedata;

-- Reproduces the question's join with the row number exposed, to show which
-- rows it selects. A CTE is only visible to the single statement that follows
-- it, so it has to be declared again here.
;WITH CTE AS (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY Id, StudentNo
                           ORDER BY Id, StudentNo) AS Duplicate_RowNumber,
        *
    FROM #Temp1
)
SELECT
    t1.Id,
    t1.Name,
    t1.StudentNo,
    t1.Section,
    t1.ClassCode,
    c.Duplicate_RowNumber
INTO #Duplicatedata
FROM CTE AS c
INNER JOIN #Temp1 AS t1
    ON  t1.Id = c.Id
    -- Kept exactly as in the question: this compares t1.StudentNo with itself,
    -- so the join is effectively on Id alone.
    AND t1.StudentNo = t1.StudentNo
    AND c.Duplicate_RowNumber > 1;

SELECT Id, Name, StudentNo, Section, ClassCode, Duplicate_RowNumber
FROM #Duplicatedata;
This satisfies both conditions: #Temp1 contains both rows for Elle, and your join condition is effectively only on Id and StudentNo.
I added the row number column for clarity.
Id Name StudentNo Section ClassCode Duplicate_RowNumber
1 Joe 123 I A1 2
1 Joe 123 I A1 2
2 Harry 113 H X2 2
2 Harry 113 H X2 2
3 Elle 121 E1 J1 2
3 Elle 121 E J1 2
Because your partition is based only on StudentNo and Id, every duplicate after the first row of a partition gets a row number of 2 or more.
You can use this approach to delete.
-- Copy #Temp1 into #tmp4 with a running number per (Id, StudentNo) pair.
-- Within one partition Id and StudentNo are constant, so Section alone
-- decides the numbering order.
SELECT
    ROW_NUMBER() OVER (PARTITION BY src.Id, src.StudentNo
                       ORDER BY src.Section) AS Duplicate_RowNumber,
    src.*
INTO #tmp4
FROM #Temp1 AS src;

-- Everything after the first row of a pair is a duplicate: remove it from
-- #tmp4 and keep a copy of the removed row in #Temp2.
DELETE FROM #tmp4
OUTPUT
    deleted.Id,
    deleted.Name,
    deleted.StudentNo,
    deleted.ClassCode,
    deleted.Section
INTO #Temp2
WHERE Duplicate_RowNumber >= 2;
After that, it seems you want to keep one row in your final table and put the other one in your deleted table. For Elle, it will delete one of the rows from the final table and keep only one, since your partition is not based on Section.
To make sure that you delete only one row from your final table, you can use this.
-- Number the #temp1 rows per (Name, StudentNo); the first row of each group
-- is the one that survives.
;WITH numbered AS (
    SELECT
        tm.*,
        ROW_NUMBER() OVER (PARTITION BY tm.Name, tm.StudentNo
                           ORDER BY tm.Id, tm.StudentNo, tm.Section) AS rownum
    FROM #temp1 AS tm
)
-- Delete the non-first rows that also appear, column for column, in #Temp2,
-- logging each deleted row into #inserted.
DELETE FROM numbered
OUTPUT
    deleted.Id,
    deleted.Name,
    deleted.StudentNo,
    deleted.ClassCode,
    deleted.Section
INTO #inserted
WHERE numbered.rownum > 1
  AND EXISTS (
        SELECT 1
        FROM #Temp2 AS t2
        WHERE t2.Id        = numbered.Id
          AND t2.Name      = numbered.Name
          AND t2.StudentNo = numbered.StudentNo
          AND t2.ClassCode = numbered.ClassCode
          AND t2.Section   = numbered.Section
      );
Notice that I added this row number so that the statement does not delete both rows from the final table: Joe and Harry match on all attributes, so without it both of their rows would be deleted.
-- List the rows captured by the OUTPUT clause of the delete.
SELECT Id, Name, StudentNo, ClassCode, Section
FROM #inserted;
Output you get:
Id Name StudentNo ClassCode Section
3 Elle 121 J1 E1
2 Harry 113 X2 H
1 Joe 123 A1 I
Finally you can update final table in this way. #Scenario 2
-- Scenario 2: blank ClassCode and Section on every surviving #Temp1 row whose
-- (Id, StudentNo) pair had a duplicate removed (i.e. appears in #INSERTED).
UPDATE survivor
SET
    ClassCode = NULL,
    Section   = NULL
FROM #Temp1 AS survivor
WHERE EXISTS (
    SELECT 1
    FROM #INSERTED AS removed
    WHERE removed.Id        = survivor.Id
      AND removed.StudentNo = survivor.StudentNo
);

SELECT *
FROM #Temp1;
Final Output:
Id Name StudentNo ClassCode Section
1 Joe 123 NULL NULL
2 Harry 113 NULL NULL
3 Elle 121 NULL NULL
8 Jane 191 A1 E
5 Silva 811 S1 SE
6 Juan 411 S2 SE
7 Carla 431 S2 SE
Please note that I have added scripts and output only for the parts where it required change, and rest part is same script provided by you.

Related

Delete duplicate rows and its dependencies in other tables by using Temporary tables

I have a product table which contains duplicated rows, and its unique Id is referenced in 6 other tables.
I want to delete these duplicate rows from the product table and from the other tables that depend on my product.
I think about using temporary tables to:
create global temporary table for each table related to my product
get duplicate records
get its dependencies
save the first rows with min Id
delete the other rows
I have this idea, but I don't know how to well implement it.
I don't know, if the choice of temp tables is right or not.
Thanks for your help and advises.
Assuming that duplicate products have the same product.name
If there are more criteria for a dup, then adapt the criteria in the EXISTS accordingly.
create table #tmpProductsToDelete (product_id int primary key);
--
-- Collect every product for which a row with the same name and a LOWER id
-- exists. The lowest id per name is therefore never collected and survives,
-- which matches the stated goal of keeping the row with the min Id.
-- (Comparing with ">" instead would keep the highest id per name.)
--
insert into #tmpProductsToDelete (product_id)
select t1.id
from dbo.Product t1
where exists
(
    select 1
    from dbo.Product t2
    where t2.name = t1.name
    -- and t2.colA = t1.colA
    -- and t2.colB = t1.colB
      and t2.id < t1.id
);
Then double check if those are the products to delete.
-- Review the products that are queued for deletion before removing anything.
SELECT p.*
FROM dbo.Product AS p
WHERE EXISTS (
    SELECT 1
    FROM #tmpProductsToDelete AS d
    WHERE d.product_id = p.id
);
Maybe first copy those dups into a copy of Product.
Same for the 6 tables with a FK to the Product table.
-- Back up the rows that are about to be deleted into dbo.cpy* tables, stamped
-- with the removal time.
-- Each copy table is created empty on first use from two TOP 0 selects.
-- NOTE(review): the second branch (RemoveOn = NULL) is presumably there so that
-- RemoveOn becomes nullable and no IDENTITY property is carried over - confirm.
-- UNION ALL is used for all three tables: both branches return zero rows, so
-- the implicit DISTINCT of a plain UNION adds nothing, and plain UNION cannot
-- be evaluated over column types that are not comparable.
IF OBJECT_ID('dbo.cpyProduct', 'U') IS NULL
BEGIN
    SELECT TOP 0 *, GetDate() as RemoveOn
    INTO dbo.cpyProduct FROM dbo.Product
    UNION ALL
    SELECT TOP 0 *, NULL FROM dbo.Product;
END;
INSERT INTO dbo.cpyProduct
SELECT *, GetDate() AS RemoveOn
FROM dbo.Product
WHERE id IN (select product_id from #tmpProductsToDelete);

IF OBJECT_ID('dbo.cpyTable1', 'U') IS NULL
BEGIN
    SELECT TOP 0 *, GetDate() as RemoveOn
    INTO dbo.cpyTable1 FROM dbo.Table1
    UNION ALL
    SELECT TOP 0 *, NULL FROM dbo.Table1;
END;
INSERT INTO dbo.cpyTable1
SELECT *, GetDate() AS RemoveOn
FROM dbo.Table1
WHERE product_id IN (select product_id from #tmpProductsToDelete);

IF OBJECT_ID('dbo.cpyTable2', 'U') IS NULL
BEGIN
    SELECT TOP 0 *, GetDate() as RemoveOn
    INTO dbo.cpyTable2 FROM dbo.Table2
    UNION ALL
    SELECT TOP 0 *, NULL FROM dbo.Table2;
END;
INSERT INTO dbo.cpyTable2
SELECT *, GetDate() AS RemoveOn
FROM dbo.Table2
WHERE product_id IN (select product_id from #tmpProductsToDelete);

-- Rinse & repeat for the other 4 tables

-- Inspect the backups.
SELECT * FROM dbo.cpyProduct;
SELECT * FROM dbo.cpyTable1;
SELECT * FROM dbo.cpyTable2;
Then cleanup.
--
-- Delete the duplicates from the 6 tables with a FK to the products table and
-- then from the products table itself.
-- The seven deletes run as one unit: with XACT_ABORT ON any run-time error
-- rolls the whole transaction back, so a failure cannot leave the referencing
-- tables cleaned up while the duplicate products are still present.
--
set xact_abort on;
begin transaction;

delete from dbo.Table1
where product_id in (select product_id from #tmpProductsToDelete);
delete from dbo.Table2
where product_id in (select product_id from #tmpProductsToDelete);
delete from dbo.Table3
where product_id in (select product_id from #tmpProductsToDelete);
delete from dbo.Table4
where product_id in (select product_id from #tmpProductsToDelete);
delete from dbo.Table5
where product_id in (select product_id from #tmpProductsToDelete);
delete from dbo.Table6
where product_id in (select product_id from #tmpProductsToDelete);

-- remove the dups from the base table (last, so no FK still points at them)
delete from dbo.Product
where id in (select product_id from #tmpProductsToDelete);

commit transaction;
A test on rextester here
-- Demo table: (colA, colB) is taken to be the combination that should be unique.
CREATE TABLE dbo.hasduplicates
(
    id              INT IDENTITY,
    colA            VARCHAR(10),
    colB            INT,
    someOtherColumn VARCHAR(40)
);

INSERT INTO dbo.hasduplicates (colA, colB, someOtherColumn)
VALUES
    ('A', 1, 'A1 - 1'),
    ('A', 1, 'A1 - 2'),
    ('A', 1, 'A1 - 3'),
    ('A', 2, 'A2 - 1'),
    ('A', 2, 'A2 - 2'),
    ('B', 1, 'B1 - 1'),
    ('B', 1, 'B1 - 2'),
    ('B', 1, 'B1 - 3');

SELECT *
FROM dbo.hasduplicates;

-- Temp table holding the ids of the duplicates that are to be deleted.
CREATE TABLE #ToBedeleted (IdToDelete INT);

-- Number the rows of each (colA, colB) group by id (put the columns of your
-- entity in the PARTITION BY); every row after the first is a duplicate.
INSERT INTO #ToBedeleted (IdToDelete)
SELECT ranked.id
FROM (
    SELECT
        h.id,
        ROW_NUMBER() OVER (PARTITION BY h.colA, h.colB ORDER BY h.id) AS RowNum
    FROM dbo.hasduplicates AS h
) AS ranked
WHERE ranked.RowNum > 1;

-- Contains the ids for deletion.
SELECT *
FROM #ToBedeleted;

-- Cleanup of the referencing tables (left commented out, as a template):
/*
DELETE FROM dbo.Table1 WHERE Table1Id IN (SELECT IdToDelete FROM #ToBedeleted);
DELETE FROM dbo.Table2 WHERE Table2Id IN (SELECT IdToDelete FROM #ToBedeleted);
.............
DELETE FROM dbo.Table6 WHERE Table6Id IN (SELECT IdToDelete FROM #ToBedeleted);
--finally cleanup your products table
DELETE FROM dbo.hasduplicates WHERE Id IN (SELECT IdToDelete FROM #ToBedeleted);
*/

-- Tear down the demo objects.
DROP TABLE #ToBedeleted;
DROP TABLE dbo.hasduplicates;
One method is to store the duplicate IDs in a variable and delete the duplicate records based on those IDs. (Assuming ProductRefTable is the referencing table and ProductId is its foreign key.)
CREATE TABLE Product
(
    ID INT NOT NULL IDENTITY(1,1),
    Value INT,
    CONSTRAINT PK_ID PRIMARY KEY(ID)
);

INSERT INTO Product ([Value])
VALUES (1),(2),(3),(4),(5),(5),(3),(5);

-- Local variables are prefixed with @ (a # prefix denotes a temp table).
DECLARE @DupIDS varchar(max) = '';

-- Comma-separated list of every ID that is not the lowest ID for its Value.
-- The CAST to varchar(max) makes STRING_AGG return varchar(max), so a long
-- list does not hit the 8000-byte limit of the non-max result type.
SELECT @DupIDS = STRING_AGG(CAST(ID AS varchar(max)), ',')
FROM Product
WHERE ID NOT IN (SELECT MIN(ID)
                 FROM Product
                 GROUP BY Value);

-- Delete the referencing rows first, so a foreign key from
-- ProductRefTable.ProductId to Product.ID cannot block the second delete.
-- STRING_SPLIT is a table-valued function: it goes in FROM and exposes a
-- single column named value.
DELETE FROM ProductRefTable
WHERE ProductId IN (SELECT CAST(value AS int) FROM STRING_SPLIT(@DupIDS, ','));

DELETE FROM Product
WHERE ID IN (SELECT CAST(value AS int) FROM STRING_SPLIT(@DupIDS, ','));

SQL Server: How to select missing rows in table from another table?

I have two tables like below:
table1:
StoreId SKU
------------
1 abc
2 abc
3 abc
1 xyz
4 xyz
table2:
StoreId
--------
1
2
3
4
5
I want to select missing storeid from the table1 which are in table 2. But condition is that in above example for SKU abc storeid 4 and 5 are missing and for sku xyz 2,3,5 are missing. So I want below table as output
SKU,ID
------
abc 4
abc 5
xyz 2
xyz 3
xyz 5
I am able to pull only the overall missing store which is 5 using below query.
-- Attempt from the question: keeps only the #table2 stores that have no row
-- at all in #table1, regardless of SKU.
SELECT
    sku_rows.SKU,
    stores.StoreId
FROM #table1 AS sku_rows
FULL OUTER JOIN #table2 AS stores
    ON sku_rows.StoreId = stores.StoreId
WHERE sku_rows.StoreId IS NULL
Below is test create and insert query.
-- Test data for the queries that read #table1 / #table2.
-- A #-prefixed name is a temp table and must be created with CREATE TABLE;
-- DECLARE ... TABLE only accepts @-prefixed table variables.
IF OBJECT_ID('tempdb..#table1') IS NOT NULL
    DROP TABLE #table1;
IF OBJECT_ID('tempdb..#table2') IS NOT NULL
    DROP TABLE #table2;

-- SKU / store combinations that exist.
CREATE TABLE #table1
(
    StoreId varchar(4),
    SKU     varchar(5)
);

-- All stores.
CREATE TABLE #table2
(
    StoreId int
);

INSERT INTO #table1 (SKU, StoreId)
VALUES
    ('abc', 1),
    ('abc', 2),
    ('abc', 3),
    ('xyz', 1),
    ('xyz', 4);

INSERT INTO #table2 (StoreId)
VALUES (1), (2), (3), (4), (5);
Thank you
You need to get a list of all SKU and store combinations, and then show only the combinations which do not appear in table1:
-- Pair every store with every distinct SKU, then keep only the pairs that
-- have no matching row in #table1.
;WITH skus AS (
    SELECT DISTINCT sku
    FROM #table1
)
SELECT
    skus.sku      AS SKU,
    stores.StoreId AS StoreID
FROM #table2 AS stores
CROSS JOIN skus
WHERE NOT EXISTS (
    SELECT 1
    FROM #table1 AS existing
    WHERE existing.SKU     = skus.sku
      AND existing.StoreId = stores.StoreId
)
Here is an alternative solution with the same result.
The syntax is very similar to the answer from @BeanFrog:
-- Build every store / SKU combination, left join the combinations that
-- actually exist, and keep the ones where the join found nothing.
;WITH all_skus AS (
    SELECT DISTINCT SKU
    FROM #table1
)
SELECT
    all_skus.SKU,
    stores.StoreId AS StoreID
FROM #table2 AS stores
CROSS JOIN all_skus
LEFT JOIN #table1 AS present
    ON  present.SKU     = all_skus.SKU
    AND present.StoreId = stores.StoreId
WHERE present.sku IS NULL

SQL Server: How to select top rows of a group based on value of the column of that group?

I have two tables like below.
table 1
id rem
1 2
2 1
table 2
id value
1 abc
1 xyz
1 mno
2 mnk
2 mjd
EDIT:
#output
id value
1 abc
1 xyz
2 mnk
What I want to do is select the top 2 rows of table2 with id 1, because the rem value for id 1 is 2, and the top 1 row with id 2, because its rem value is 1, and so on. I am using MS SQL Server 2012. My whole scenario is more complex than this. Please help.
Thank you.
EDIT : I know that i should have given what i have done and how i am doing it but for this particular part i don't have idea for starting. I could do this by using while loop for each unique id but i want to do it in one go if possible.
First, SQL tables represent unordered sets. There is no specification of which values you get, unless you include an order by.
For this purpose, I would go with row_number():
-- Number the table2 rows within each id, then keep the first "rem" rows of
-- each id, where rem comes from table1.
;WITH numbered AS (
    SELECT
        src.*,
        ROW_NUMBER() OVER (PARTITION BY src.id ORDER BY src.id) AS seqnum
    FROM table2 AS src
)
SELECT numbered.*
FROM table1 AS limits
INNER JOIN numbered
    ON numbered.id = limits.id
WHERE numbered.seqnum <= limits.rem;
Note: The order by id in the window clause should be based on which rows you want. If you don't care which rows you get, then order by id or order by (select null) is fine.
Try This:
-- Sample data. Table variables are named with a leading @.
DECLARE @tbl1 TABLE (id INT, rem INT);
INSERT INTO @tbl1 (id, rem)
VALUES (1, 2), (2, 1);

DECLARE @tbl2 TABLE (id INT, value VARCHAR(10));
INSERT INTO @tbl2 (id, value)
VALUES (1, 'abc'), (1, 'xyz'), (1, 'mno'), (2, 'mnk'), (2, 'mjd');

SELECT id, rem   FROM @tbl1; -- your table 1
SELECT id, value FROM @tbl2; -- your table 2

-- Your required output: for each id keep as many rows as its rem value says
-- (rowid <= rem); filtering on rowid = 1 would return one row per id only.
-- ORDER BY T.id inside the partition does not fix which rows count as the
-- "top" ones; order by a real column if that matters.
SELECT A.id, A.value, A.rem
FROM (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY T.id ORDER BY T.id) AS rowid,
        T.id,
        T.value,
        F.rem
    FROM @tbl2 AS T
    INNER JOIN @tbl1 AS F
        ON F.id = T.id
) AS A
WHERE A.rowid <= A.rem;
Hope it helps.

Delete rows from table based on condition if other data exists within the table

I have the following data returned by a query:-
CUSTACCOUNT DIVISION EXTPERSON SALESMAN
C0001729 ECD 5637263283 Ian
C0001729 Fuel 5637369057 Peter
C0001729 Fuel NULL House
C0001729 ECD NULL House
C0001729 BSC 5637263239 Andrew
I would like a way to delete all rows which have null in the EXTPERSON column only if another row has the same DIVISION and EXTPERSON has a value, if they do not then to leave the rows in and not delete... I hope that makes sense. Is there a way to do this?
Is there a simple way of doing this?
Please use below code, assuming #ACCOUNTS has your table data
-- Delete the rows without an EXTPERSON, but only when another row of the same
-- DIVISION does have one.
-- The presence test is IS NOT NULL: ISNULL(B.EXTPERSON, 0) <> 0 would also
-- treat a stored 0 as missing and forces a comparison against the integer 0.
-- NOTE(review): if EXTPERSON is a character column, that comparison converts
-- values such as '5637263283' to int, which overflows - confirm the column type.
DELETE A
FROM #ACCOUNTS AS A
WHERE A.EXTPERSON IS NULL
  AND EXISTS (
        SELECT 1
        FROM #ACCOUNTS AS B
        WHERE B.DIVISION = A.DIVISION
          AND B.EXTPERSON IS NOT NULL
      )
For such tasks, I usually use a left join and check in the WHERE clause whether a link exists.
For example, for a table structure such as:
`a`(`id`, `type`, `val`)
If i want to "select only lines, ID included, that already have the same type than another"...
-- Rows of `a` for which a different row with the same `type` exists
-- (one output row per matching pair).
select a1.*
from `a` a1
inner join `a` a2
    on  a2.`type` = a1.`type`
    and a2.`id` != a1.`id`;
Such a task may also be performed with a subquery using group by, count, having... But subqueries are. Erm. Subqueries.
-- Delete the rows with no EXTPERSON when another row of the same DIVISION has
-- one. Changes from the first draft:
--   * T-SQL needs "DELETE alias FROM table AS alias" to use an alias;
--   * the partner row must be in the SAME division (=, not <>);
--   * the partner row must HAVE a value (IS NOT NULL);
--   * the question describes a single table, so the subquery reads TABLE1 too.
DELETE T1
FROM TABLE1 AS T1
WHERE T1.EXTPERSON IS NULL
  AND EXISTS (
        SELECT 1
        FROM TABLE1 AS T2
        WHERE T2.DIVISION = T1.DIVISION
          AND T2.EXTPERSON IS NOT NULL
      )
Sample data setup:
-- Sample rows from the question plus one division ('SomeOther') that only has
-- a row without EXTPERSON.
CREATE TABLE #data
(
    CUSTACCOUNT VARCHAR(50),
    DIVISION    VARCHAR(50),
    EXTPERSON   VARCHAR(50),
    SALESMAN    VARCHAR(50)
)

INSERT INTO #data (CUSTACCOUNT, DIVISION, EXTPERSON, SALESMAN)
VALUES
    ('C0001729', 'ECD',       '5637263283', 'Ian'),
    ('C0001729', 'Fuel',      '5637369057', 'Peter'),
    ('C0001729', 'Fuel',      NULL,         'House'),
    ('C0001729', 'ECD',       NULL,         'House'),
    ('C0001729', 'BSC',       '5637263239', 'Andrew'),
    ('C0001729', 'SomeOther', NULL,         'Name')
Delete query:
-- cntExt is the number of rows in the division that have an EXTPERSON
-- (COUNT(column) skips NULLs). Delete the rows without an EXTPERSON in every
-- division where at least one row has one.
;WITH division_rows AS (
    SELECT
        d.EXTPERSON,
        COUNT(d.EXTPERSON) OVER (PARTITION BY d.DIVISION) AS cntExt
    FROM #data AS d
)
DELETE FROM division_rows
WHERE EXTPERSON IS NULL
  AND cntExt > 0
Check table data:
-- Show what is left after the delete.
SELECT CUSTACCOUNT, DIVISION, EXTPERSON, SALESMAN
FROM #data
Output:
CUSTACCOUNT DIVISION EXTPERSON SALESMAN
------------ ----------- ----------- ----------
C0001729 ECD 5637263283 Ian
C0001729 Fuel 5637369057 Peter
C0001729 BSC 5637263239 Andrew
C0001729 SomeOther NULL Name
you can use following query:
-- Remove rows without an EXTPERSON when their DIVISION also has a row with one.
DELETE t
FROM tbl AS t
WHERE t.EXTPERSON IS NULL
  AND EXISTS (
        SELECT 1
        FROM tbl AS filled
        WHERE filled.DIVISION = t.DIVISION
          AND filled.EXTPERSON IS NOT NULL
      )
Explanation:
We delete the rows whose EXTPERSON is NULL, and in the EXISTS part of the WHERE clause we check for other rows which have the same DIVISION but have a value in EXTPERSON.
Test scripts:
-- Build the sample table.
CREATE TABLE tbl
(
    CUSTACCOUNT NVARCHAR(20),
    DIVISION    NVARCHAR(20),
    EXTPERSON   NVARCHAR(20),
    SALESMAN    NVARCHAR(20)
)

INSERT INTO tbl (CUSTACCOUNT, DIVISION, EXTPERSON, SALESMAN)
VALUES
    ('C0001729', 'ECD',  '5637263283', 'Ian'),
    ('C0001729', 'Fuel', '5637369057', 'Peter'),
    ('C0001729', 'Fuel', NULL,         'House'),
    ('C0001729', 'ECD',  NULL,         'House'),
    ('C0001729', 'BSC',  '5637263239', 'Andrew')

-- Before.
SELECT * FROM tbl

-- Run the delete inside a transaction that is rolled back, so the sample
-- data is left untouched.
BEGIN TRANSACTION

DELETE t
FROM tbl AS t
WHERE t.EXTPERSON IS NULL
  AND EXISTS (
        SELECT 1
        FROM tbl AS filled
        WHERE filled.DIVISION = t.DIVISION
          AND filled.EXTPERSON IS NOT NULL
      )

-- After (inside the transaction).
SELECT * FROM tbl

ROLLBACK TRANSACTION

DROP TABLE tbl
Result:
-- Self-join delete: remove rows without an EXTPERSON that share a DIVISION
-- with a row that has one.
-- TABLE is a reserved keyword, so the placeholder table name is bracketed;
-- substitute your real table name. The join condition stays in ON and the
-- row filters move to WHERE.
delete delTable
from [table] as delTable
inner join [table] as hasValue
    on hasValue.DIVISION = delTable.DIVISION
where delTable.EXTPERSON is null
  and hasValue.EXTPERSON is not null

How to select records where all of their statuses are zero?

Suppose I have following tables
Person table and personStatus table.
-- Sample data for the queries that read #Persons / #PersonStatus.
-- A #-prefixed name is a temp table and must be created with CREATE TABLE;
-- DECLARE ... TABLE only accepts @-prefixed table variables.
IF OBJECT_ID('tempdb..#Persons') IS NOT NULL
    DROP TABLE #Persons;
IF OBJECT_ID('tempdb..#PersonStatus') IS NOT NULL
    DROP TABLE #PersonStatus;

CREATE TABLE #Persons
(
    PersonId int
);

INSERT INTO #Persons (PersonId)
VALUES (10), (11);

CREATE TABLE #PersonStatus
(
    id      int,
    statuss int
);

-- Person 10 has only zero statuses; person 11 has a 1 and a 0.
INSERT INTO #PersonStatus (id, statuss)
VALUES
    (10, 0),
    (10, 0),
    (11, 1),
    (10, 0),
    (11, 0);
Now I want to find person IDs that all of their statuses are zero
result is just ---> 10
How to do it?
-- Ids whose statuses take exactly one distinct value, and that value is 0.
SELECT ps.id
FROM #PersonStatus AS ps
GROUP BY ps.id
HAVING MAX(ps.statuss) = 0
   AND COUNT(DISTINCT ps.statuss) = 1
SQLFiddle Demo
OR
-- Ids whose smallest and largest status are both 0.
SELECT ps.id
FROM #PersonStatus AS ps
GROUP BY ps.id
HAVING MIN(ps.statuss) = 0
   AND MAX(ps.statuss) = 0
SQLFiddle Demo
Since I assume the #Persons table has more than just a PersonId column and you might want other columns from there, I think #Persons needs to be part of the query.
-- Persons that have status rows and whose statuses are all 0.
-- Both bounds are checked: MAX = 0 alone would also accept a person with
-- negative statuses, because it only rules out values above 0.
SELECT p.PersonId --, other columns from p
FROM #Persons AS p
WHERE EXISTS (
    SELECT 1
    FROM #PersonStatus AS s
    WHERE s.id = p.PersonId
    GROUP BY s.id
    HAVING MIN(s.statuss) = 0
       AND MAX(s.statuss) = 0
);
-- Ids that have a zero status and no status row with a non-zero value.
SELECT zero_rows.id
FROM #PersonStatus AS zero_rows
WHERE zero_rows.Statuss = 0
  AND NOT EXISTS (
        SELECT NULL
        FROM #Personstatus AS other
        WHERE other.id = zero_rows.id
          AND other.statuss <> 0
      )
GROUP BY zero_rows.id

Resources