Removing duplicate rows while also updating relations

Removing duplicate rows while also updating relations - sql-server

My data is set up as follows:
-- First of the two tables linked through TableA_TableB.
CREATE TABLE TableA
(
    id          INT IDENTITY,
    name        VARCHAR(256),
    description VARCHAR(256)
);

-- Second linked table; unique constraint on name, description.
CREATE TABLE TableB
(
    id          INT IDENTITY,
    name        VARCHAR(256),
    description VARCHAR(256)
);

-- Link table; composite key referencing TableA and TableB.
CREATE TABLE TableA_TableB
(
    idA INT,
    idB INT
);
The situation is that I have many duplicate records in TableB that violate the unique constraint, and those duplicate records are referenced in TableA_TableB. So I'm trying to remove those records, which is simple enough (using the following CTE), but what would be the best way to update the records in TableA_TableB to reflect this change, i.e., have the TableA_TableB records reference the same ID in TableB as opposed to different IDs for each of the duplicates?
-- Delete the duplicate TableB rows, keeping the lowest id of every
-- ([Name], [Description]) group. RN = 1 is the row that survives, so only
-- rows numbered above 1 are removed; ordering by id makes the survivor deterministic.
;WITH cte
AS (
    SELECT ROW_NUMBER() OVER (PARTITION BY [Name], [Description]
                              ORDER BY id) AS RN
    FROM TableB
)
DELETE FROM cte
WHERE RN > 1;

Note: changed b.RowNum=1 to b.RowNum>1
First, you should try with ROLLBACK and then, if it's OK, uncomment COMMIT (this script wasn't tested):
-- Collapses duplicate TableB rows (same [Name], [Description]) onto the lowest ID of
-- each group and repoints TableA_TableB at the surviving row.
-- The transaction is rolled back by default: check the outcome first, then swap
-- ROLLBACK for COMMIT.
SET XACT_ABORT ON; -- a runtime error aborts and rolls back the whole transaction

-- Old ID -> surviving ID, one row per duplicate TableB row.
DECLARE @DuplicateMap TABLE (OldID INT PRIMARY KEY, NewID INT NOT NULL);

BEGIN TRANSACTION;

-- Ordering by ID makes RowNum = 1 the same row as MIN(ID), so the surviving row can
-- never end up in the "RowNum > 1" set.
WITH Base
AS (
    SELECT ROW_NUMBER() OVER (PARTITION BY [Name], [Description] ORDER BY ID) AS RowNum,
           MIN(ID) OVER (PARTITION BY [Name], [Description]) AS NewID,
           ID -- Old ID
    FROM TableB
)
INSERT INTO @DuplicateMap (OldID, NewID)
SELECT b.ID, b.NewID
FROM Base AS b
WHERE b.RowNum > 1;

-- Several link rows of the same idA may end up on the same surviving idB, which the
-- composite key does not allow. Keep one row per (idA, final idB), preferring a row
-- that already points at the survivor, and drop the rest before repointing.
WITH Ranked
AS (
    SELECT t.idA,
           t.idB,
           ROW_NUMBER() OVER (PARTITION BY t.idA, COALESCE(m.NewID, t.idB)
                              ORDER BY CASE WHEN m.OldID IS NULL THEN 0 ELSE 1 END, t.idB) AS KeepRank
    FROM TableA_TableB AS t
    LEFT JOIN @DuplicateMap AS m ON t.idB = m.OldID
)
DELETE target
FROM TableA_TableB AS target
INNER JOIN Ranked AS r ON r.idA = target.idA AND r.idB = target.idB
WHERE r.KeepRank > 1;

-- Repoint the remaining link rows at the surviving TableB row.
UPDATE target
SET idB = m.NewID
FROM TableA_TableB AS target
INNER JOIN @DuplicateMap AS m ON target.idB = m.OldID;

-- Remove every duplicate, whether or not a link row referenced it.
DELETE b
FROM TableB AS b
INNER JOIN @DuplicateMap AS m ON b.ID = m.OldID;

ROLLBACK;
-- COMMIT;

Related

Filling the ID column of a table NOT using a cursor

Tables have been created and used without an ID column, but an ID column is now needed. (classic)
I heard everything could be done without cursors. I just need every row to contain a different int value so I was looking for some kind of row number function :
How do I use ROW_NUMBER()?
I can't tell exactly how to use it, even with these examples.
-- Attempted fill of [id]: the subquery yields one ROW_NUMBER() value per row of
-- [TableA], while SET expects a single scalar value.
UPDATE [TableA]
SET [id] = (
        SELECT ROW_NUMBER() OVER (ORDER BY id)
        FROM [TableA]
    )
Subquery returned more than 1 value.
So... yes, of course it returns more than one value. Then how do I combine UPDATE and ROW_NUMBER() to get that column filled?
PS. I don't need a precise order, just unique values. I also wonder if ROW_NUMBER() is appropriate in this situation...

You can use a CTE for the update
Example
-- Demo: fill the ID column of a table variable through an updatable CTE.
DECLARE @TableA TABLE (ID INT, SomeCol VARCHAR(50));

INSERT INTO @TableA (ID, SomeCol)
VALUES (NULL, 'Dog'),
       (NULL, 'Cat'),
       (NULL, 'Monkey');

-- ORDER BY (SELECT NULL) requests no particular order: the numbers are unique,
-- but which row receives which number is up to the engine.
WITH cte AS (
    SELECT ID,
           RN = ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
    FROM @TableA
)
UPDATE cte
SET ID = RN;

SELECT ID, SomeCol
FROM @TableA;
Updated Table
ID SomeCol
1 Dog
2 Cat
3 Monkey

You can use a subquery too as
-- Demo: fill the ID column by joining the table variable to a numbered copy of itself.
DECLARE @TableA TABLE (ID INT, SomeCol VARCHAR(50));

INSERT INTO @TableA (ID, SomeCol)
VALUES (NULL, 'Dog'),
       (NULL, 'Cat'),
       (NULL, 'Monkey');

-- The numbered copy is matched back on SomeCol, so the IDs come out unique only
-- while the SomeCol values themselves are unique.
UPDATE T1
SET T1.ID = T2.RN
FROM @TableA AS T1
INNER JOIN (
    SELECT ROW_NUMBER() OVER (ORDER BY (SELECT 1)) AS RN,
           SomeCol
    FROM @TableA
) AS T2
    ON T1.SomeCol = T2.SomeCol;

SELECT ID, SomeCol
FROM @TableA;

When inserting a row, can the `ParentId` be set as the `Id` from a previous inserted row within the same query?

Suppose I have a table variable called @tblTemp like this:
-- Table variable for the rows read from the XML; Id is generated by IDENTITY and
-- ParentId is nullable.
DECLARE @tblTemp TABLE
(
    Id INT NOT NULL IDENTITY,
    Name VARCHAR(MAX) NOT NULL,
    ParentId INT NULL
)
and my XML structure (assigned to @Xml) was:
<Data>
<MyRow Name="I am the Parent"/>
<MyRow Name="I am the child" ParentName="I am the Parent"/>
</Data>
Question: would it be possible to insert into the ParentId column within the same query?
SQL Script
-- Single-statement attempt: each <MyRow> becomes a row, and ParentId is looked up by
-- name in @tblTemp itself. The lookup finds nothing for rows added by this same
-- statement, so ParentId comes out NULL.
INSERT INTO @tblTemp ([Name], [ParentId])
SELECT
    Rw.value('@Name', 'VARCHAR(MAX)'), -- Name
    (SELECT [Id] -- Select ID From Parent Name
     FROM @tblTemp AS [TT]
     WHERE [TT].[Name] = Rw.value('@ParentName', 'VARCHAR(MAX)'))
FROM
    @Xml.nodes('Data/MyRow') AS Data(Rw)

SELECT *
FROM @tblTemp AS [TT]
The script inserts NULL into the ParentId column as I suspect the previous inserts haven't been committed yet so the table will be empty.
Alternative: if it isn't possible to insert into the ParentId column within the same query, then my alternative would be to do the insert then update the table where required.

Try it like this:
-- Loads a parent/child hierarchy from XML in one INSERT. Ids are assigned with
-- ROW_NUMBER() instead of IDENTITY so a child can refer to its parent's id within
-- the same statement.
DECLARE @tblTemp TABLE
(
    Id INT NOT NULL,
    Name VARCHAR(MAX) NOT NULL,
    ParentId INT NULL
);

DECLARE @xml XML =
N'<Data>
<MyRow Name="I am the Parent"/>
<MyRow Name="I am the child" ParentName="I am the Parent"/>
<MyRow Name="another child" ParentName="I am the Parent"/>
<MyRow Name="baby" ParentName="I am the child"/>
</Data>';

-- DerivedTable: one row per <MyRow>, numbered to serve as the new Id.
WITH DerivedTable AS
(
    SELECT r.value(N'@Name', N'nvarchar(max)') AS [Name]
          ,r.value(N'@ParentName', N'nvarchar(max)') AS [ParentName]
          ,ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS RowNmbr
    FROM @xml.nodes(N'/Data/MyRow') AS A(r)
)
-- recCTE: starts at the rows without a parent and walks down, carrying the parent's
-- row number along as ParentRowNmbr.
,recCTE AS
(
    SELECT 1 AS Lvl
          ,[Name]
          ,[ParentName]
          ,RowNmbr
          ,CAST(NULL AS BIGINT) AS ParentRowNmbr
          ,CAST(N'' AS NVARCHAR(MAX)) AS [ParentPath]
    FROM DerivedTable
    WHERE ParentName IS NULL
    UNION ALL
    SELECT r.Lvl + 1
          ,t.[Name]
          ,t.[ParentName]
          ,t.RowNmbr
          ,r.RowNmbr
          ,r.[ParentPath] + t.[ParentName] + N'|'
    FROM DerivedTable AS t
    INNER JOIN recCTE AS r ON r.[Name] = t.[ParentName]
)
--Use this SELECT to see all columns returned by the recursive CTE
--SELECT * FROM recCTE
INSERT INTO @tblTemp (ID, [Name], ParentId)
SELECT RowNmbr, [Name], ParentRowNmbr
FROM recCTE;

SELECT * FROM @tblTemp;
The result
Id Name ParentId
1 I am the Parent NULL
2 I am the child 1
3 another child 1
4 baby 2
Short explanation:
The first CTE will read the values as derived table and use ROW_NUMBER() to give a running number to each row as ID.
The second CTE is recursively travelling down the road.
The result can be inserted directly into your table.
Attention
I changed your table from ID is IDENTITY to a normal INT column. You can use SELECT MAX(ID) first to get the highest existing ID and add this to ROW_NUMBER() in the first CTE. Otherwise it might happen, that the IDs given by ROW_NUMBER() are not the same as the ID given by IDENTITY.

How can I delete the duplicate rows while keeping the original record?

I have a table, shown in the picture below, that contains some duplicated rows. I can find the duplicated rows, but I am not able to delete them because there is no unique ID that I can use to distinguish them. There are lots of duplicated rows like that in the same table; the screenshot shows just a part of it.
So, based on the picture below, how can I delete the duplicated rows but keep the original?

One solution you could consider is copying all unique records into a temporary table, thus removing the duplicates. You could then truncate the original table and re-populate it from the temporary table you've created. The code would be something like this:
-- Rebuild MyTable from its distinct rows. The three steps run in one transaction so
-- that a failure after the TRUNCATE rolls the table back instead of leaving it empty.
-- The "..." in the column lists stands for the remaining columns of MyTable.
SET XACT_ABORT ON;
BEGIN TRANSACTION;

SELECT DISTINCT * INTO #tempTable FROM MyTable;

TRUNCATE TABLE MyTable;

INSERT INTO MyTable (LocationID, UnitID, CameraID ... IsActiveHours)
SELECT LocationID, UnitID, CameraID ... IsActiveHours FROM #tempTable;

COMMIT TRANSACTION;
This isn't always an option due to key constraints and amount of data, but useful in certain cases. Take it as you may.

You could use a cte and Row_Number() to accomplish this. If you are satisfied with the results, remove the final select and un-comment the delete statement
-- Numbers the rows of each LocationId by Date_T and lists everything after the first.
-- Once the listing looks right, run the commented DELETE in place of the SELECT.
;WITH NumberedRows AS (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY LocationId ORDER BY Date_T) AS RowNr
    FROM YourTable
)
SELECT *
FROM NumberedRows
WHERE RowNr > 1
-- DELETE FROM NumberedRows WHERE RowNr > 1

You would be best adding an identity column to make things easier however this can be done without a TRUNCATE using the following:
--GET DUPLICATE ROWS INTO A TEMP TABLE (YOU MAY NOT NEED TO USE ALL THE COLUMNS TO IDENTIFY A DUPLICATE)
SELECT ROW_NUMBER() OVER (ORDER BY ColA) AS RowNo, ColA, ColB, ColC, COUNT(*) As [Count]
INTO #TEMP1
FROM test
GROUP BY ColA, ColB, ColC
HAVING COUNT(*) > 1
--LOOP THROUGH DUPLICATES (ONE ITERATION PER DUPLICATED ColA/ColB/ColC COMBINATION)
DECLARE @RowNo INT
DECLARE @Duplicates INT
SET @RowNo = 1
WHILE EXISTS(SELECT * FROM #TEMP1)
BEGIN
--GET A COUNT OF ADDITIONAL ROWS FOR THIS DUPLICATE (ALL BUT THE ONE ROW TO KEEP)
SET @Duplicates = (SELECT [Count] FROM #TEMP1 WHERE RowNo = @RowNo) - 1
--DELETE THE ROWS WE DONT NEED
DELETE TOP (@Duplicates) t1
FROM test t1
JOIN #TEMP1 t2 ON t1.ColA = t2.ColA AND t1.ColB = t2.ColB AND t1.ColC = t2.ColC
WHERE t2.RowNo = @RowNo
--REMOVE THE ROW FROM THE TEMP TABLE
DELETE FROM #TEMP1 WHERE RowNo = @RowNo
--INCREASE THE ROW NO TO MOVE TO THE NEXT ROW
SET @RowNo = @RowNo + 1
END
--DROP THE TEMP TABLE
DROP TABLE #TEMP1

This is the query that fixes this issue.
-- Numbers the rows of every (LocationId, date_t) pair and deletes all but the first.
WITH X AS (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY LocationId, date_t ORDER BY LocationId DESC) AS [rownum],
        LocationId,
        date_T AS T
    FROM Counts
)
--SELECT * FROM X WHERE rownum > 1
DELETE FROM X
WHERE rownum > 1

SQL Server: Insert Into 2 Tables in one query

I have seen a few questions similar to this but none gave me the answer I was looking for.
So here is the example
[Table A]:
ID pk/auto-increment
Name
Age
...
[Table B]:
ID pk/auto-increment
FK_A_ID fk
Comment
I have an import of data that contains over 700 rows (and growing)
[Table Import]
Name / Age / ... / Comment
Is it possible to use a query similar to:
-- Hypothetical syntax the question asks about; this is not valid T-SQL, because an
-- INSERT statement targets a single table.
INSERT INTO [TABLE A] (Name, Age, ...), [Table B] (FK_A_ID, Comments)
SELECT
Name, Age, ..., @@IDENTITY, Comment
FROM
[TABLE Import]
Or a shorter question: is it possible to insert into 2 tables in the same query, referencing the first insert? - when I write it out like that it seems unlikely.
Thanks

You can't. But you can use transaction, like this:
-- Two inserts in one transaction. SQL Server opens a transaction with BEGIN TRANSACTION.
-- The "..." stands for the remaining columns / join conditions of the real tables.
BEGIN TRANSACTION;
INSERT INTO tableA (Name, Age, ...)
SELECT Name, Age, ... FROM tableImport;
-- Match the freshly inserted tableA rows back to the import rows to pick up their IDs.
INSERT INTO tableB (FK_A_ID, Comment)
SELECT A.ID, I.Comment
FROM tableA A INNER JOIN tableImport I
ON A.Name = I.Name AND A.Age = I.Age AND ...;-- (if columns not unique)
COMMIT;

I think you can do it with some temporary tables, and the row_number feature, then perform separate inserts in to TABLE A and TABLE B from the temporary table
UNTESTED
-- Import rows: each one supplies a row for table a and a comment for table b.
create table source
(
Name varchar(50),
age int,
comment varchar(100)
)
go
insert into source
(name, age, comment)
values
('adam',12,'something'),
('steve',12,'everything'),
('paul',12,'nothing'),
('john',12,'maybe')
-- Target a: rowid carries the staging row number so the generated id can be looked up later.
create table a
(
id int identity(1,1) not null,
name varchar(50),
age int,
rowid int
)
go
-- Target b: comment is as wide as source.comment so imported comments fit.
create table b
(
id int identity(1,1) not null,
comment varchar(100),
fkid int not null
)
go
-- Staging copy of source with a row number per row. A temp table is used because it
-- stays available across the "go" batch separators below.
create table #tempa
(
RowID int,
Name varchar(50),
age int,
comment varchar(100)
)
go
insert into #tempa
(rowid, name, age, comment)
SELECT ROW_NUMBER() OVER(ORDER BY name DESC) AS RowId,
name, age, comment
FROM source
go
insert into a
(name, age, rowid)
select name, age, rowid
from #tempa
-- Join staging to a on rowid to pick up the identity value generated for each row.
insert into b
(comment, fkid)
select t.comment,
a.id as fkid
from #tempa t inner join a a
on t.rowid = a.rowid

In my honest opinion, the best way to do this is create a stored procedure and rollback in case of failure. If you do so, you don't need a transaction because until you supply the "COMMIT" command nothing will be inserted.

SQL Server Another simple question

I have 2 temp Tables [Description] and [Institution], I want to have these two in one table.
They are both tables that look like this:
Table1; #T1
|Description|
blabla
blahblah
blagblag
Table2; #T2
|Institution|
Inst1
Inst2
Inst3
I want to get it like this:
Table3; #T3
|Description| |Institution|
blabla Inst1
blahblah Inst2
blagblag Inst3
They are already in sort order.
I just need to get them next to each other..
Last time I asked was something almost the same.
I used this query
-- Pairs every #T1.[From] with the smallest #T2.[To] that is later than it.
CREATE TABLE #T3
(
    [From] DATETIME,
    [To]   DATETIME
);

INSERT INTO #T3 ([From], [To])
SELECT starts.[From],
       MIN(ends.[To])
FROM #T1 AS starts
INNER JOIN #T2 AS ends
    ON starts.[From] < ends.[To]
GROUP BY starts.[From];

SELECT *
FROM #T3;
It did work for the date values, but it won't work here ? :s
Thank you.

One thing that concerns me is that you say that the values "are already in sort order". There really is no default sort order -- if you don't specify a sort order, you are at the mercy of SQL Server to determine the order in which the data is returned. The solution below assumes that there is some way to sort the data such that the records "match up" (using the ORDER BY clauses).
Hope this helps,
John
-- Table 1 test data
Create Table #T1
(
[Description] nvarchar(30)
)
INSERT INTO #T1 ([Description]) VALUES ('desc1')
INSERT INTO #T1 ([Description]) VALUES ('desc2')
INSERT INTO #T1 ([Description]) VALUES ('desc3')
-- Table 2 test data
Create Table #T2
(
[Institution] nvarchar(30)
)
INSERT INTO #T2 (Institution) VALUES ('Inst1')
INSERT INTO #T2 (Institution) VALUES ('Inst2')
INSERT INTO #T2 (Institution) VALUES ('Inst3')
-- Create table 3
Create Table #T3
(
[Description] nvarchar(30),
[Institution] nvarchar(30)
);
-- Use CTE2 to add row numbers to the data; use the row numbers to join the tables
-- you must specify the sort order for the data in the tables
WITH CTE1 (Description, RowNum) AS
(
SELECT [Description], ROW_NUMBER() OVER(ORDER BY [Description]) as RowNum
FROM #T1
),
CTE2 (Institution, RowNum) AS
(
SELECT Institution, ROW_NUMBER() OVER(ORDER BY Institution) as RowNum
FROM #T2
)
INSERT INTO #T3
SELECT CTE1.Description, CTE2.Institution
FROM CTE1
LEFT JOIN CTE2 ON CTE1.RowNum = CTE2.RowNum
Select * from #T3

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

Removing duplicate rows while also updating relations - sql-server

Related

Filling the ID column of a table NOT using a cursor

When inserting a row, can the `ParentId` be set as the `Id` from a previous inserted row within the same query?

How can I delete the duplicate rows while keeping the original record?

SQL Server: Insert Into 2 Tables in one query

SQL Server Another simple question

Categories

Resources