SQL select from table with an update subquery - sql-server

I'm trying to update a table and at the same time (in the same transaction) select from the table and perform a count on one of its columns.
See below for example code. I want to merge the last two queries (the UPDATE and the SELECT) into one query somehow.
-- Demo schema: one row per child process, linked to its parent by ParentId.
CREATE TABLE ChildProcesses
(
    Id                INT IDENTITY(1,1) PRIMARY KEY,
    ParentId          INT,
    ProcessIsFinished BIT
);

-- Seed four unfinished child processes for parent 1234.
INSERT INTO ChildProcesses (ParentId, ProcessIsFinished)
VALUES
    (1234, 0),
    (1234, 0),
    (1234, 0),
    (1234, 0);

-- Step 1: mark child 4 as finished.
UPDATE ChildProcesses
SET ProcessIsFinished = 1
WHERE Id = 4;

-- Step 2: count the children of the same parent that are still running.
SELECT COUNT(*) AS CountProcessesStillRunning
FROM ChildProcesses
WHERE ProcessIsFinished = 0
  AND ParentId = 1234;
The reason for all of this is, I have a distributed process where my code is running in many places. This code performs processing on my "ChildProcesses" and when they're done they set the "ChildProcesses.ProcessIsFinished" to "1". When all of the child processes have finished I need to perform some cleanup steps, but only when ALL of the child processes have finished and I only want to perform the cleanup steps once. So I'm trying to find a way to both "mark the process as finished" and at the same time "check to see if this was the last child process to finish". If this is the last child process to finish then I'll have that child process perform the cleanup steps.
I thought to have the UPDATE query as a subquery to the SELECT, but I get errors saying that's not allowed. Here's what I tried that's not working:
-- Attempted single-statement form: an UPDATE ... OUTPUT used as a scalar subquery.
-- The question reports that SQL Server rejects this with a "not allowed" error.
SELECT
COUNT(*) AS CountProcessesStillRunning
FROM
ChildProcesses
WHERE
ParentId = (UPDATE ChildProcesses SET ProcessIsFinished = 1 OUTPUT INSERTED.ParentId WHERE Id = 4) AND
ProcessIsFinished = 0;
Any help would be appreciated.

Took inspiration from @CharlieFace on this one.
I was originally trying to SELECT with a subquery having UPDATE...OUTPUT, which isn't allowed.
However, I can UPDATE...OUTPUT...INTO a temporary table. So I decided to UPDATE all ChildProcesses but use a CASE to just update my particular ChildId, and then output all of those modified entries into a temporary table. From this temporary table I could then compute whether all child processes had completed or whether there were still running child processes.
Here's the new query.
-- Mark one child process as finished and, in the same UPDATE, capture the
-- post-update state of every child of the parent so the number of
-- still-running children can be counted from the captured rows.
DECLARE @ParentId INT = 1234;
DECLARE @ChildId INT = 1;
DECLARE @Output TABLE (Id INT, ParentId INT, ProcessIsFinished BIT);

UPDATE
    ChildProcesses
SET
    ProcessIsFinished =
        CASE Id
            WHEN @ChildId THEN 1        -- update only 'this' child process
            ELSE ProcessIsFinished      -- leave the other entries alone
        END
OUTPUT
    -- explicit column list so the mapping into @Output does not depend on
    -- the column order of ChildProcesses
    INSERTED.Id,
    INSERTED.ParentId,
    INSERTED.ProcessIsFinished
INTO
    @Output (Id, ParentId, ProcessIsFinished)
WHERE
    ParentId = @ParentId;

SELECT
    COUNT(*) AS StillHasRunningChildProcesses
FROM
    @Output
WHERE
    ProcessIsFinished = 0;

You cannot nest UPDATE...OUTPUT except inside an INSERT...SELECT: you are not allowed a bare SELECT, and you are not allowed to group.
Instead you can use a table variable
-- Capture the ParentId of the row being finished, then count that parent's
-- children which are still running.
DECLARE @output TABLE (ParentId int);

UPDATE ChildProcesses
SET ProcessIsFinished = 1
OUTPUT inserted.ParentId
INTO @output (ParentId)
WHERE Id = 4;

SELECT
    COUNT(*) AS CountProcessesStillRunning
FROM
    ChildProcesses cp
JOIN
    @output u ON u.ParentId = cp.ParentId
WHERE
    cp.ProcessIsFinished = 0;
To avoid race conditions, you might want a transaction. To ensure you lock the ParentId rows that you are going to read afterwards, use a SELECT with an UPDLOCK and a SERIALIZABLE hint:
-- Mark child 4 as finished and count its still-running siblings inside one
-- transaction.
SET XACT_ABORT, NOCOUNT ON;
BEGIN TRAN;

-- The count assigned to @dummy is never read; this SELECT exists only to
-- apply the UPDLOCK/SERIALIZABLE hints to every row sharing child 4's ParentId.
DECLARE @dummy int;
SELECT
    @dummy = COUNT(*)
FROM
    ChildProcesses cp
    JOIN ChildProcesses cp2 WITH (UPDLOCK, SERIALIZABLE)
        ON cp2.ParentId = cp.ParentId
WHERE
    cp.Id = 4;

DECLARE @output TABLE (ParentId int);

UPDATE ChildProcesses WITH (SERIALIZABLE)
SET ProcessIsFinished = 1
OUTPUT inserted.ParentId
INTO @output (ParentId)
WHERE Id = 4;

SELECT
    COUNT(*) AS CountProcessesStillRunning
FROM
    ChildProcesses cp
JOIN
    @output u ON u.ParentId = cp.ParentId
WHERE
    cp.ProcessIsFinished = 0;

COMMIT;
If ParentId is actually the primary key for this table, and you just want the number of rows affected, then you can just use @@ROWCOUNT. It doesn't seem that you actually want this logic, but you should get the idea.
-- Finish child 4 only if it is not finished yet, then report how many rows
-- that UPDATE changed. @@ROWCOUNT must be read immediately after the UPDATE.
UPDATE ChildProcesses
SET ProcessIsFinished = 1
WHERE Id = 4
  AND ProcessIsFinished = 0;

SELECT
    @@ROWCOUNT AS CountProcessesStillRunning;

Related

Skipping already updated row

I would like to update a table column with a counter stored in a another table that is incremented on each update. All is working fine. However, if I reset the counter to a new number that happens to already exist in the table to update, for example, 7; during the update, it will find the 7 we updated in the first loop and update it along with other values of 7.
Is there a way to skip already updated values and only update those that are not? I am contemplating adding a column with a flag to track already updated rows; which I delete after the update. However, I feel there could be a better way of doing this. Any ideas?
This is what I currently have:
/**counter table**/
create table ctl (cid int)
insert into ctl values (1)
/**table to update**/
create table tbl1(tid int)
insert into tbl1 (tid)
values(1),(1),(1),(1),(2),(3),(3),(3),(3),(3),(4),(5),(6),(7),(7)
/**temp table**/
-- #tmptbl is a real temp table holding the tid values still to be processed
select tid into #tmptbl from tbl1
declare @tidNum int
declare @cctl int
select @cctl = (select cid from ctl)
-- Walk the distinct tid values in ascending order, replacing each with the
-- next counter value. The update matches on the current tid value in tbl1, so
-- rows already rewritten to that value in an earlier iteration are matched
-- again; this is the behaviour the question asks about.
while exists (select tid from #tmptbl)
begin
    select @tidNum = (select top 1 tid from #tmptbl order by tid asc)
    update tbl1 set tid=@cctl where tid=@tidNum
    select @cctl=@cctl+1
    update ctl set cid=@cctl
    delete #tmptbl where tid=@tidNum
end
select * from tbl1
drop table #tmptbl
/**counter table**/
create table ctl (cid int)
insert into ctl values (1)
/**table to update**/
create table tbl1(tid int)
insert into tbl1 (tid)
values(1),(1),(1),(1),(2),(3),(3),(3),(3),(3),(4),(5),(6),(7),(7)
go
-- First run: reserve one counter value per distinct tid, then renumber all
-- rows in a single set-based update.
begin transaction
declare @foo table(cid int);
declare @base int;
-- Advance the counter by the number of distinct tids and capture its
-- previous value (deleted.cid) as the starting number.
update ctl
set cid = cid + (select count(distinct tid) from tbl1)
output deleted.cid into @foo(cid);
select @base = cid
from @foo;
-- dense_rank() - 1 gives each distinct tid an offset 0, 1, 2, ... from @base.
update t
set tid = addme + thebase
from
(
    select tid, dense_rank() over(order by tid)-1 as addme, @base as thebase
    from tbl1
) as t
--error handling goes here
commit transaction
go
select *
from tbl1;
-- Reset the counter to a value that already exists in tbl1 and run again.
update ctl
set cid = 4;
go
begin transaction
declare @foo table(cid int)
declare @base int
update ctl
set cid = cid + (select count(distinct tid) from tbl1)
output deleted.cid into @foo(cid);
select @base = cid
from @foo
update t
set tid = addme + thebase
from
(
    select tid, dense_rank() over(order by tid)-1 as addme, @base as thebase
    from tbl1
) as t
--error handling goes here
commit transaction
go
select *
from tbl1;
go

Update in Merge behaves different? It doesn't get the context_info() while Insert does

I created the following two test tables with a trigger to log all the action (Insert, Delete and Update).
Set up tables and trigger:
-- drop table test; drop table testLog
create table test (id int identity primary key, x int);
create table testLog (idx int identity primary key, Action varchar(10), id int not null,
    x_deleted int, x_inserted int, uid uniqueidentifier);
go
-- Trigger to log the changes
-- Writes one testLog row per affected row of test, recording the action, the
-- old/new x values and the session's context_info() value as uid.
create trigger trigger_test on test
after insert, delete, update
as
declare @id uniqueidentifier = context_info();
print @id;
insert testLog (id, Action, x_deleted, x_inserted, uid)
select isnull(d.id, i.id) ,
    -- a row present in both pseudo-tables was updated
    case when i.id is not null and d.id is not null then 'Updated'
         when d.id is not null then 'Deleted'
         when i.id is not null then 'Inserted'
    end ,
    d.x ,
    i.x ,
    @id
from Deleted d
full outer join inserted i on i.id = d.id;
-- Clears the session's context_info after every firing of the trigger.
set context_info 0;
go
Now insert some sample data
-- Clear the session context, load five sample rows, then show both tables.
SET CONTEXT_INFO 0
INSERT INTO test (x)
VALUES (10), (20), (30), (40), (50);

SELECT * FROM test;
SELECT * FROM testLog
GO
The following statements work fine. The correct context_info() is saved in the log table.
-- Separate INSERT and UPDATE statements, each preceded by its own
-- SET CONTEXT_INFO; rolled back so the demo leaves no data behind.
begin tran
declare @newid uniqueidentifier = newid()
--
set context_info @newid
print @newid
insert test(x) values (1)
set context_info @newid
update test set x = 2 where id = 1
SELECT * FROM dbo.testLog;
rollback
go
However, only insert part of the Merge got the value in context_info()?
-- Same demo with one MERGE that both updates (ids 1, 2) and inserts (id 6);
-- context_info is set only once, before the MERGE. Rolled back at the end.
begin tran
declare @newid uniqueidentifier = newid()
--
set context_info @newid
print @newid;
with v as (select * from (values (1, 11), (2, 22), (6, 66)) v (id, x))
merge test as t using v on t.id = v.id
when matched then update set x = v.x
when not matched by target then insert (x) values (x);
SELECT * FROM dbo.testLog;
rollback
go
The uid of the last two updates got zeros.
Don't set context_info to zero in the trigger. Why would you do that in the first place - it is not the trigger's responsibility to "clean up". The merge statement will cause the trigger to execute for inserts separately from updates. Did you not notice the multiple "prints" in the results pane? That should have been a big clue.

How to chunk updates to SQL Server?

I want to update a table in SQL Server by setting a FLAG column to 1 for all values since the beginning of the year:
TABLE
DATE ID FLAG (more columns...)
2016/01/01 1 0 ...
2016/01/01 2 0 ...
2016/01/02 3 0 ...
2016/01/02 4 0 ...
(etc)
Problem is that this table contains hundreds of millions of records and I've been advised to chunk the updates 100,000 rows at a time to avoid blocking other processes.
I need to remember which rows I update because there are background processes which immediately flip the FLAG back to 0 once they're done processing it.
Does anyone have suggestions on how I can do this?
Each day's worth of data has over a million records, so I can't simply loop using the DATE as a counter. I am thinking of using the ID
Assuming the date column and the ID column are sequential you could do a simple loop. By this I mean that if there is a record id=1 and date=2016-1-1 then record id=2 date=2015-12-31 could not exist. If you are worried about locks/exceptions you should add a transaction in the WHILE block and commit or rollback on failure.
Change the @batchSize to whatever you feel is right after some experimentation.
-- Flag every row dated 2016-01-01 or later, one fixed-size ID window at a time.
-- Relies on the stated assumption that ID and DATE increase together.
DECLARE @currentId int, @maxId int, @batchSize int = 10000;

SELECT @currentId = MIN(ID), @maxId = MAX(ID)
FROM YOURTABLE
WHERE DATE >= '2016-01-01';

-- "<=" so a range holding a single ID is still processed; if no row
-- qualifies both variables are NULL and the loop body never runs.
WHILE @currentId <= @maxId
BEGIN
    -- Half-open window [@currentId, @currentId + @batchSize): consecutive
    -- batches never share a boundary row, so no row is set to 1 twice.
    UPDATE YOURTABLE
    SET FLAG = 1
    WHERE ID >= @currentId
      AND ID < @currentId + @batchSize;

    SET @currentId = @currentId + @batchSize;
END
As the update will never flag the same record to 1 twice, I do not see a need to track which records were touched unless you are going to manually stop the process partway through.
You should also ensure that the ID column has an index on it so the retrieval is fast in each update statement.
Looks like a simple question or maybe I'm missing something.
You can create a temp/permanent table to keep track of updated rows.
-- Tracking table holding the highest Id processed so far.
create table tbl (Id int) -- or temp table based on your case
insert into tbl (Id) values (0)

declare @lastId int = (select Id from tbl)
-- Ids touched by this batch, used to advance the bookmark accurately
declare @updated table (Id int)

-- Next 100000 rows after the bookmark, in Id order
;with cte as (
    select top (100000) Id, Flag
    from YourMainTable
    where Id > @lastId
    order by Id
)
update cte
set Flag = 1
output inserted.Id into @updated (Id)

-- Advance to the highest Id actually updated. Adding a fixed 100000 would be
-- wrong whenever the Id sequence has gaps. If nothing was updated the
-- bookmark stays where it is.
update tbl
set Id = coalesce((select max(Id) from @updated), @lastId)
You can do this process in a loop (except the table creation part)
-- Stage the ids to process and number them sequentially so they can be
-- walked in fixed-size batches.
create table #tmp_table
(
    id int,
    [row_number] int
)

insert into #tmp_table
(
    id,
    [row_number]
)
--logic to load records from base table
select
    bt.id,
    -- numbered across the whole set; partitioning by id would give 1 for
    -- every distinct id
    row_number() over(order by bt.id) as [row_number]
from
    dbo.base_table bt
where
    1 = 1 -- placeholder: replace with your logic to limit the records

declare @batch_size int = 100000;
declare @start_row_number int, @end_row_number int;

select
    @start_row_number = min([row_number]),
    @end_row_number = max([row_number])
from
    #tmp_table

-- "<=" so the final (or only) row number is processed as well
while (@start_row_number <= @end_row_number)
begin
    -- The half-open row-number window limits each statement to at most
    -- @batch_size staged ids, so no TOP clause is needed and batches never overlap.
    update
        bt
    set
        bt.flag = 1
    from
        dbo.base_table bt
        inner join #tmp_table tt on
            tt.id = bt.id
    where
        tt.[row_number] >= @start_row_number
        and tt.[row_number] < @start_row_number + @batch_size

    set @start_row_number = @start_row_number + @batch_size
end

Using merge..output to get mapping between source.id and target.id

Very simplified, I have two tables Source and Target.
-- Source ids are 1, 3, ... and target ids 2, 4, ... so the two can never
-- coincide in the demo output.
declare @Source table (SourceID int identity(1,2), SourceName varchar(50))
declare @Target table (TargetID int identity(2,2), TargetName varchar(50))
insert into @Source values ('Row 1'), ('Row 2')
I would like to move all rows from @Source to @Target and know the TargetID for each SourceID, because there are also the tables SourceChild and TargetChild that need to be copied as well, and I need to add the new TargetID into the TargetChild.TargetID FK column.
There are a couple of solutions to this.
Use a while loop or cursors to insert one row (RBAR) to Target at a time and use scope_identity() to fill the FK of TargetChild.
Add a temp column to @Target and insert SourceID. You can then join that column to fetch the TargetID for the FK in TargetChild.
SET IDENTITY_INSERT OFF for @Target and handle assigning new values yourself. You get a range that you then use in TargetChild.TargetID.
I'm not all that fond of any of them. The one I used so far is cursors.
What I would really like to do is to use the output clause of the insert statement.
-- Desired form. It does not compile: the OUTPUT clause of this INSERT
-- references S.SourceID, which produces the binding error quoted below.
insert into @Target(TargetName)
output inserted.TargetID, S.SourceID
select SourceName
from @Source as S
But it is not possible
The multi-part identifier "S.SourceID" could not be bound.
But it is possible with a merge.
-- "on 0=1" never matches, so every source row takes the insert branch, and
-- the OUTPUT clause pairs each new TargetID with its SourceID.
merge @Target as T
using @Source as S
on 0=1
when not matched then
    insert (TargetName) values (SourceName)
output inserted.TargetID, S.SourceID;
Result
TargetID SourceID
----------- -----------
2 1
4 3
I want to know if you have used this? If you have any thoughts about the solution or see any problems with it? It works fine in simple scenarios but perhaps something ugly could happen when the query plan get really complicated due to a complicated source query. Worst scenario would be that the TargetID/SourceID pairs actually isn't a match.
MSDN has this to say about the from_table_name of the output clause.
Is a column prefix that specifies a table included in the FROM clause of a DELETE, UPDATE, or MERGE statement that is used to specify the rows to update or delete.
For some reason they don't say "rows to insert, update or delete" only "rows to update or delete".
Any thoughts are welcome and totally different solutions to the original problem is much appreciated.
In my opinion this is a great use of MERGE and output. I've used in several scenarios and haven't experienced any oddities to date.
For example, here is test setup that clones a Folder and all Files (identity) within it into a newly created Folder (guid).
-- Test setup: one folder, one file, and the link row between them.
DECLARE @FolderIndex TABLE (FolderId UNIQUEIDENTIFIER PRIMARY KEY, FolderName varchar(25));
INSERT INTO @FolderIndex
    (FolderId, FolderName)
    VALUES(newid(), 'OriginalFolder');

DECLARE @FileIndex TABLE (FileId int identity(1,1) PRIMARY KEY, FileName varchar(10));
INSERT INTO @FileIndex
    (FileName)
    VALUES('test.txt');

DECLARE @FileFolder TABLE (FolderId UNIQUEIDENTIFIER, FileId int, PRIMARY KEY(FolderId, FileId));
INSERT INTO @FileFolder
    (FolderId, FileId)
    SELECT  FolderId,
            FileId
    FROM    @FolderIndex
    CROSS JOIN @FileIndex;  -- just to illustrate

-- Old-id to new-id maps filled by the OUTPUT clauses below
DECLARE @sFolder TABLE (FromFolderId UNIQUEIDENTIFIER, ToFolderId UNIQUEIDENTIFIER);
DECLARE @sFile TABLE (FromFileId int, ToFileId int);

-- copy Folder Structure
-- Dummy is always 1, so "d.Dummy = 0" never matches and every source row is inserted.
MERGE @FolderIndex fi
USING   (   SELECT  1 [Dummy],
                    FolderId,
                    FolderName
            FROM    @FolderIndex [fi]
            WHERE   FolderName = 'OriginalFolder'
        ) d ON d.Dummy = 0
WHEN NOT MATCHED
THEN INSERT
    (FolderId, FolderName)
    VALUES (newid(), 'copy_'+FolderName)
OUTPUT  d.FolderId,
        INSERTED.FolderId
INTO    @sFolder (FromFolderId, toFolderId);

-- copy File structure
MERGE @FileIndex fi
USING   (   SELECT  1 [Dummy],
                    fi.FileId,
                    fi.[FileName]
            FROM    @FileIndex fi
            INNER
            JOIN    @FileFolder fm ON
                    fi.FileId = fm.FileId
            INNER
            JOIN    @FolderIndex fo ON
                    fm.FolderId = fo.FolderId
            WHERE   fo.FolderName = 'OriginalFolder'
        ) d ON d.Dummy = 0
WHEN NOT MATCHED
THEN INSERT ([FileName])
    VALUES ([FileName])
OUTPUT  d.FileId,
        INSERTED.FileId
INTO    @sFile (FromFileId, toFileId);

-- link new files to Folders
INSERT INTO @FileFolder (FileId, FolderId)
    SELECT  sfi.toFileId, sfo.toFolderId
    FROM    @FileFolder fm
    INNER
    JOIN    @sFile sfi ON
            fm.FileId = sfi.FromFileId
    INNER
    JOIN    @sFolder sfo ON
            fm.FolderId = sfo.FromFolderId

-- return
SELECT  *
FROM    @FileIndex fi
JOIN    @FileFolder ff ON
        fi.FileId = ff.FileId
JOIN    @FolderIndex fo ON
        ff.FolderId = fo.FolderId
I would like to add another example alongside @Nathan's example, as I found it somewhat confusing.
Mine uses real tables for the most part, and not temp tables.
I also got my inspiration from here: another example
-- Copy the FormSectionInstance
DECLARE #FormSectionInstanceTable TABLE(OldFormSectionInstanceId INT, NewFormSectionInstanceId INT)
;MERGE INTO [dbo].[FormSectionInstance]
USING
(
SELECT
fsi.FormSectionInstanceId [OldFormSectionInstanceId]
, #NewFormHeaderId [NewFormHeaderId]
, fsi.FormSectionId
, fsi.IsClone
, #UserId [NewCreatedByUserId]
, GETDATE() NewCreatedDate
, #UserId [NewUpdatedByUserId]
, GETDATE() NewUpdatedDate
FROM [dbo].[FormSectionInstance] fsi
WHERE fsi.[FormHeaderId] = #FormHeaderId
) tblSource ON 1=0 -- use always false condition
WHEN NOT MATCHED
THEN INSERT
( [FormHeaderId], FormSectionId, IsClone, CreatedByUserId, CreatedDate, UpdatedByUserId, UpdatedDate)
VALUES( [NewFormHeaderId], FormSectionId, IsClone, NewCreatedByUserId, NewCreatedDate, NewUpdatedByUserId, NewUpdatedDate)
OUTPUT tblSource.[OldFormSectionInstanceId], INSERTED.FormSectionInstanceId
INTO #FormSectionInstanceTable(OldFormSectionInstanceId, NewFormSectionInstanceId);
-- Copy the FormDetail
INSERT INTO [dbo].[FormDetail]
(FormHeaderId, FormFieldId, FormSectionInstanceId, IsOther, Value, CreatedByUserId, CreatedDate, UpdatedByUserId, UpdatedDate)
SELECT
#NewFormHeaderId, FormFieldId, fsit.NewFormSectionInstanceId, IsOther, Value, #UserId, CreatedDate, #UserId, UpdatedDate
FROM [dbo].[FormDetail] fd
INNER JOIN #FormSectionInstanceTable fsit ON fsit.OldFormSectionInstanceId = fd.FormSectionInstanceId
WHERE [FormHeaderId] = #FormHeaderId
Here's a solution that doesn't use MERGE (which I've had problems with many times, so I try to avoid it if possible). It relies on two memory tables (you could use temp tables if you want) with IDENTITY columns that get matched, and importantly, using ORDER BY when doing the INSERT, and WHERE conditions that match between the two INSERTs... the first one holds the source IDs and the second one holds the target IDs.
-- Setup... We have a table that we need to know the old IDs and new IDs after copying.
-- We want to copy all of DocID=1
DECLARE @newDocID int = 99;
DECLARE @tbl table (RuleID int PRIMARY KEY NOT NULL IDENTITY(1, 1), DocID int, Val varchar(100));
INSERT INTO @tbl (DocID, Val) VALUES (1, 'RuleA-2'), (1, 'RuleA-1'), (2, 'RuleB-1'), (2, 'RuleB-2'), (3, 'RuleC-1'), (1, 'RuleA-3')
-- Create a break in IDENTITY values.. just to simulate more realistic data
INSERT INTO @tbl (Val) VALUES ('DeleteMe'), ('DeleteMe');
DELETE FROM @tbl WHERE Val = 'DeleteMe';
INSERT INTO @tbl (DocID, Val) VALUES (6, 'RuleE'), (7, 'RuleF');
SELECT * FROM @tbl t;
-- Declare TWO temp tables each with an IDENTITY - one will hold the RuleID of the items we are copying, other will hold the RuleID that we create
DECLARE @input table (RID int IDENTITY(1, 1), SourceRuleID int NOT NULL, Val varchar(100));
DECLARE @output table (RID int IDENTITY(1,1), TargetRuleID int NOT NULL, Val varchar(100));
-- Capture the IDs of the rows we will be copying by inserting them into the @input table
-- Important - we must specify the sort order - best thing is to use the IDENTITY of the source table (t.RuleID) that we are copying
INSERT INTO @input (SourceRuleID, Val) SELECT t.RuleID, t.Val FROM @tbl t WHERE t.DocID = 1 ORDER BY t.RuleID;
-- Copy the rows, and use the OUTPUT clause to capture the IDs of the inserted rows.
-- Important - we must use the same WHERE and ORDER BY clauses as above
-- NOTE(review): this assumes OUTPUT rows reach @output in the same order the rows
-- were inserted; confirm against the OUTPUT clause documentation, and use the Val
-- columns selected below to verify the pairing before relying on it.
INSERT INTO @tbl (DocID, Val)
OUTPUT Inserted.RuleID, Inserted.Val INTO @output(TargetRuleID, Val)
SELECT @newDocID, t.Val FROM @tbl t
WHERE t.DocID = 1
ORDER BY t.RuleID;
-- Now @input and @output should have the same # of rows, and the order of both inserts was the same, so the IDENTITY columns (RID) can be matched
-- Use this as the map from old-to-new when you are copying sub-table rows
-- Technically, @input and @output don't even need the 'Val' columns, just RID and RuleID - they were included here to prove that the rules matched
SELECT i.*, o.* FROM @output o
INNER JOIN @input i ON i.RID = o.RID
-- Confirm the matching worked
SELECT * FROM @tbl t

Merge statement on a single record table

I need to write a single statement to insert or update a record in a single record table
the merge statement allows me to write this:
-- Single-row upsert demo against a table intended to hold one record.
CREATE TABLE t1 (n int)
-- insert into t1 (n) Values (1); -- uncomment to test the matched branch

-- ON 1 = 1 pairs the source row with any existing target row: the row is
-- updated when present, and a row is inserted when the table is empty.
MERGE INTO t1 AS P
USING (SELECT 3 AS n) AS S
    ON 1 = 1
WHEN MATCHED THEN
    UPDATE SET n = S.n
WHEN NOT MATCHED BY TARGET THEN
    INSERT (n) VALUES (S.n);

SELECT * FROM t1
This works, but I think the purpose of the 1=1 condition is not very easy to understand.
Is there a different syntax to insert a record when the table is empty or update the record when it does already exist?
The other option would be to do it the old fashioned way.
-- Two-step upsert: insert the row when t1 is empty, otherwise overwrite n.
IF NOT EXISTS (SELECT 1 FROM t1)
    INSERT INTO t1 (n) VALUES (3)
ELSE
    UPDATE t1 SET n = 3
Replace
ON 1 = 1
with
ON S.n = P.n
Example of recent procedure I wrote to either update an existing row or insert a new row.
Table has the same structure as MembershipEmailFormat the table variable.
Found it easiest to create a table variable to be the source in the USING clause. I realize that the main purpose of MERGE statements really is merging multiple rows between two tables. My use case is that I need to insert a new email address for a user or modify an existing email address.
-- Inserts or updates the HTML-format preference for one (UserID, Email) pair
-- in dbo.MembershipEmailFormat.
--   @UserID  : user the e-mail address belongs to
--   @Email   : e-mail address; part of the match key together with @UserID
--   @UseHTML : value stored in HtmlFormat
-- Also returns the staged input row as a result set.
CREATE PROCEDURE [dbo].[usp_user_merge_emailformat]
    @UserID UNIQUEIDENTIFIER,
    @Email varchar(256),
    @UseHTML bit
AS
BEGIN
    --SELECT @UserID='04EFF187-AEAC-408E-9FA8-284B31890FBD',
    --       @Email='person@xxxx.com',
    --       @UseHTML=0

    -- Single-row staging table used as the MERGE source
    DECLARE @temp TABLE
    (
        UserID UNIQUEIDENTIFIER,
        Email varchar(256),
        HtmlFormat bit
    );

    INSERT INTO @temp (UserID, Email, HtmlFormat)
    VALUES (@UserID, @Email, @UseHTML);

    SELECT UserID, Email, HtmlFormat FROM @temp;

    MERGE dbo.MembershipEmailFormat AS t
    USING @temp AS s
        ON (t.UserID = s.UserID AND t.Email = s.Email)
    WHEN MATCHED THEN
        UPDATE SET t.HtmlFormat = s.HtmlFormat
    WHEN NOT MATCHED THEN
        -- explicit column list; the target is described as having the same
        -- structure as @temp
        INSERT (UserID, Email, HtmlFormat)
        VALUES (s.UserID, s.Email, s.HtmlFormat);
END

Resources