Snowsql MERGE DUP Insert issue - snowflake-cloud-data-platform

Hi SO Community I have a Proc that is migrate from tsql into Snowsql . I was doing testing with full table truncate and just loading the data BUT now we are pushing the code into testing phases AND it seems my MERGE has a bug somewhere.
Non-Tech side : A person can have multiple rows of discount of same type on one item. They are caught by [rtrans_lineitm_seq] . So example is you can buy 20 car filters each one shows up as a new line number on the printed receipt . AND if it is a weekly special a discount is applied to each item. That said let just jump into the tech side...
So here is the Snowsql proc code. and dummy record that is causing issue will be posted after the code below.
MERGE INTO DISCOUNT_2 tgt
USING (
select to_varchar(concat(organization_id,rtl_loc_id,to_varchar(replace(to_date(business_date),'-','')),trans_seq,wkstn_id,IFNULL(rtrans_lineitm_seq,0),IFNULL(rtl_price_mod_seq_nbr,0),replace(IFNULL(rtl_price_mod_reasoncode,''),' ',''),replace(IFNULL(discount_code,''),' ',''),replace(IFNULL(deal_id,''),' ','') ) ) as Unique_id
,*
,'PROC_1' AS PROC_NUMBER
,'LOYALTY_2' AS PROC_NAME
,1 as EXECUTION_NUMBER
,current_timestamp(2) as LAST_PROC_EXECUTION_DATE
from (
select 2200 as organization_id
,TH.STORE_NO as rtl_loc_id
,TH.Date as business_date
,TH.TRANSACTION_NO as trans_seq
,case when left(TH.POS_TERMINAL_NO,2) = 'NP' then 1
when left(TH.POS_TERMINAL_NO,2) = 'NS' then 2
when left(TH.POS_TERMINAL_NO,2) = 'NT' then 3 else 0 end as wkstn_id
,TSE.LINE_NO as rtrans_lineitm_seq
,ROW_NUMBER() over (partition by TSE.STORE_NO,TSE.Date,TSE.TRANSACTION_NO,TSE.POS_TERMINAL_NO order by TSE.STORE_NO,TSE.Date,TSE.TRANSACTION_NO,TSE.POS_TERMINAL_NO,TSE.DISCOUNT_AMOUNT ) AS rtl_price_mod_seq_nbr
,CAST(CONCAT(trim(substring(TH.DATE,0,charindex(':',TH.DATE)-4)), ' ' ,substring(TH.Time,charindex(':',TH.Time)-2,length(TH.Time)))AS TIMESTAMP_NTZ(9)) as create_date
,trim(CONCAT('LOYALTY',' ',ifnull(TIE.Information,''))) AS rtl_price_mod_reasoncode
,IFNULL(PERIODIC_DISC_GROUP,TIE.Information) AS discount_code
,IFNULL(abs(TSE.DISCOUNT_AMOUNT),0) AS deal_amt
,null as deal_id
,TH.STAFF_ID as create_user
,null as sales_agt_com
,null as serial_number
from HEADER as TH
join SALES_ENTRY TSE on TSE.TRANSACTION_NO = TH.TRANSACTION_NO and TSE.STORE_NO = TH.STORE_NO and TSE.POS_TERMINAL_NO = TH.POS_TERMINAL_NO
left join CODE_ENTRY TIE on TIE.TRANSACTION_NO = TH.TRANSACTION_NO and TIE.STORE_NO = TH.STORE_NO and TIE.POS_TERMINAL_NO = TH.POS_TERMINAL_NO and TIE.LINE_NO = TSE.LINE_NO
where TH.TRANSACTION_TYPE = 2
and TH.ENTRY_STATUS not in (1,3)
and TIE.TRANSACTION_TYPE = 1
and TIE.INFOCODE = 'LOYALTY'
and TIE.INFORMATION not in ('PPP EXCLUSIVE','PPP Points Discount')
and TH.TRANSACTION_NO >= 20000000
) as a where to_varchar(concat(organization_id,rtl_loc_id,to_varchar(replace(to_date(business_date),'-','')),trans_seq,wkstn_id,IFNULL(rtrans_lineitm_seq,0),IFNULL(rtl_price_mod_seq_nbr,0),replace(IFNULL(rtl_price_mod_reasoncode,''),' ',''),replace(IFNULL(discount_code,''),' ',''),replace(IFNULL(deal_id,''),' ','') ) )
= '2200710320210826200121721126LOYALTYPPPSENIORDISCPPPSENIORDISC'
) AS src
ON (//tgt.Unique_id = src.Unique_id
to_varchar(concat(tgt.organization_id,tgt.rtl_loc_id,to_varchar(replace(to_date(tgt.business_date),'-','')),tgt.trans_seq,tgt.wkstn_id,IFNULL(tgt.rtrans_lineitm_seq,0),IFNULL(tgt.rtl_price_mod_seq_nbr,0),replace(IFNULL(tgt.rtl_price_mod_reasoncode,''),' ',''),replace(IFNULL(tgt.discount_code,''),' ',''),replace(IFNULL(tgt.deal_id,''),' ','') ) ) = src.Unique_id
)
WHEN NOT MATCHED THEN INSERT ( tgt.Unique_id
,tgt.organization_id
,tgt.rtl_loc_id
,tgt.business_date
,tgt.trans_seq
,tgt.wkstn_id
,tgt.rtrans_lineitm_seq
,tgt.rtl_price_mod_seq_nbr
,tgt.create_date
,tgt.rtl_price_mod_reasoncode
,tgt.discount_code
,tgt.deal_amt
,tgt.deal_id
,tgt.create_user
,tgt.sales_agt_com
,tgt.serial_number
,tgt.PROC_NUMBER
,tgt.PROC_NAME
,tgt.EXECUTION_NUMBER
,tgt.LAST_PROC_EXECUTION_DATE
)
values ( src.Unique_id
,src.organization_id
,src.rtl_loc_id
,src.business_date
,src.trans_seq
,src.wkstn_id
,src.rtrans_lineitm_seq
,src.rtl_price_mod_seq_nbr
,src.create_date
,src.rtl_price_mod_reasoncode
,src.discount_code
,src.deal_amt
,src.deal_id
,src.create_user
,src.sales_agt_com
,src.serial_number
,src.PROC_NUMBER
,src.PROC_NAME
,src.EXECUTION_NUMBER
,src.LAST_PROC_EXECUTION_DATE )
WHEN MATCHED THEN UPDATE SET
/* tgt.organization_id = SRC.organization_id
,tgt.rtl_loc_id = SRC.rtl_loc_id
,tgt.business_date = SRC.business_date
,tgt.trans_seq = SRC.trans_seq
,tgt.wkstn_id = SRC.wkstn_id
,tgt.rtrans_lineitm_seq = SRC.rtrans_lineitm_seq
,tgt.rtl_price_mod_seq_nbr = SRC.rtl_price_mod_seq_nbr
,tgt.create_date = SRC.create_date
,tgt.rtl_price_mod_reasoncode = SRC.rtl_price_mod_reasoncode
,tgt.discount_code = SRC.discount_code
,tgt.deal_amt = SRC.deal_amt
,tgt.deal_id = SRC.deal_id
,tgt.create_user = SRC.create_user
,tgt.sales_agt_com = SRC.sales_agt_com
,tgt.serial_number = SRC.serial_number
,tgt.PROC_NUMBER = SRC.PROC_NUMBER
,tgt.PROC_NAME = SRC.PROC_NAME ,*/
tgt.EXECUTION_NUMBER = (SRC.EXECUTION_NUMBER + 1)
,tgt.LAST_PROC_EXECUTION_DATE = current_timestamp(2)
Here is the sample row that I'm testing with
UNIQUE_ID ORGANIZATION_ID RTL_LOC_ID BUSINESS_DATE TRANS_SEQ WKSTN_ID RTRANS_LINEITM_SEQ RTL_PRICE_MOD_SEQ_NBR CREATE_DATE RTL_PRICE_MOD_REASONCODE DISCOUNT_CODE DEAL_AMT DEAL_ID CREATE_USER SALES_AGT_COM SERIAL_NUMBER PROC_NUMBER PROC_NAME EXECUTION_NUMBER LAST_PROC_EXECUTION_DATE
2200710320210826200121721126LOYALTYPPPSENIORDISCPPPSENIORDISC, 2200, 7103, 2021-08-26, 20012172, 1, 1, 26, 29:12.4, LOYALTY PPP SENIOR DISC PPP SENIOR DISC, 0.22, ST7103, 00, PROC_1, LOYALTY_2, 4, 21:53.9,
Error I get
Duplicate row detected during DML action Row Values: ["2200710320210826200121721126LOYALTYPPPSENIORDISCPPPSENIORDISC", 2200, "7103", 18865, 20012172, 1, 1, 26, 1630009752450000000, "LOYALTY PPP SENIOR DISC", "PPP SENIOR DISC", 2200, NULL, "ST7103 00", NULL, NULL, "PROC_1", "LOYALTY_2", 1, 1642853936960000000]
My question is: 9 TIMES OUT OF 10 WHY WOULD THIS GET CAUGHT IN THE "NOT MATCH INSERT" PART on n-th time VS NOT just being moving to the "WHEN MATCHED UPDATE" part???
as you can see in the sample row above. I was able to run the code successfully 4 times BUT WHEN I TRIED TO RUN IT A 5th time it failed with error...
there ARE OTHER CASES I can share if need be.
Any help would be great.
Thanks.

The duplicate exists on the source side and makes it undeterministic.
This behaviour is described in documentation:
Duplicate Join Behavior:
When a merge joins a row in the target table against multiple rows in the source, the following join conditions produce nondeterministic results (i.e. the system is unable to determine the source value to use to update or delete the target row)
In this situation, the outcome of the merge depends on the value specified for the ERROR_ON_NONDETERMINISTIC_MERGE session parameter:
If TRUE (default value), the merge returns an error.
If FALSE, one row from among the duplicates is selected to perform the update or delete; the row selected is not defined.
...
To avoid errors when multiple rows in the data source (i.e. the source table or subquery) match the target table based on the ON condition, use GROUP BY in the source clause to ensure that each target row joins against one row (at most) in the source.
Option number 1:Using session parameter(it is quick fix that will mask the duplicate error but choose source row in undefined manner):
ALTER SESSION SET ERROR_ON_NONDETERMINISTIC_MERGE = FALSE;
Option number 2:
Identify why they are duplicates in the source and change USING part. To find duplicates QUALIFY COUNT(*) OVER(PARTITION BY Unique_id) > 1; is the fastest option:
select to_varchar(concat(organization_id,rtl_loc_id,to_varchar(replace(to_date(business_date),'-','')),trans_seq,wkstn_id,IFNULL(rtrans_lineitm_seq,0),IFNULL(rtl_price_mod_seq_nbr,0),replace(IFNULL(rtl_price_mod_reasoncode,''),' ',''),replace(IFNULL(discount_code,''),' ',''),replace(IFNULL(deal_id,''),' ','') ) ) as Unique_id
,*
,'PROC_1' AS PROC_NUMBER
,'LOYALTY_2' AS PROC_NAME
,1 as EXECUTION_NUMBER
,current_timestamp(2) as LAST_PROC_EXECUTION_DATE
from (
select 2200 as organization_id
,TH.STORE_NO as rtl_loc_id
,TH.Date as business_date
,TH.TRANSACTION_NO as trans_seq
,case when left(TH.POS_TERMINAL_NO,2) = 'NP' then 1
when left(TH.POS_TERMINAL_NO,2) = 'NS' then 2
when left(TH.POS_TERMINAL_NO,2) = 'NT' then 3 else 0 end as wkstn_id
,TSE.LINE_NO as rtrans_lineitm_seq
,ROW_NUMBER() over (partition by TSE.STORE_NO,TSE.Date,TSE.TRANSACTION_NO,TSE.POS_TERMINAL_NO order by TSE.STORE_NO,TSE.Date,TSE.TRANSACTION_NO,TSE.POS_TERMINAL_NO,TSE.DISCOUNT_AMOUNT ) AS rtl_price_mod_seq_nbr
,CAST(CONCAT(trim(substring(TH.DATE,0,charindex(':',TH.DATE)-4)), ' ' ,substring(TH.Time,charindex(':',TH.Time)-2,length(TH.Time)))AS TIMESTAMP_NTZ(9)) as create_date
,trim(CONCAT('LOYALTY',' ',ifnull(TIE.Information,''))) AS rtl_price_mod_reasoncode
,IFNULL(PERIODIC_DISC_GROUP,TIE.Information) AS discount_code
,IFNULL(abs(TSE.DISCOUNT_AMOUNT),0) AS deal_amt
,null as deal_id
,TH.STAFF_ID as create_user
,null as sales_agt_com
,null as serial_number
from HEADER as TH
join SALES_ENTRY TSE on TSE.TRANSACTION_NO = TH.TRANSACTION_NO and TSE.STORE_NO = TH.STORE_NO and TSE.POS_TERMINAL_NO = TH.POS_TERMINAL_NO
left join CODE_ENTRY TIE on TIE.TRANSACTION_NO = TH.TRANSACTION_NO and TIE.STORE_NO = TH.STORE_NO and TIE.POS_TERMINAL_NO = TH.POS_TERMINAL_NO and TIE.LINE_NO = TSE.LINE_NO
where TH.TRANSACTION_TYPE = 2
and TH.ENTRY_STATUS not in (1,3)
and TIE.TRANSACTION_TYPE = 1
and TIE.INFOCODE = 'LOYALTY'
and TIE.INFORMATION not in ('PPP EXCLUSIVE','PPP Points Discount')
and TH.TRANSACTION_NO >= 20000000
) as a where to_varchar(concat(organization_id,rtl_loc_id,to_varchar(replace(to_date(business_date),'-','')),trans_seq,wkstn_id,IFNULL(rtrans_lineitm_seq,0),IFNULL(rtl_price_mod_seq_nbr,0),replace(IFNULL(rtl_price_mod_reasoncode,''),' ',''),replace(IFNULL(discount_code,''),' ',''),replace(IFNULL(deal_id,''),' ','') ) )
= '2200710320210826200121721126LOYALTYPPPSENIORDISCPPPSENIORDISC'
QUALIFY COUNT(*) OVER(PARTITION BY Unique_id) > 1;
If the query returns more than one row it means source query is not producing unique_id and requires redesign.

Related

Second level lookup with SQL statement

How do I write a SQL statement that does a second level lookup only if first is not matched. For example:
In the below query, if my SEDOLCode condition does not return a record, proceed to lookup with condition 2 with RICCode.
select
*, GETDATE()
from
Securities sec
where
sec.SEDOLCode = 'ABCDEF'
or sec.RICCode = '002815.SZ'
This query is returning two different records - for example:
1234 ABCDEF DUMY906.X
5675 EFTFS 002815.SZ
I am taking data from a file to update the Pricetable as below. I want to use SedolCode as primary lookup.
IF ##ROWCOUNT = 0
INSERT INTO dbo.Price (sec.SecurityID, ClosingPrice, UpdatedDate, UpdatedByUser, Priced)
SELECT
..., GETDATE()
FROM
Securities sec
WHERE
sec.SEDOLCode = #SedolCode
OR sec.RICCode = #RicCode
Try this the logic is basically if the sedolcode is found then it will only meet the first condition. Otherwise the count of that sedolcolde will be 0 and it will look at riccode.
select
*, GETDATE()
from
Securities sec
where
sec.sedolcode = 'ABCDEF'
OR ((SELECT COUNT(1) FROM securites WHERE sedolcode ='ABCDEF') = 0 AND sec.riccode = '002815.SZ')
Ah - reminds me of my FTSE days........
Match Sedol and Not Ric
or
Ric and Not Sedol and use myOrdering & TOP to get the first.
INSERT INTO dbo.Price
(
sec.SecurityID
, ClosingPrice
, UpdatedDate
, UpdatedByUser
, Priced
)
SELECT TOP 1 [specifiy fields to insert]
FROM
(
select 1 as myOrdering ...
, GETDATE()
from Securities sec
WHERE
(sec.RICCode = #RicCode AND sec.SEDOLCode != #SedolCode)
UNION
select 2 as myOrdering ...
, GETDATE()
from Securities sec
WHERE
(sec.RICCode = #RicCode AND sec.SEDOLCode != #SedolCode)
)SUB_Q ORDER BY myOrdering

Update latest record with data from an older record

I have a table with newspaper subscribers:
Subscribers:
==============
ID INT,
Status,
Address,
IndexAddress,
StartDate,
EndDate,
SubscriberID,
PaperID
IndexAddress is a reference to my internal Address table where I keep "correct" addresses (you woulnd't believe how many people don't know where they live). Address is the address supplied by the customer.
Each time a subscriber ends his subscription I save the data and when he renews his subscription I want to re-fetch the old IndexAddress from the old subscrption line in my table.
The data in the database can look like this:
1 1 MyLocalAddress 13455 20160101 20160501 100 5
8 1 MyLocalAddress 13455 20160820 20161201 100 5
14 1 MyLocalAddress 13455 20161228 20170107 100 5
18 0 MyLocalAddress NULL 20170109 NULL 100 5
So ID 1, has status 1, a local address, pointing to address 13455 in my internal system, started 160101 and ended 160501 with customer number 100 and paper number 5.
The last row, ID 18 has just arrived in the database, I want to make sure I automatically find the IndexAddress number so I don't have to match it by hand, but I also want to make absolutlely sure that I fetch the information from the row with ID 14 since the older information in the database MIGHT be wrong (in this case it isn't but it might).
Here is my SQL to fix this:
UPDATE s SET
Status = s2.Status,
IndexAddress = s2.IndexAddress
FROM dbo.Subscribers s
JOIN dbo.Subscribers s2 ON s2.SubscriberID = s.SubscriberID
WHERE 1 = 1
AND s.Status <> s2.Status
AND s2.Status = 1
AND s2.ID IN
(
SELECT
MAX(s3.ID)
FROM dbo.Subscribers s3
WHERE 1 = 1
AND s3.SubscriberID = s.SubscriberID
AND s3.PaperID = s.PaperID
AND s3.Status = 1
AND s3.ID <> s.ID
)
-- Make sure it's the same customer. Customer number is checked in
-- join above.
AND s.PaperID = s2.PaperID
AND s.Address = s2.Address
This works, but I wanted to know if the subquery approach was the best solution or is there a better approach?
I would like to deepen my understand of MS SQL and thus my questions.
I think your query is way over complicated:
with toupdate as (
select s.*,
lag(address) over (partition by subscriberid, paperid order by id) as prev_address,
lag(status) over (partition by subscriberid, paperid order by id) as prev_status
from dbo.Subscribers s
)
update toupdate
set address = prev_address,
status = prev_status
where address is null;
This is not the answer you're looking for but it's not really suitable for a comment. I don't really agree with the design of the tables as you have redundant data. You shouldn't have to repeat data for address and indexaddress in Subscribers or do updates like you are doing.
I would suggest a design something like the below that would avoid you having to do updates like the one you are doing. The below code is re-runnable, so you can run and modify if required to test it.
-- user level information with 1 row per user - address should be linked here
CREATE TABLE #user
(
id INT ,
name NVARCHAR(20) ,
indexAddress INT
)
-- all subscriptions - with calculated status compared to current date
CREATE TABLE #subscription
(
id INT ,
startDate DATETIME ,
endDate DATETIME ,
staus AS CASE WHEN endDate < GETDATE() THEN 1
ELSE 0
END
)
-- table to link users with their subscriptions
CREATE TABLE #userSubscription
(
userId INT ,
subscriptionId INT
)
INSERT INTO #user
( id, name, indexAddress )
VALUES ( 1, N'bob', 13455 ),
( 2, 'dave', 55332 )
INSERT INTO #subscription
( id, startDate, endDate )
VALUES ( 1, '20160101', '20160201' ),
( 8, '20160820', '20161201' ),
( 14, '20161228', '20170107' ),
( 18, '20170109', NULL ),
( 55, '20170101', NULL );
INSERT INTO #userSubscription
( userId, subscriptionId )
VALUES ( 1, 1 ) ,
( 1, 8 ) ,
( 1, 14 ) ,
( 1, 18 ) ,
( 2, 55 );
-- show active users
SELECT u.name ,
u.indexAddress ,
us.userId ,
us.subscriptionId ,
s.startDate ,
s.endDate ,
s.staus
FROM #user u
INNER JOIN #userSubscription us ON u.id = us.userId
INNER JOIN #subscription s ON s.id = us.subscriptionId
WHERE s.staus = 0 -- active
-- show inactive users
SELECT u.name ,
u.indexAddress ,
us.userId ,
us.subscriptionId ,
s.startDate ,
s.endDate ,
s.staus
FROM #user u
INNER JOIN #userSubscription us ON u.id = us.userId
INNER JOIN #subscription s ON s.id = us.subscriptionId
WHERE s.staus = 1 -- inactive
-- tidy up
DROP TABLE #subscription
DROP TABLE #user
DROP TABLE #userSubscription

Update records SQL?

First when I started this project seemed very simple. Two tables, field tbl1_USERMASTERID in Table 1 should be update from field tbl2_USERMASTERID Table 2. After I looked deeply in Table 2, there is no unique ID that I can use as a key to join these two tables. Only way to match the records from Table 1 and Table 2 is based on FIRST_NAME, LAST_NAME AND DOB. So I have to find records in Table 1 where:
tbl1_FIRST_NAME equals tbl2_FIRST_NAME
AND
tbl1_LAST_NAME equals tbl2_LAST_NAME
AND
tbl1_DOB equals tbl2_DOB
and then update USERMASTERID field. I was afraid that this can cause some duplicates and some users will end up with USERMASTERID that does not belong to them. So if I find more than one record based on first,last name and dob those records would not be updated. I would like just to skip and leave them blank. That way I wouldn't populate invalid USERMASTERID. I'm not sure what is the best way to approach this problem, should I use SQL or ColdFusion (my server side language)? Also how to detect more than one matching record?
Here is what I have so far:
UPDATE Table1 AS tbl1
LEFT OUTER JOIN Table2 AS tbl2
ON tbl1.dob = tbl2.dob
AND tbl1.fname = tbl2.fname
AND tbl1.lname = tbl2.lname
SET tbl1.usermasterid = tbl2.usermasterid
WHERE LTRIM(RTRIM(tbl1.usermasterid)) = ''
Here is query where I tried to detect duplicates:
SELECT DISTINCT
tbl1.FName,
tbl1.LName,
tbl1.dob,
COUNT(*) AS count
FROM Table1 AS tbl1
LEFT OUTER JOIN Table2 AS tbl2
ON tbl1.dob = tbl2.dob
AND tbl1.FName = tbl2.first
AND tbl1.LName = tbl2.last
WHERE LTRIM(RTRIM(tbl1.usermasterid)) = ''
AND LTRIM(RTRIM(tbl1.first)) <> ''
AND LTRIM(RTRIM(tbl1.last)) <> ''
AND LTRIM(RTRIM(tbl1.dob)) <> ''
GROUP BY tbl1.FName,tbl1.LName,tbl1.dob
Some data after I tested query above:
First Last DOB Count
John Cook 2008-07-11 2
Kate Witt 2013-06-05 1
Deb Ruis 2016-01-22 1
Mike Bennet 2007-01-15 1
Kristy Cruz 1997-10-20 1
Colin Jones 2011-10-13 1
Kevin Smith 2010-02-24 1
Corey Bruce 2008-04-11 1
Shawn Maiers 2016-08-28 1
Alenn Fitchner 1998-05-17 1
If anyone have idea how I can prevent/skip updating duplicate records or how to improve this query please let me know. Thank you.
You could check for and avoid duplicate matches using with common_table_expression (Transact-SQL)
along with row_number()., like so:
with cte as (
select
t.fname
, t.lname
, t.dob
, t.usermasterid
, NewUserMasterId = t2.usermasterid
, rn = row_number() over (partition by t.fname, t.lname, t.dob order by t2.usermasterid)
from table1 as t
inner join table2 as t2 on t.dob = t2.dob
and t.fname = t2.fname
and t.lname = t2.lname
and ltrim(rtrim(t.usermasterid)) = ''
)
--/* confirm these are the rows you want updated
select *
from cte as t
where t.NewUserMasterId != ''
and not exists (
select 1
from cte as i
where t.dob = i.dob
and t.fname = i.fname
and t.lname = i.lname
and i.rn>1
);
--*/
/* update those where only 1 usermasterid matches this record
update t
set t.usermasterid = t.NewUserMasterId
from cte as t
where t.NewUserMasterId != ''
and not exists (
select 1
from cte as i
where t.dob = i.dob
and t.fname = i.fname
and t.lname = i.lname
and i.rn>1
);
--*/
I use the cte to extract out the sub query for readability. Per the documentation, a common table expression (cte):
Specifies a temporary named result set, known as a common table expression (CTE). This is derived from a simple query and defined within the execution scope of a single SELECT, INSERT, UPDATE, or DELETE statement.
Using row_number() to assign a number for each row, starting at 1 for each partition of t.fname, t.lname, t.dob. Having those numbered allows us to check for the existence of duplicates with the not exists() clause with ... and i.rn>1
You could use a CTE to filter out the duplicates from Table1 before joining:
; with CTE as (select *
, count(ID) over (partition by LastName, FirstName, DoB) as IDs
from Table1)
update a
set a.ID = b.ID
from Table2 a
left join CTE b
on a.FirstName = b.FirstName
and a.LastName = b.LastName
and a.Dob = b.Dob
and b.IDs = 1
This will work provided there are no exact duplicates (same demographics and same ID) in table 1. If there are exact duplicates, they will also be excluded from the join, but you can filter them out before the CTE to avoid this.
Please try below SQL:
UPDATE Table1 AS tbl1
INNER JOIN Table2 AS tbl2
ON tbl1.dob = tbl2.dob
AND tbl1.fname = tbl2.fname
AND tbl1.lname = tbl2.lname
LEFT JOIN Table2 AS tbl3
ON tbl3.dob = tbl2.dob
AND tbl3.fname = tbl2.fname
AND tbl3.lname = tbl2.lname
AND tbl3.usermasterid <> tbl2.usermasterid
SET tbl1.usermasterid = tbl2.usermasterid
WHERE LTRIM(RTRIM(tbl1.usermasterid)) = ''
AND tbl3.usermasterid is null

How to combine fields from 2 columns to create a "matrix"?

I have a logging table in my application that only logs changed data, and leaves the other columns NULL. What I'm wanting to do now is create a view that takes 2 of those columns (Type and Status),
and create a resultset that returns the Type and Status on the entry of that log row, assuming that either one or both columns could be null.
For example, with this data:
Type Status AddDt
A 1 7/8/2013
NULL 2 7/7/2013
NULL 3 7/6/2013
NULL NULL 7/5/2013
B NULL 7/4/2013
C NULL 7/3/2013
C 4 7/2/2013
produce the resultset:
Type Status AddDt
A 1 7/8/2013
A 2 7/7/2013
A 3 7/6/2013
A 3 7/5/2013
B 3 7/4/2013
C 3 7/3/2013
C 4 7/2/2013
From there I'm going to figure out the first time in these results the Type and Status meet certain conditions, such as a Type of B and Status 3 (7/4/2013) and ultimately use that date in a calculation, so performance is a huge issue with this.
Here's what I was thinking so far, but it doesn't get me where I need to be:
SELECT
Type.TypeDesc
, Status.StatusDesc
, *
FROM
jw50_Item c
OUTER APPLY (SELECT TOP 10000 * FROM jw50_ItemLog csh WHERE csh.ItemID = c.ItemID AND csh.StatusCode = 'OPN' ORDER BY csh.AddDt DESC) [Status]
OUTER APPLY (SELECT TOP 10000 * FROM jw50_ItemLog cth WHERE cth.ItemID = c.ItemID AND cth.ItemTypeCode IN ('F','FG','NG','PF','SXA','AB') ORDER BY cth.AddDt DESC) Type
WHERE
c.ItemID = #ItemID
So with the help provided below, I was able to get where I needed. Here is my final solution:
SELECT
OrderID
, CustomerNum
, OrderTitle
, ItemTypeDesc
, ItemTypeCode
, StatusCode
, OrdertatusDesc
FROM
jw50_Order c1
OUTER APPLY (SELECT TOP 1 [DateTime] FROM
(SELECT c.ItemTypeCode, c.OrderStatusCode, c.OrderStatusDt as [DateTime] FROM jw50_Order c WHERE c.OrderID = c1.OrderID
UNION
select (select top 1 c2.ItemTypeCode
from jw50_OrderLog c2
where c2.UpdatedDt >= c.UpdatedDt and c2.ItemTypeCode is not null and c2.OrderID = c.OrderID
order by UpdatedDt DESC
) as type,
(select top 1 c2.StatusCode
from jw50_OrderLog c2
where c2.UpdatedDt >= c.UpdatedDt and c2.StatusCode is not null and c2.OrderID = c.OrderID
order by UpdatedDt DESC
) as status,
UpdatedDt as [DateTime]
from jw50_OrderLog c
where c.OrderID = c1.OrderID AND (c.StatusCode IS NOT NULL OR c.ItemTypeCode IS NOT NULL)
) t
WHERE t.ItemTypeCode IN ('F','FG','NG','PF','SXA','AB') AND t.StatusCode IN ('OPEN')
order by [DateTime]) quart
WHERE quart.DateTime <= #FiscalPeriod2 AND c1.StatusCode = 'OPEN'
Order By c1.OrderID
The union is to bring in the current data in addition to the log table data to create the resultset, since the current data maybe what meets the conditions required. Thanks again for the help guys.
Here is an approach that uses correlated subqueries:
select (select top 1 c2.type
from jw50_Item c2
where c2.AddDt >= c.AddDt and c2.type is not null
order by AddDt
) as type,
(select top 1 c2.status
from jw50_Item c2
where c2.AddDt >= c.AddDt and c2.status is not null
order by AddDt
) as status,
(select AddDt
from jw50_Item c
If you have indexes on jw50_item(AddDt, type) and jw50_item(AddDt, status), then the performance should be pretty good.
I suppose you want to "generate a history": for those dates that has some data missing, the next available data should be set.
Something similar should work:
Select i.AddDt, t.Type, s.Status
from Items i
join Items t on (t.addDt =
(select min(t1.addDt)
from Items t1
where t1.addDt >= i.addDt
and t1.Type is not null))
join Items s on (s.addDt =
(select min(s1.addDt)
from Items s1
where s1.addDt >= i.addDt
and s1.status is not null))
Actually I'm joining the base table to 2 secondary tables and the join condition is that we match the smallest row where the respective column in the secondary table is not null (and of course smaller than the current date).
I'm not absolutely sure that it will work, since I don't have an SQL Server in front of me but give it a try :)

SQL query - need to exclude if Requirement NOT met, and exclude if Disqualifier IS met

I have a feeling once i see the solution i'll slap my forehead, but right now I'm not seeing it.
I have a lookup table, say TableB, which looks like this. All fields are INT except the last two which are BOOL.
ID, TableA_ID, Value, Required, Disqualifies
I have a list of TableA_Id values (1, 2, 3 ) etc
For each record in this table, either Required can be true or disqualified can be true - they cant both be true at the same time. They can both be false or null though. There can be duplicate values of TableA_Id but there should never be duplicates of TableA_Id and Value
If required is true for any of those TableA_ID values, and none of those values are in my list, return no records. If none of the values are marked as required (required = 0 or null) then return records UNLESS any of the values are marked as Disqualifies and are in the list, in which case i want to return no records.
So - if a field is required and i dont have it, dont return any records. If a field is marked as disqualified and i have it, don't return any records. Only return a record if either i have a required value or don't have a disqualified value or there are no required values.
I hope I explained myself clearly.
Thanks in advance for pointing me in the right direction.
As an example of what my records might look like:
ID TableA_ID Value Required Disqualifies
-- --------- ----- -------- ------------
1 123 1 True False
2 123 2 True False
3 123 3 False False
4 123 4 False True
5 456 1 False True
6 456 2 False False
Given this set of sample data, if we're working with TableA_Id 123 and my list of values is lets say 1 and 3, i would get data returned because i have a required value and dont have any disqualified values. If my list of values were just 3, i'd get no records since i'm missing of the Required values. If my list of values were 1 and 4, i'd get no records because 4 is marked as disqualified.
Now if we're working with TableA_Id 456, the only list of values that would return any records is 2.
Maybe i should post the whole SQL query - i was trying to keep this short to make it easier for everyone, but it looks like maybe that's not working so well.
Here is the full dynamically generated query. The bit i am working on now is the 2nd line from the bottom. To equate this to my example, t.id would be TableA_ID, Value would be PDT_ID.
SELECT DISTINCT t.ID, t.BriefTitle, stat.Status, lstat.Status AS LocationStatus, st.SType, t.LAgency, l.City, state.StateCode
,( SELECT TOP 1 UserID
FROM TRecruiter
WHERE TrialID = t.ID AND Lead = 1 ), l.ID as LocationID
, l.WebBased
FROM Trial t
INNER JOIN Location l ON t.ID = l.TrialID
FULL JOIN pdt on t.ID = pdt.trialid
FULL JOIN pdm on t.ID = pdm.TrialID
FULL JOIN s on t.ID = s.TrialID
FULL JOIN hy on t.ID = hy.TrialID
FULL JOIN ta on t.ID = ta.TrialID
FULL JOIN stt on t.ID = stt.TrialID
FULL JOIN [Status] stat ON t.StatusID = stat.ID
FULL JOIN st ON t.StudyTypeID = st.ID
FULL JOIN State state ON l.StateID = state.ID
FULL JOIN [Status] lstat ON l.StatusID = lstat.ID
FULL JOIN ts ON t.ID = ts.TrialID
FULL JOIN tpdm ON t.ID = tpdm.TrialID
WHERE ((t.ID IS NOT NULL)
AND (EligibleHealthyVolunteers IS NULL OR EligibleHealthyVolunteers = 1 OR (0 = 0 AND EligibleHealthyVolunteers = 0))
AND (eligiblegenderid is null OR eligiblegenderid = 1 OR eligiblegenderid = 3)
AND ((EligibleMinAge <= 28 AND EligibleMaxAge >= 28) OR (EligibleMinAge <= 28 AND EligibleMaxAge is null) OR (EligibleMinAge IS NULL AND EligibleMaxAge >= 28))
AND (HYID = 6 AND (hy.Disqualify = 0 OR hy.Disqualify IS NULL AND NOT EXISTS (SELECT * FROM hy WHERE t.id = hy.TrialID AND hy.Req =1)) OR HYID = 6 AND hy.req = 1)
AND (PDT_ID IN (1) AND ( pdt.Disqualify = 0 OR pdt.Disqualify IS NULL AND NOT EXISTS (select * from pdt where t.id = pdt.TrialID AND pdt.Req = 1)) OR PDT_ID IN (1) AND (pdt.Req = 1 AND (pdt.Disqualify = 0 or pdt.Disqualify is null )))
) AND ((3959 * acos(cos(radians(34.18)) * cos(radians(l.Latitude)) * cos(radians(l.Longitude) - radians(-118.46)) + sin(radians(34.18)) * sin(radians(l.Latitude)))) <= 300 OR l.Latitude IS NULL) AND t.IsPublished = 1 AND (t.StatusID = 1 OR t.StatusID = 2)
I've changed/shortened some table names just for security/privacy reasons.
Edit:
I think i am close to getting this working, but I'm getting tripped up on the logic again.
I have the following bit of sql:
AND ( exists (SELECT * FROM pdt WHERE Req = 1 AND trialid = t.id AND pdT_ID IN (2) ) AND EXISTS (SELECT * FROM pdt WHERE Req = 1 AND trialid = t.id ) )
I'm not sure how to structure this. Those two exists statement should make the whole thing true in the following combination:
True & False
True & True
False & False
If it's False & True, then the whole thing is false. In other words if there is a Req =1 AND the PDT_ID that is marked as Req=1 is not in our list (in the example above the list just contains '2') then return false.
EDIT:
I think i finally got it.
AND NOT EXISTS (SELECT * FROM pdt WHERE Disqualify = 1 AND trialid = t.id AND PDT_ID IN (2) )
AND NOT ( NOT exists (SELECT * FROM pdt WHERE Req = 1 AND trialid = t.id AND PDT_ID IN (2) ) AND EXISTS (SELECT * FROM pdt WHERE Req = 1 AND trialid = t.id ) )
So far this seems to work in testing. Although I'm only working with two values of PDT_ID. If this does resolve my problem, i will come back and give someone the credit for helping me.
SELECT *
FROM TABLEB B
WHERE
(
B.REQUIRED = 1
AND EXISTS
(
SELECT 1
FROM TABLEA A
WHERE A.ID =B.TABLEA_ID
)
)
OR
(
B.REQUIRED != 1
AND B.DISQUALIFIES <> 1
)
OR
(
B.REQUIRED != 1
AND B.DISQUALIFIES = 1
AND EXISTS
(
SELECT 1
FROM TABLEA A
WHERE A.ID =B.TABLEA_ID
)
)
UPDATE - after the EDIT and explanation from OP:
Change the line
FULL JOIN pdt on t.ID = pdt.trialid
To
FULL JOIN (SELECT * FROM pdt BB WHERE
BB.TrialID IN (SELECT AA.ID FROM Trial AA WHERE AA.ID = BB.TrialID) AND
1 > (SELECT COUNT(*) FROM Trial A
LEFT OUTER JOIN pdt B ON B.Req != 1 AND B.Disqualify != 1 AND B.TrialID = A.ID
WHERE B.TrialID IS NULL)) pdt ON t.ID = pdt.TiralID
AND change the line before last from
AND (PDT_ID IN (1) AND ( pdt.Disqualify = 0 OR pdt.Disqualify IS NULL AND NOT EXISTS (select * from pdt where t.id = pdt.TrialID AND pdt.Req = 1)) OR PDT_ID IN (1) AND (pdt.Req = 1 AND (pdt.Disqualify = 0 or pdt.Disqualify is null )))
To
AND PDT_ID IN (1)
(You seem to have found a solution, yet I've decided to share my thoughts about this problem anyway.)
Given you've got a set of TableA IDs, each of which is accompanied by a set of some values, and you want to test the entire row set against this TableB thing using the rules you've set forth, I think the entire checking process might look like this:
Match every pair of TableA.ID and Value against TableB and get aggregate maximums of Required and Disqualifies for every TableA.ID along the way.
Derive a separate list of TableA_ID values with their corresponding maximum values of Required, from TableB. That will be for us to know whether a particular TableA_ID must have a required value at all.
Match the row set obtained at Stage 1 against the derived table (Stage 2) and check the aggregate values:
1) if the actual aggregate Disqualifies for a TableA_ID is 1, discard this TableA_ID set;
2) if a TableA_ID has a match in the Stage 2 derived table and the aggregate maximum of Required that we obtained at Stage 1 doesn't match the maximum Required in the derived table, discard the set as well.
Something tells me that it would be better at this point to move on to some sort of illustration. Here's a sample script, with comments explaining which part of the script implements which part of the description above:
;
WITH
/* this is the row set to be tested and which
is supposed to contain TableA.IDs and Values */
testedRowSet AS (
SELECT
TableA.ID AS TableA_ID,
SomethingElse.TestedValue AS Value,
...
FROM TableA
JOIN SomethingElse ON some_condition
...
),
/* at this point, we are getting the aggregate maximums
of TableB.Required and TableB.Disqualifies for every
TableA_ID in testedRowSet */
aggregated AS (
SELECT
testedRowSet.TableA_ID,
testedRowSet.Value,
...
DoesHaveRequiredValues = MAX(CASE TableB.Required WHEN 1 THEN 1 ELSE 0 END) OVER (PARTITION BY testedRowSet.TableA_ID),
HasDisqualifyingValues = MAX(CASE TableB.Disqualifies WHEN 1 THEN 1 ELSE 0 END) OVER (PARTITION BY testedRowSet.TableA_ID)
FROM testedRowSet
LEFT JOIN TableB ON testedRowSet.TableA_ID = TableB.TableA_ID
AND testedRowSet.Value = TableB.Value
),
/* this row set will let us see whether a particular
TableA_ID must have a required value */
properties AS (
SELECT
TableA_ID,
MustHaveRequiredValues = MAX(CASE Required WHEN 1 THEN 1 ELSE 0 END)
FROM TableB
GROUP BY TableA_ID
),
/* this is where we are actually checking the previously
obtained aggregate values of Required and Disqualifies */
tested AS (
SELECT
aggregated.TableA_ID,
aggregated.Value,
...
FROM aggregated
LEFT JOIN properties ON aggregated.TableA_ID = properties.TableA_ID
WHERE aggregated.HasDisqualifyingValues = 0
AND (properties.TableA_ID IS NULL
OR properties.MustHaveRequiredValues = aggregated.DoesHaveRequiredValues)
)
SELECT * FROM tested

Resources