Column based intersect on two tables - sql-server

I'm trying to do something similar to a column based intersect on two tables.
The tables are:
LogTag: a log can have zero or more tags
MatchingRule: a matching rule consists of one or more tags that define the rule
A log can have zero or more rules matched to it. I will be passing in a MatchingRuleID and expecting to return all logs that match that rule.
Expected Result: A result set of matching LogIDs. Eg. passing in MatchingRuleID = 30 should return LogID 101. MatchingRuleID = 31 should return LogID 101 & 100.
Also, the LogTag table could have millions of rows so an efficient query is preferred.
The question: How to find all LogIDs that match with a specified rule definition?
Schema:
CREATE TABLE dbo.Tag
(
TagID INT,
TagName NVARCHAR(50)
)
INSERT INTO dbo.Tag (TagID, TagName)
VALUES (1, 'tag1'), (2, 'tag2'), (3, 'tag3')
CREATE TABLE dbo.LogTag
(
LogID INT,
TagID INT
)
INSERT INTO dbo.LogTag (LogID, TagID)
VALUES (100, 1), (101, 1), (101, 2), (101, 3), (101, 4), (102, 2), (102, 3)
CREATE TABLE dbo.MatchingRule
(
MatchingRuleID INT,
TagID INT
)
INSERT INTO dbo.MatchingRule (MatchingRuleID, TagID)
VALUES (30, 1), (30, 2), (30, 3), (31, 1)

Important to have the proper clustered index on the tables. I've put an alternative index in comments for #log_tag which might improve performance for large sets. Since I do not have the proper sample to test on, you will have to verify which is best.
CREATE TABLE #tag(tag_id INT PRIMARY KEY,tag_name NVARCHAR(50));
INSERT INTO #tag (tag_id,tag_name)VALUES
(1,'tag1'),(2,'tag2'),(3,'tag3');
-- Try this key for large sets: PRIMARY KEY(tag_id,log_id));
CREATE TABLE #log_tag(log_id INT,tag_id INT,PRIMARY KEY(log_id,tag_id))
INSERT INTO #log_tag (log_id,tag_id)VALUES
(100,1),(101,1),(101,2),(101,3),(101,4),(102,2),(102,3);
CREATE TABLE #matching_rule(matching_rule_id INT,tag_id INT,PRIMARY KEY(matching_rule_id,tag_id));
INSERT INTO #matching_rule(matching_rule_id,tag_id)VALUES
(30,1),(30,2),(30,3),(31,1);
DECLARE #matching_rule_id INT=31;
;WITH required_tags AS (
SELECT tag_id
FROM #matching_rule
WHERE matching_rule_id=#matching_rule_id
)
SELECT lt.log_id
FROM required_tags AS rt
INNER JOIN #log_tag AS lt ON
lt.tag_id=rt.tag_id
GROUP BY lt.log_id
HAVING COUNT(*)=(SELECT COUNT(*) FROM required_tags);
DROP TABLE #log_tag;
DROP TABLE #matching_rule;
DROP TABLE #tag;
The results are the ones in your Expected Result for both 30 & 31.
Execution plan for the index used in the script:

Try this query
Fiddle Here
DECLARE #InputMatchingRuleId INT = 30
;WITH CTE1
AS
(
SELECT DENSE_RANK() OVER(ORDER BY LT.TAGID) AS RN,LT.TagID,LT.LOGID
FROM MatchingRule MR INNER JOIN LogTag LT ON LT.TagID = MR.TagID
WHERE MatchingRuleID=#InputMatchingRuleId
),
CTE2
AS
(
SELECT 1 AS RN2,LOGID FROM CTE1 C1 WHERE C1.RN=1
UNION ALL
SELECT RN2+1 as RN2,C2.LOGID
FROM CTE1 C1 INNER JOIN CTE2 C2 ON C1.RN = C2.RN2+1 AND C1.LOGID = C2.LOGID
)
SELECT DISTINCT LOGID FROM CTE2
WHERE RN2>(CASE WHEN (SELECT MAX(RN2) FROM CTE2)=1 THEN 0 ELSE 1 END)

NOTE: This will only work with SQL Server 2008+
Here's the query I came up with:
DECLARE #RuleID INT
SELECT #RuleID = 30
SELECT LogID
FROM LogTag lt
INNER JOIN (
SELECT TagID, MatchingRuleID, COUNT(*) OVER (PARTITION BY MatchingRuleID) TagCount
FROM MatchingRule
) mr
ON lt.TagID = mr.TagID
AND mr.MatchingRuleID = #RuleID
GROUP BY LogID, TagCount
HAVING COUNT(*) = TagCount
So basically I match all TagID's within the specified matching rule and then once I know that all tags match I check to see if the count of tags from the MatchingRule table matches the (now filtered and grouped) count of tags from the LogTag table.

should be
; with rules as
(
select TagID, cnt = sum(count(*)) over()
from dbo.MatchingRule
where MatchingRuleID = #MatchingRuleID
group by TagID
)
select LogID
from rules r
inner join LogTag lt on r.TagID = lt.TagID
group by LogID, cnt
having count(*) = r.cnt

select l.LogID
from dbo.MatchingRule r
inner join dbo.LogTag l on l.TagID = r.TagID
where r.MatchingRuleID = 31
another approach is to identify all tags and then:
select l.LogID
from dbo.LogTag l
where exists(select 1 from #Tags t where t.TagID = l.TagID)

Related

SQL Server 2014: Pairing rows from 2 tables based on values coming from a third one

I have 2 tables that contains typed events over time.
The first table #T1 contains events that always comes before events in the second table #T2.
A third table #E contains records that defines for an event the values that comes in #T1 and #T2 respectively.
Sample data:
CREATE TABLE #T1
(
EventTimestamp DateTime,
VehicleId int,
EventId varchar(50),
EventValue varchar(50)
);
CREATE TABLE #T2
(
EventTimestamp DateTime,
VehicleId int,
EventId varchar(50),
EventValue varchar(50)
);
CREATE TABLE #E
(
EventId varchar(50),
FirstValue int,
LastValue varchar(50)
);
INSERT INTO #T1(EventTimestamp, VehicleId , EventId, EventValue)
VALUES (GETDATE(), 1, 'TwigStatus', '12'),
(GETDATE(), 2, 'SafeProtectEvent', '5')
INSERT INTO #T2(EventTimestamp, VehicleId , EventId, EventValue)
VALUES (DATEADD(second, 30, GETDATE()), 1, 'TwigStatus', '7'),
(DATEADD(second, 30, GETDATE()), 2, 'SafeProtectEvent', '6')
INSERT INTO #E(EventId, FirstValue, LastValue)
VALUES ('TwigStatus', '12', '7'),
('SafeProtectEvent', '5', '6')
DECLARE #EventId varchar(50) = 'TwigStatus';
DECLARE #FirstValue varchar(50) = '12';
DECLARE #LastValue varchar(50) = '7';
WITH ord AS
(
SELECT
first, last,
EventNr = ROW_NUMBER() OVER (ORDER BY first)
FROM
(SELECT
first = t1.EventTimestamp, last = t2.EventTimestamp,
rn = ROW_NUMBER() OVER (PARTITION BY t1.VehicleId ORDER BY t2.EventTimestamp)
FROM
#T1 t1
INNER JOIN
#T2 t2 ON t2.EventTimestamp > t1.EventTimestamp
AND t2.EventValue = #LastValue
WHERE
t1.EventId = #EventId AND t1.EventValue = #FirstValue) ids
WHERE
rn = 1
)
SELECT
t.VehicleId, o.first, o.last, t.EventId, t.EventValue
FROM
#T2 t
INNER JOIN
ord o ON t.EventTimestamp >= o.first
AND t.EventTimestamp <= o.last;
WHERE t.EventId = #EventId;
DROP TABLE #E;
DROP TABLE #T1;
DROP TABLE #T2;
Basically, for a record in table E you see that for eventID 'TwigStatus' the value '12' should come first in table T1 and then '7' should be next in table T2. There is a second event sequence that is defined.
The VehicleId column is the link between the tables T1 and T2.
I need to compute the delay between two matching events in table T1 and T2.
To start simple, I do not use the table E yet, I'm using variables that contains predefined values and I'm returning timestamps.
But the result of the query above;
VehicleId first last EventId EventValue
1 2020-09-15 16:00:37.670 2020-09-15 16:01:07.670 TwigStatus 7
2 2020-09-15 16:00:37.670 2020-09-15 16:01:07.670 SafeProtectEvent 6
Is not what I'm expecting because the EventId 'SafeProtectEvent' Should be filtered out for now.
So I have 2 questions:
How to avoid the second event to show with the actual query.
How to work with the content of the table E and get rid of variables to process event sequences.
EDIT 1: Problem 1 Solved by adding a restriction on the query (see above)
Update/new version below - now allows rows in T1 without matching rows in T2.
Based on discussion on comments below, I have updated this suggestion.
This code replaces everything from the DECLARE #EventId to the end of that SELECT statement.
Logic is as follows - for each row in T1 ...
Determine the time boundaries for that row in T1 (between its EventTimestamp, and the next EventTimestamp in T1 for that vehicle; or 1 day in the future if there is no next event)
Find the matching rows in T2, where 'matching' means a) same VehicleId, b) same EventId, c) EventValue is limited by possibilities in #E, and d) occurs within the time boundaries of T1
Find the first of these rows, if available
Calculate EventDelay as the times between the two timestamps
; WITH t1 AS
(SELECT VehicleId,
EventTimestamp,
EventId,
EventValue,
COALESCE(LEAD(EventTimestamp, 1) OVER (PARTITION BY VehicleID ORDER BY EventTimestamp), DATEADD(day, 1, getdate())) AS NextT1_EventTimeStamp
FROM #T1
),
ord AS
(SELECT t1.VehicleId,
t1.EventTimestamp AS first,
t2.EventTimestamp AS last,
t1.EventId,
t2.EventValue,
ROW_NUMBER() OVER (PARTITION BY t1.VehicleId, t1.EventTimestamp, t1.EventId ORDER BY t2.EventTimestamp) AS rn
FROM t1
LEFT OUTER JOIN #E AS e ON t1.EventId = e.EventId
AND t1.EventValue = e.FirstValue
LEFT OUTER JOIN #T2 AS t2 ON t1.VehicleID = t2.VehicleID
AND t1.EventID = t2.EventID
AND t2.eventId = e.EventId
AND t2.EventValue = e.LastValue
AND t2.EventTimestamp > t1.EventTimestamp
AND t2.EventTimestamp < NextT1_EventTimeStamp
)
SELECT VehicleId, first, last, EventId, EventValue,
DATEDIFF(second, first, last) AS EventDelay
FROM ord
WHERE rn = 1
The ever-growing DB<>fiddle has the latest updates, as well as original posts and previous suggestions.

Count how many times a word exist in column

I have a table 1:
CREATE TABLE table1
INSERT INTO table1 values('XYZ')
INSERT INTO table1 values('ABC')
INSERT INTO table1 values('XYZ~ABC~AAA')
INSERT INTO table1 values('123')
Then, I have string 'ABC~XYZ~123'. I need to split this string into each word by using SQL:
Select VALUE FROM STRING_SPLIT('ABC~XYZ~123','~')
The return is table2
ABC
XYZ
123
I want to count how many times each word in table2 existed in table 1
The expected output is
ABC|3
XYZ|2
123|1
Any ideas on this?
If I understand your case correctly, the next statement may help:
Text and table:
DECLARE #text varchar(100) = 'ABC~XYZ~123'
CREATE TABLE Data (
Id int,
[Text] varchar(100)
)
INSERT INTO Data
(Id, [Text])
VALUES
(1, 'XYZ'),
(2, 'ABC'),
(3, 'XYZ~ABC~AAA'),
(4, '123~ABC')
Statement:
SELECT t.[value] AS [Word], j.[Count]
FROM STRING_SPLIT(#text, '~') t
LEFT JOIN (
SELECT s.[value], COUNT(*) AS [Count]
FROM Data d
CROSS APPLY STRING_SPLIT(d.[Text], '~') s
GROUP BY s.[value]
) j ON t.[value] = j.[value]
Result:
-----------
Word Count
-----------
ABC 3
XYZ 2
123 1
Apart from the suggestions as in comment you can use Count() function as below. But storing in this format will give you difficulty for the extraction as well as in join with the other tables.
Select T1Value, Count(*) as [NoCount] from(
Select table1.Value as T1Value, Value FROM STRING_SPLIT('ABC~XYZ~123','~')
inner join table1 on Value = table1.Value
)a group by T1Value
Edit
CREATE TABLE table1(
TableValue varchar(max)
);
INSERT INTO table1 (TableValue) values ( 'XYZ');
INSERT INTO table1 ( TableValue) values ( 'ABC');
INSERT INTO table1 ( TableValue) values ( 'XYZ~ABC~AAA');
INSERT INTO table1 ( TableValue) values ( '123~ABC');
SELECT b.value
,Count(*)
FROM (
SELECT VALUE
FROM STRING_SPLIT('ABC~XYZ~123', '~')
) a
INNER JOIN (
SELECT *
FROM table1
CROSS APPLY STRING_SPLIT(TableValue, '~')
) b ON a.Value = b.Value
GROUP BY b.Value
Here is the given Live Demo on db <> fiddle
Setup
create table STRINGS (ID int, STRINGS varchar(max));
insert into STRINGS (ID, STRINGS) values (1, 'XYZ');
insert into STRINGS (ID, STRINGS) values (1, 'ABC');
insert into STRINGS (ID, STRINGS) values (1, 'XYZ~ABC~AAA');
insert into STRINGS (ID, STRINGS) values (1, '123~ABC');
declare #VALUES varchar(max) = 'XYZ~ABC~123';
Calculation :
select V1.VALUE, count(STRINGS.ID)
from string_split(#VALUES,'~') V1
cross join STRINGS
outer apply string_split(STRINGS.STRINGS,'~') V2
where V2.VALUE = V1.VALUE
group by V1.VALUE
Result
-----------
Value Num
-----------
ABC 3
XYZ 2
123 1
Live exemple :
https://dbfiddle.uk/?rdbms=sqlserver_2017&fiddle=15b95efcf69ea98fafbb7dda1c624551

Select all records for customers where mindate in 2015

I want to select all records for customers whose first order is from 2015. I want any orders they placed after 2015 too, but I DON'T want the records for customers whose first order was in 2016. I am ultimately trying to find the percentage of people who order more than twice, but I want to exclude the customers who were new in 2016.
This doesn't work because 'mindate' is an invalid column name but I'm not sure why or how else to try it.
Select
od.CustomerID, OrderID, OrderDSC, OrderDTS
From
OrderDetail OD
Join
(Select
OrderID, Min(orderdts) as mindate
From
OrderDetail
Where
mindate Between '2015-1-1' and '2015-12-31'
Group By Orderid) b on od.OrderID = b.OrderID
Because execution phases - it's seqency how is qry evaluated and by engine. In where clause your mindate not yet exists.
You can change mindate by orderdts:
select OrderID, min(orderdts) as mindate
from OrderDetail
where orderdts between '2015-1-1' and '2015-12-31'
group by Orderid
Second option is to use having statement - it's evaluated after group by.
What I di was select the distinct CustomerIDs that fall in between your daterange and did a left join with the table so it filters out anyone that doesn't fall in between your daterange.
SELECT * FROM
(Select DISTINCT(CustomerID) as CustomerID
FROM OrderDetail WHERE OrderDTS between '2015-1-1' AND '2015-12-31') oIDs
LEFT JOIN
OrderDetail OD
ON oIDs.CustomerID = OD.CustomerID
Try using the EXISTS clause. It is basically a sub-query. Below is an example you should be able to adapt.
create table Test (Id int, aDate datetime)
insert Test values (1,'04/04/2014')
insert Test values (1,'05/05/2015')
insert Test values (1,'06/06/2016')
insert Test values (2,'04/30/2016')
insert Test values (3,'02/27/2014')
select t.* from Test t
where
aDate>='01/01/2015'
and exists(select * from Test x where x.Id=t.Id and x.aDate >='01/01/2015' and x.aDate<'01/01/2016')
I don't know the orderdts data type but if it is datetime orders on 2015-12-31 will not be included (unless the order date is 2015-12-31 00:00:00.000. Note how this will skip the first record:
DECLARE #orders TABLE (CustomerID INT, orderDate DATETIME);
INSERT #orders VALUES (1, '2015-12-31 00:00:01.000'), (1, '2015-12-30'), (2, '2015-01-04');
SELECT * FROM #orders WHERE orderDate BETWEEN '2015-01-01' AND '2015-12-31';
In this case you would want the WHERE clause filter to look like:
WHERE orderDate BETWEEN '2015-01-01 00:00:00.000' AND '2015-12-31 23:59:59.999';
Or
WHERE CAST(orderDate AS date) BETWEEN '2015-01-01' AND '2015-12-31';
(the first example will almost certainly perform better).
Now, using this sample data:
-- Sample data
CREATE TABLE #LIST (LISTName varchar(10) NOT NULL);
INSERT #LIST
SELECT TOP (100) LEFT(newid(), 8)
FROM sys.all_columns a, sys.all_columns b;
-- You will probably want LISTName to be indexed
CREATE NONCLUSTERED INDEX nc_LISTName ON #LIST(LISTName);
You can implement Paul's solution like this:
DECLARE #LIST_Param varchar(8) = 'No List';
SELECT LISTName
FROM
(
SELECT distinct LISTName
FROM #LIST
UNION ALL
SELECT 'No List'
WHERE (SELECT COUNT(DISTINCT LISTName) FROM #LIST) < 1000000
) Distinct_LISTName
WHERE (#LIST_Param = 'No List' or #LIST_Param = LISTName);
Alternatively you can do this:
DECLARE #LIST_Param varchar(8) = 'No List';
WITH x AS
(
SELECT LISTName, c = COUNT(*)
FROM #LIST
WHERE (#LIST_Param = 'No List' or #LIST_Param = LISTName)
GROUP BY LISTName
),
c AS (SELECT s = SUM(c) FROM x)
SELECT LISTName
FROM x CROSS JOIN c
WHERE s < 1000000;

SQL Server: How do I get the highest value not set of an int column?

Let's take an example. These are the rows of the table I want get the data:
The column I'm talking about is the reference one. The user can set this value on the web form, but the system I'm developing must suggest the lowest reference value still not used.
As you can see, the smallest value of this column is 35. I could just take the smaller reference and sum 1, but, in that case, the value 36 is already used. So, the value I want is 37.
Is there a way to do this without a loop verification? This table will grow so much.
This is for 2012+
DECLARE #Tbl TABLE (id int, reference int)
INSERT INTO #Tbl
( id, reference )
VALUES
(1, 49),
(2, 125),
(3, 35),
(4, 1345),
(5, 36),
(6, 37)
SELECT
MIN(A.reference) + 1 Result
FROM
(
SELECT
*,
LEAD(reference) OVER (ORDER BY reference) Tmp
FROM
#Tbl
) A
WHERE
A.reference - A.Tmp != -1
Result: 37
Here is yet another place where the tally table is going to prove invaluable. In fact it is so useful I keep a view on my system that looks like this.
create View [dbo].[cteTally] as
WITH
E1(N) AS (select 1 from (values (1),(1),(1),(1),(1),(1),(1),(1),(1),(1))dt(n)),
E2(N) AS (SELECT 1 FROM E1 a cross join E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a cross join E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS
(
SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
)
select N from cteTally
Next of course we need some sample data and table to hold it.
create table #Something
(
id int identity
, reference int
, description varchar(10)
)
insert #Something (reference, description)
values (49, 'data1')
, (125, 'data2')
, (35, 'data3')
, (1345, 'data4')
, (36, 'data5')
, (7784, 'data6')
Now comes the magic of the tally table.
select top 1 t.N
from cteTally t
left join #Something s on t.N = s.reference
where t.N >= (select MIN(reference) from #Something)
and s.id is null
order by t.N
This is ugly, but should get the job done:
select
top 1 reference+1
from
[table]
where
reference+1 not in (select reference from [table])
order by reference
I used a table valued express to get the next value. I first left outer joined the table to itself (shifting the key in the join by +1). I then looked only at rows that had no corresponding match (b.ID is null). The minimum a.ReferenceID + 1 gives us the answer we are looking for.
create table MyTable
(
ID int identity,
Reference int,
Description varchar(20)
)
insert into MyTable values (10,'Data')
insert into MyTable values (11,'Data')
insert into MyTable values (12,'Data')
insert into MyTable values (15,'Data')
-- Find gap
;with Gaps as
(
select a.Reference+1 as 'GapID'
from MyTable a
left join MyTable b on a.Reference = b.Reference-1
where b.ID is null
)
select min(GapID) as 'NewReference'
from Gaps
NewReference
------------
13
I hope the code was clearer than my description.
CREATE TABLE #T(ID INT , REFERENCE INT, [DESCRIPTION] VARCHAR(50))
INSERT INTO #T
SELECT 1,49 , 'data1' UNION ALL
SELECT 2,125 , 'data2' UNION ALL
SELECT 3,35 , 'data3' UNION ALL
SELECT 4,1345, 'data4' UNION ALL
SELECT 5,36 , 'data5' UNION ALL
SELECT 6,7784, 'data6'
SELECT TOP 1 REFERENCE + 1
FROM #T T1
WHERE
NOT EXISTS
(
SELECT 1 FROM #T T2 WHERE T2.REFERENCE = T1.REFERENCE + 1
)
ORDER BY T1.REFERENCE
--- OR
SELECT MIN(REFERENCE) + 1
FROM #T T1
WHERE
NOT EXISTS
(
SELECT 1 FROM #T T2 WHERE T2.REFERENCE = T1.REFERENCE + 1
)
How about using a Tally table. The following illustrates the concept. It would be better to use a persisted numbers table as opposed to the cte however the code below illustrates the concept.
For further reading as to why you should use a persisted table, check out the following link: sql-auxiliary-table-of-numbers
DECLARE #START int = 1, #END int = 1000
CREATE TABLE #TEST(UsedValues INT)
INSERT INTO #TEST(UsedValues) VALUES
(1),(3),(5),(7),(9),(11),(13),(15),(17)
;With NumberSequence( Number ) as
(
Select #start as Number
union all
Select Number + 1
from NumberSequence
where Number < #end
)
SELECT MIN(Number)
FROM NumberSequence n
LEFT JOIN #TEST t
ON n.Number = t.UsedValues
WHERE UsedValues IS NULL
OPTION ( MAXRECURSION 1000 )
You could try using a descending order:
SELECT DISTINCT reference
FROM `Resultsados`
ORDER BY `reference` ASC;
As far as I know, there is no way to do this without a loop. To prevent multiple values from returning be sure to use DISTINCT.

TSql equality on groups of rows

I have a table that contains information on groups. There can be any number of members in a group. There is a group identifier and then an element identifier. I want to be able to in a single statement determine whether or not a given set exists in the table
#groupTable is an example of the data that already exists in the database
#inputData is the data that I want to see if it already exists in #groupTable
declare #groupData table
(
groupIdentifier int,
elementIdentifier uniqueidentifier
)
insert into #groupData values
(1, 'dfce40b1-3719-4e4c-acfa-65f728677700'),
(1, '89e7e6be-cee8-40a7-8135-a54659e0d88c')
declare #inputData table
(
tempGroupIdentifier int,
elementIdentifier uniqueidentifier
)
insert into #inputData values
(42, 'dfce40b1-3719-4e4c-acfa-65f728677700'),
(42, '89e7e6be-cee8-40a7-8135-a54659e0d88c'),
(55, 'dfce40b1-3719-4e4c-acfa-65f728677700'),
(55, '2395a42c-94f4-4cda-a773-221b26ea5e44'),
(55, 'f22db9df-a1f4-4078-b74c-90e34376eff6')
Now I want to run a query that will show the relationship of the sets, showing which groupIdentifier is associated with which tempGroupIdentifier. If there is no matching set then I need to know that too.
desired output:
groupIdentifier, tempGroupIdentifier
1, 42
null, 55
Does anyone any suggestions on how to approach this problem?
I could probably pivot the rows and concat all elementIdentifiers into a giant string for each group that then do equality on, but that doesn't seem like a good solution.
SELECT DISTINCT
T1.tempgroupIdentifier, T2.GroupIdentifier
FROM
(
SELECT
COUNT(*) OVER (PARTITION BY tempgroupIdentifier) AS GroupCount,
ROW_NUMBER() OVER (PARTITION BY tempgroupIdentifier ORDER BY elementIdentifier) AS GroupRN,
tempgroupIdentifier, elementIdentifier
FROM
#inputData
) T1
LEFT JOIN
(
SELECT
COUNT(*) OVER (PARTITION BY GroupIdentifier) AS GroupCount,
ROW_NUMBER() OVER (PARTITION BY GroupIdentifier ORDER BY elementIdentifier) AS GroupRN,
GroupIdentifier, elementIdentifier
FROM
#groupData
) T2 ON T1.elementIdentifier = T2.elementIdentifier AND
T1.GroupCount = T2.GroupCount AND
T1.GroupRN = T2.GroupRN
Edit: this will also deal with the same value in a given set
SELECT
(
CASE WHEN matchCount = gdCount AND matchCount = idCount
THEN groupIdentifier
ELSE NULL
END) groupIdentifier,
cj.tempGroupIdentifier
FROM
(
SELECT gd.groupIdentifier, id.tempGroupIdentifier, COUNT(1) matchCount
FROM #groupData gd
CROSS JOIN #inputData id
WHERE id.elementIdentifier = gd.elementIdentifier
GROUP BY gd.groupIdentifier, id.tempGroupIdentifier) as cj
CROSS APPLY (SELECT COUNT(groupIdentifier) from #groupData gdca WHERE gdca.groupIdentifier = cj.groupIdentifier) as gdc(gdCount)
CROSS APPLY (SELECT COUNT(tempGroupIdentifier) from #inputData idca WHERE idca.tempGroupIdentifier = cj.tempGroupIdentifier) as idc(idCount)

Resources