Migrate time when column was last changed from history table? - sql-server

I would like to know when UserId was changed to the current value.
Say we got a table Foo:
Foo
Id | UserId
---+-------
1 | 1
2 | 2
Now I would need to be able to execute a query like:
SELECT UserId, UserIdModifiedAt FROM Foo
Luckily I have logged all the changes in history to table FooHistory:
FooHistory
Id | FooId | UserId | FooModifiedAt
---|-------+--------+---------------
1 | 1 | NULL | 1.1.2019 02:00
2 | 1 | 2 | 1.1.2019 02:01
3 | 1 | 1 | 1.1.2019 02:02
4 | 1 | 1 | 1.1.2019 02:03
5 | 2 | 1 | 1.1.2019 02:04
6 | 2 | 2 | 1.1.2019 02:05
7 | 2 | 2 | 1.1.2019 02:06
So all the data we need is available (above the user of Foo #1 was last modified 02:02 and the user of Foo #2 02:05). We will add a new column UserIdModifiedAt to Foo
Foo v2
Id | UserId | UserIdModifiedAt
---+--------|-----------------
1 | 1 | NULL
2 | 2 | NULL
... and set its values using a trigger. Fine. But how to migrate the history? What script would fill UserIdModifiedAt for us?
See an example of the table structure:
-- Recreate both demo tables from scratch so the script can be re-run.
DROP TABLE IF EXISTS [Foo];
DROP TABLE IF EXISTS [FooHistory];

-- Current state: one row per Foo.
CREATE TABLE [Foo]
(
    [Id]               INT NOT NULL,
    [UserId]           INT,
    [UserIdModifiedAt] DATETIME2,  -- Automatically updated based on a trigger
    CONSTRAINT [PK_Foo] PRIMARY KEY ([Id])
);

-- Change log: one row per logged change to a Foo row.
CREATE TABLE [FooHistory]
(
    [Id]            INT IDENTITY NOT NULL,
    [FooId]         INT,
    [UserId]        INT,
    -- Defaults to the UTC time at which the history row is written.
    [FooModifiedAt] DATETIME2 NOT NULL
        CONSTRAINT [DF_FooHistory_FooModifiedAt] DEFAULT (SYSUTCDATETIME()),
    CONSTRAINT [PK_FooHistory] PRIMARY KEY ([Id])
);
GO
-- Maintains [Foo].[UserIdModifiedAt] and logs every insert/update to [FooHistory].
--
-- The change test is done per row (inserted joined to deleted on [Id]) rather
-- than by comparing the sets of UserId values for the whole statement, so:
--   * a multi-row UPDATE only stamps the rows whose UserId really changed;
--   * swapping UserId values between rows is still detected as a change.
-- Newly inserted rows (no matching row in deleted) are always stamped.
-- NOTE(review): an UPDATE that changes [Id] itself has no match in deleted and
-- is therefore treated like a new row.
CREATE TRIGGER [trgFoo]
ON [dbo].[Foo]
AFTER INSERT, UPDATE
AS
BEGIN
    SET NOCOUNT ON;

    UPDATE f
    SET [UserIdModifiedAt] = SYSUTCDATETIME()
    FROM [dbo].[Foo] AS f
    INNER JOIN inserted AS i
        ON i.[Id] = f.[Id]
    LEFT JOIN deleted AS d
        ON d.[Id] = i.[Id]
    WHERE d.[Id] IS NULL
       -- NULL-safe "is different": EXCEPT treats two NULLs as equal.
       OR EXISTS (SELECT i.[UserId] EXCEPT SELECT d.[UserId]);

    -- Every affected row is logged, whether or not UserId changed.
    INSERT INTO [FooHistory] ([FooId], [UserId])
    SELECT [Id], [UserId]
    FROM inserted;
END
GO
/* Test data: a single Foo row taken through UserId NULL -> NULL -> 1 -> 1. */
INSERT INTO [Foo] ([Id], [UserId])
VALUES (1, NULL);
WAITFOR DELAY '00:00:00.010';

-- The UPDATEs below deliberately have no WHERE clause: they touch every row in [Foo].
UPDATE [Foo]
SET [UserId] = NULL;
WAITFOR DELAY '00:00:00.010';

UPDATE [Foo]
SET [UserId] = 1;
WAITFOR DELAY '00:00:00.010';

UPDATE [Foo]
SET [UserId] = 1;
WAITFOR DELAY '00:00:00.010';

-- Inspect the current state and the logged history.
SELECT * FROM [Foo];
SELECT * FROM [FooHistory];
Related question: Select first row in each GROUP BY group?.

If I understand your question right, it looks like you have already answered it yourself by the way you created your trigger on dbo.Foo.
It looks like the UserIdModifiedAt is modified the first time the UserId changes and not modified when it does not change, in which case your answer is simply dbo.Foo.UserIdModifiedAt.
If you did not mean to write this trigger like that, I think it is possible to retrieve that value from FooHistory but it's much more complicated.
The code below might do what I think you were asking for
-- For each Foo row, returns the history entry at which UserId last changed
-- to the value Foo currently holds.
;WITH FooHistoryRanked
AS (
    -- Number each Foo's history chronologically; 1 = first change to that Foo record.
    -- Id is a tie-breaker so entries sharing a timestamp still get a stable order.
    SELECT FH.Id, FH.FooId, FH.FooModifiedAt, FH.UserId
         , RankedASC = ROW_NUMBER() OVER (PARTITION BY FH.FooId
                                          ORDER BY FH.FooModifiedAt ASC, FH.Id ASC)
    FROM [FooHistory] AS FH
)
, Matches AS
(
    SELECT FHR1.Id, FHR1.FooId, FHR1.FooModifiedAt, FHR1.UserId, FHR1.RankedASC
         , PreviousUserId        = FHR2.UserId
         , PreviousFooModifiedAt = FHR2.FooModifiedAt
         , PreviousHistoryId     = FHR2.Id
    FROM FooHistoryRanked AS FHR1
    -- join on Foo filters on current value (NULL-safe comparison)
    INNER JOIN [Foo] AS F
        ON F.Id = FHR1.FooId
       AND (   FHR1.UserId = F.UserId
            OR (FHR1.UserId IS NULL AND F.UserId IS NULL)
           )
    -- Find the immediately preceding entry, but only if it held a different value;
    -- when the previous value was the same, the Previous* columns stay NULL.
    LEFT JOIN FooHistoryRanked AS FHR2
        ON FHR2.FooId = FHR1.FooId
       AND FHR2.RankedASC = FHR1.RankedASC - 1 -- previous change
       AND (   FHR2.UserId <> FHR1.UserId
            OR (FHR2.UserId IS NULL AND FHR1.UserId IS NOT NULL)
            OR (FHR2.UserId IS NOT NULL AND FHR1.UserId IS NULL)
           )
)
, MatchesRanked AS
(
    -- select the modifications that had a different value before OR that are the first entry;
    -- rank them newest-first using the same chronological order as above
    SELECT Id, FooId, FooModifiedAt, UserId, RankedASC
         , PreviousUserId, PreviousFooModifiedAt, PreviousHistoryId
         , MatchRanked = ROW_NUMBER() OVER (PARTITION BY FooId ORDER BY RankedASC DESC)
    FROM Matches
    WHERE RankedASC = 1 OR PreviousFooModifiedAt IS NOT NULL
)
SELECT Id, FooId, FooModifiedAt, UserId, RankedASC
     , PreviousUserId, PreviousFooModifiedAt, PreviousHistoryId
     , MatchRanked
FROM MatchesRanked
WHERE MatchRanked = 1 -- just get the last qualifying record
ORDER BY FooId, FooModifiedAt DESC, UserId;
PS:
1) Performance could be a problem if these tables were big...
2) You could probably use LAG instead of the LEFT JOIN, but I am just used to doing things this way...

Related

SQL SERVER update or insert after left join

I have a Table Animals
Id | Name | Count | -- (other columns not relevant)
1 | horse | 11
2 | giraffe | 20
I want to try to insert or update values from a CSV string
Is it possible to do something like the following in 1 query?
-- Sketch of the desired "update or insert" in one statement.
-- The CTE left-joins a literal list of names to animals, so names with no
-- matching animals row come back with NULL animals columns.
;with results as
(
select * from
(
values ('horse'), ('giraffe'), ('lion')
)
animal_csv(aName)
left join animals on
animals.[Name] = animal_csv.aName
)
-- The UPDATE targets the CTE; the filter below restricts it to matched names.
-- NOTE(review): only `results` is named in this UPDATE, so the `animals.` qualifier
-- below refers to an alias defined inside the CTE - confirm whether it resolves here.
update results
set
[Count] = 1 + animals.[Count]
-- various other columns are set here
where Id is not null
-- The commented-out lines below describe the wanted insert branch for unmatched names.
--else
--insert into results ([Name], [Count]) values (results.aName, 1)
-- (essentially Where id is null)
It looks like what you're looking for is a table variable or temporary table rather than a common table expression.
If I understand your problem correctly, you are building a result set based on data you're getting from a CSV, merging it by incrementing values, and then returning that result set.
As I read your code, it looks as if your results would look like this:
aName | Id | Name | Count
horse | 1 | horse | 12
giraffe | 2 | giraffe | 21
lion | | |
I think what you're looking for in your final result set is this:
Name | Count
horse | 12
giraffe | 21
lion | 1
First, you can get from your csv and table to a resultset in a single CTE statement:
-- Returns one row per CSV name: the existing count plus one for known animals,
-- or 1 for names that have no row in animals yet.
;WITH animal_csv (aName) AS
(
    SELECT v.aName
    FROM (VALUES ('horse'), ('giraffe'), ('lion')) AS v (aName)
)
SELECT
    ISNULL(animals.Name, animal_csv.aName) AS Name,
    CASE
        WHEN animals.[Count] IS NOT NULL THEN 1 + animals.[Count]
        ELSE 1
    END AS [Count]
FROM animal_csv
LEFT JOIN animals
    ON animals.Name = animal_csv.aName
Or, if you want to build your resultset using a table variable:
-- Table variable that holds the merged result set.
-- (Table variables take the @ sigil; # denotes a temporary table and is not valid after DECLARE.)
DECLARE @Results TABLE
(
    Name    VARCHAR(30)
  , [Count] INT
);

-- One row per CSV name: existing count + 1, or 1 when the name is not in animals yet.
;WITH animal_csv AS (SELECT * FROM (VALUES ('horse'), ('giraffe'), ('lion')) a (aName))
INSERT INTO @Results (Name, [Count])
SELECT ISNULL(animals.Name, animal_csv.aName) AS Name
     , CASE WHEN animals.[Count] IS NULL THEN 1 ELSE 1 + animals.[Count] END AS [Count]
FROM animal_csv
LEFT JOIN animals
    ON animals.Name = animal_csv.aName;

SELECT Name, [Count]
FROM @Results;
Or, if you just want to use a temporary table, you can build it like this (temp tables are deleted when the connection is released/closed or when they're explicitly dropped):
-- Same merge as above, materialised into the temporary table #results
-- (SELECT ... INTO creates the table from the shape of the result set).
;WITH animal_csv (aName) AS
(
    SELECT v.aName
    FROM (VALUES ('horse'), ('giraffe'), ('lion')) AS v (aName)
)
SELECT
    ISNULL(animals.Name, animal_csv.aName) AS Name,
    CASE
        WHEN animals.[Count] IS NOT NULL THEN 1 + animals.[Count]
        ELSE 1
    END AS [Count]
INTO #results
FROM animal_csv
LEFT JOIN animals
    ON animals.Name = animal_csv.aName

-- Read back what was stored.
SELECT * FROM #results

How to join multiple tables , one table as dynamic columns and the other table as the values of these columns

I'm new to SQL Server pivot, and i'm trying to solve a problem where i need to output the following tables into one table that includes the values based on one table's columns
Here's my tables
ContactGroup
Title ID
---------- -----------
Group A 1
ContactsInGroups
ContactId GroupId
----------- -----------
1 1
2 1
3 1
ContactVariables
ID Name GroupId Order
----------- ---------- ----------- ------
1 Invoice 1 1
2 Due Date 1 1
ContactsVariablesValues
ContactVariableId ContactId Value
----------------- ----------- -----
1 1 600
Desired output
GroupId ContactId Invoice Due Date
----------- ----------- ----------- -----------
1 1 600 NULL
1 2 NULL NULL
1 3 NULL NULL
Ali, here is an example that will at least get you started. You can run the following in SSMS.
Create some table variables and insert your sample data.
-- Sample data held in table variables (declarations and query must run in the same batch).
DECLARE @ContactGroup TABLE ( id INT, title VARCHAR(50) );
INSERT INTO @ContactGroup ( id, title ) VALUES ( 1, 'Group A' );
DECLARE @ContactsInGroup TABLE ( ContactID INT, GroupID INT );
INSERT INTO @ContactsInGroup ( ContactID, GroupID ) VALUES ( 1, 1 ), ( 2, 1 ), ( 3, 1 );
DECLARE @ContactVariables TABLE ( id INT, [name] VARCHAR(50), GroupID INT, [Order] INT );
INSERT INTO @ContactVariables ( id, [name], GroupID, [Order] ) VALUES ( 1, 'Invoice', 1, 1 ), ( 2, 'Due Date', 1, 1 );
DECLARE @ContactsVariablesValues TABLE ( ContactVariableID INT, ContactID INT, [value] INT );
INSERT INTO @ContactsVariablesValues ( ContactVariableID, ContactID, [value] ) VALUES ( 1, 1, 600 );

-- Then query the data as follows:
-- One output row per (group, contact); each variable of the group becomes a column.
SELECT
    ContactGroup.id AS GroupID
    , ContactsInGroup.ContactID
    , ContactVars.Invoice
    , ContactVars.[Due Date]
FROM @ContactGroup AS ContactGroup
INNER JOIN @ContactsInGroup AS ContactsInGroup
    ON ContactGroup.id = ContactsInGroup.GroupID
OUTER APPLY (
    SELECT
        pvt.[Invoice], pvt.[Due Date]
    FROM (
        SELECT
            Vars.[name]
            , Vals.[value]
        FROM @ContactVariables AS Vars
        -- The contact filter belongs in ON: placed in WHERE it would discard
        -- variables that have no value for this contact.
        LEFT JOIN @ContactsVariablesValues AS Vals
            ON Vars.id = Vals.ContactVariableID
            AND Vals.ContactID = ContactsInGroup.ContactID
        WHERE
            -- Correlate to the outer group instead of hard-coding group 1.
            Vars.GroupID = ContactGroup.id
    ) AS ContactData
    PIVOT (
        MIN( [value] )
        FOR [name] IN (
            [Invoice], [Due Date]
        )
    ) AS pvt
) AS ContactVars
ORDER BY
    ContactGroup.id, ContactsInGroup.ContactID;
Which returns:
+---------+-----------+---------+----------+
| GroupID | ContactID | Invoice | Due Date |
+---------+-----------+---------+----------+
| 1 | 1 | 600 | NULL |
| 1 | 2 | NULL | NULL |
| 1 | 3 | NULL | NULL |
+---------+-----------+---------+----------+
Things to note
The "magic" here is in the OUTER APPLY. This allows us to query a subset of data based on the primary data returned, in this case the GroupID and ContactID. OUTER APPLY will also return rows with NULL values like you desire.
You're going to have some challenges here, namely that to use a PIVOT as shown in my example, you will need to know all the values ( Invoice, Due Date, etc... ) that will become column headers. Based on your setup, I'm thinking this may not be the case, so you will be forced to resort to a technique that creates and executes a dynamic PIVOT statement for you within the OUTER APPLY.
You also might consider using a TABLE VALUED FUNCTION that does the PIVOT work that can then be JOINed on vs. an OUTER APPLY.
You have several options, but hopefully this helps jumpstart your thinking.

Update Table1 adding values from Table2

Table1
Columns PK_Table1 Name | DoYouGoToSchool |DoYouhaveACar |DoYouWorkFullTime | DoYouWorkPartTime | Score
1 joe Yes Yes No Yes
2 amy No Yes Yes No
Table2
Columns Pk_Table2 |Question | Answer(Bit Column) |Value
1 DoYouGoToSchool True 3
2 DoYouhaveACar True 2
3 DoYouWorkFullTime True 4
4 DoYouWorkPartTime True 2
Based on the information from Table2 What i need to do is UPDATE Table1 ColumnName Score by summing up the Value from Table2 with the information he has provided.
for example i expect the Score column in table1 to be 7 for record 1
and 5 for record 2
Here is a query to play with
-- Rebuild the two scratch tables used by the example.
IF OBJECT_ID('tempdb..#Table2') IS NOT NULL DROP TABLE #Table2
GO
IF OBJECT_ID('tempdb..#Table1') IS NOT NULL DROP TABLE #Table1
GO
-- One row per person; the DoYou... columns hold 'Yes'/'No' answers.
create table #Table1
(
    PK_Table1 int,
    Name Varchar(50),
    DoYouGoToSchool Varchar(8),
    DoYouhaveACar Varchar(8),
    DoYouWorkFullTime Varchar(8),
    DoYouWorkPartTime Varchar(8),
    Score INT NULL
)
-- One row per question with the score value it contributes.
create table #Table2
(
    PK_Table2 int,
    Questions Varchar(50),
    Answer BIT NOT NULL DEFAULT(0),
    VALUE INT NULL
)
-- The key columns are supplied explicitly; they are plain INT columns (not IDENTITY),
-- so omitting them would leave every key NULL.
INSERT INTO #Table1 (PK_Table1,Name,DoYouGoToSchool,DoYouhaveACar,DoYouWorkFullTime,DoYouWorkPartTime)
VALUES (1,'joe','Yes','Yes','No','Yes'), (2,'amy','NO','Yes','Yes','No')
INSERT INTO #Table2(PK_Table2,Questions,Answer,VALUE)
VALUES (1,'DoYouGoToSchool','True',3 ),(2,'DoYouhaveACar','True',2 ),(3,'DoYouWorkFullTime','True',4 ),(4,'DoYouWorkPartTime','True',2 )
This is what is missing from the answer below, which tells you to add a new FK constraint to Table2: inserting the data into the table with the new FK column.
-- Unpivots the four answer columns of #Table1 into one #Table2 row per (person, question).
-- NOTE(review): assumes the FK_Table1 column has already been added to #Table2 - confirm.
-- The 'Yes'/'No' text is mapped to 1/0 explicitly because it cannot be converted to BIT directly.
insert into #Table2 (FK_Table1, Questions, Answer)
select
    t.PK_Table1,
    t1.cols,
    case when upper(t1.colsval) = 'YES' then 1 else 0 end
from #Table1 t
cross apply (
    values
        ('DoYouGoToSchool',   t.DoYouGoToSchool),
        ('DoYouhaveACar',     t.DoYouhaveACar),
        ('DoYouWorkFullTime', t.DoYouWorkFullTime),
        ('DoYouWorkPartTime', t.DoYouWorkPartTime)
) t1 (cols, colsval);
First create a relation between these two tables and add Primary key of Table1 in Table2 as a foreign key so your Table2 becomes:
Table2 Columns:
FK_Table1 |Pk_Table2 |Question | Answer(Bit Column) |Value
1 1 DoYouGoToSchool True 3
1 2 DoYouhaveACar True 2
1 3 DoYouWorkFullTime True 4
1 4 DoYouWorkPartTime True 2
You can add in table by using this Query:
-- Adds the linking column and its foreign key in one statement.
-- T-SQL uses a single ADD followed by a comma-separated list; the constraint is
-- named so it can be found and dropped later.
-- NOTE(review): the referenced column Table1.PK_Table1 must be a PRIMARY KEY or UNIQUE - confirm.
ALTER TABLE Table2
ADD FK_Table1 INT NULL,
    CONSTRAINT FK_Table2_Table1 FOREIGN KEY (FK_Table1) REFERENCES Table1 (PK_Table1);
means that it is only for that person whose PK_Table1 = 1
Then you can extract his score from this query:
SELECT Sum(Value) FROM Table2 WHERE FK_Table1 = 1;
And then update query:
UPDATE Table1
SET score = (enter here the returned score from above query)
WHERE PK_Table1 = 1;
Or you can do in a single query like this:
-- Sets the score of person 1 to the sum of the Table2 values linked to person 1.
UPDATE t1
SET score = totals.total_value
FROM Table1 AS t1
CROSS JOIN (
    -- A scalar aggregate always yields exactly one row (NULL when nothing matches).
    SELECT SUM(Value) AS total_value
    FROM Table2
    WHERE FK_Table1 = 1
) AS totals
WHERE t1.PK_Table1 = 1;
You will need to add another table. This table will be your relational table. It can be called Table1_Table2 with three columns. The first column will be the primary key for the table. The next column will be the primary key of Table1 and the third column will be the primary key for Table 2.
When an instance of Table2 occurs that relates with Table1, insert a record into Table1_Table2 that relates the two tables together with each others primary key. Then a query can be done on the relational table, Table1_Table2 that allows you to sum the relationships.
|Table1_Table2 |
| PK | PK_Table1 | PK_Table2 |
| 1 | 1 | 1 |
| 2 | 1 | 3 |
| 3 | 2 | 1 |
| 4 | 2 | 4 |
As we can see, we can now perform an update on Table1
-- Sets every person's score to the sum of the Table2 values linked to them
-- through the Table1_Table2 relation table (NULL when a person has no links).
-- T-SQL form: the alias is declared in FROM and used as the UPDATE target.
UPDATE A
SET Score = (
    SELECT SUM(B.Value)
    FROM Table1_Table2 AS C
    INNER JOIN Table2 AS B
        ON B.PK_Table2 = C.PK_Table2
    WHERE C.PK_Table1 = A.PK_Table1
)
FROM Table1 AS A;

SQL Server 2016 - Inserting remaining rows into a table leading to duplicates of existing rows

I have 4 tables: People Status, People, Codes and PeopleStatusCodes with the following schemas:
People:
[ID] INT IDENTITY (1, 1) CONSTRAINT [PK_People_ID] PRIMARY KEY,
[PersonCode] VARCHAR(MAX) NOT NULL,
[FirstName] VARCHAR(MAX) NOT NULL,
[LastName] VARCHAR(MAX) NOT NULL
PeopleStatus:
[ID] INT IDENTITY (1, 1) CONSTRAINT [PK_PeopleStatus_ID] PRIMARY KEY,
[PeopleID] VARCHAR(MAX) NOT NULL FOREIGN KEY REFERENCES [People]([ID]),
[Status] INT NOT NULL
Codes:
[ID] INT IDENTITY (1, 1) CONSTRAINT [PK_Codes_ID] PRIMARY KEY,
[CodeNumber] VARCHAR(MAX) NOT NULL,
[Name] VARCHAR(MAX) NOT NULL
PeopleStatusCodes:
[ID] INT IDENTITY (1, 1) CONSTRAINT [PK_PeopleStatusCodes_ID] PRIMARY KEY,
[PeopleStatusID] INT NOT NULL FOREIGN KEY REFERENCES [PeopleStatus]([ID]),
[CodeID] INT NOT NULL FOREIGN KEY REFERENCES [Codes]([ID]),
[Result] INT NOT NULL, --success = 1, fail=0
I am attempting to insert 3 rows of data into the PeopleStatusCodes table - 1 row where the Result = 1, and the remaining rows where Result = 0.
The code below declares 2 table variables - one to store the Person's PeopleStatus ID (@peopleStatus), the other to store the data (@data). It then checks that the Person does not already have an entry in the PeopleStatus table - if it does not, a new entry in the PeopleStatus table is created, and that ID is inserted into @peopleStatus. If an entry already exists, the ID of that entry is inserted into @peopleStatus.
An entry is then inserted into PeopleStatusCodes table based off #data, with Result = 1. After that, entries for the remaining Codes which do not have matching data are inserted with Result = 0.
--declare temporary tables
DECLARE #peopleStatus TABLE (peopleStatusID INT)
DECLARE #data TABLE (FirstName VARCHAR (100), LastName VARCHAR (100), Codename VARCHAR (100))
--insert data into #data
INSERT INTO #data(
[FirstName]
,[LastName]
,[Codename]
)
VALUES(
'John'
,'Smith'
,'02 - Code2'
)
--check if entry exists inside PeopleStatus and insert into #peopleStatus based on that
IF NOT EXISTS (SELECT [ps].[PersonCode] FROM PeopleStatus [ps], People [p], #data [d]
WHERE [ps].[PersonCode] = [p].[PersonCode]
AND [p].[FirstName] = [d].[FirstName]
AND [p].[LastName] = [d].[LastName])
INSERT INTO PeopleStatus (
[PersonCode]
,[Status]
)
OUTPUT inserted.[ID]
INTO #peopleStatus
SELECT
[p].[PersonCode]
,1
FROM [People] [p], #data [d]
WHERE [p].[FirstName] = [d].[FirstName]
AND [p].[LastName] = [d].[LastName]
ELSE INSERT INTO #peopleStatus (peopleStatusID)
SELECT [ps].[ID]
FROM PeopleStatus [ps], People [p], #data [d]
WHERE [ps].[PersonCode] = [p].[PersonCode]
AND [p].[FirstName] = [d].[FirstName]
AND [p].[LastName] = [d].[LastName]
--insert into PeopleStatusCodes a row of data with Result = 1 based off data stored in #data
INSERT INTO [dbo].[PeopleStatusCodes] (
[PeopleStatusID]
,[CodeID]
,[Result]
)
SELECT
[temp].[peopleStatusID]
,(SELECT ID FROM Codes WHERE CodeNumber + ' - ' + Name = [d].[Codename])
,1
FROM #peopleStatus [temp], #data [d]
--for every remaining Code in the Codes table which did not have a match with the data, insert into PeopleStatusCodes a row of data with Result = 0
DECLARE #IDColumn INT
SELECT #IDColumn = MIN(c.ID)
FROM Codes [c], PeopleStatusCodes [psc], #peopleStatus [temp]
WHERE [psc].CodeID != [c].ID
AND [psc].PeopleStatusID = [temp].peopleStatusID
WHILE #IDColumn IS NOT NULL
BEGIN
INSERT INTO [dbo].[PeopleStatusCodes] (
[PeopleStatusID]
,[CodeID]
,[Result]
)
SELECT
[temp].peopleStatusID
,#IDColumn
,0
FROM #peopleStatus [temp]
SELECT #IDColumn = MIN(c.ID)
FROM Codes [c], PeopleStatusCodes [psc], #peopleStatus [temp]
WHERE [psc].CodeID != [c].ID
AND [psc].PeopleStatusID = [temp].peopleStatusID
AND c.ID > #IDColumn
END
My problem is that when I run the code, instead of 3 entries in the PeopleStatusCodes table, I get 4 entries, with 1 entry a duplicate.
What I get:
+----+----------------+--------+--------+
| ID | PeopleStatusID | CodeID | Result |
+----+----------------+--------+--------+
| 1 | 1 | 2 | 1 |
| 2 | 1 | 1 | 0 |
| 3 | 1 | 2 | 0 |
| 4 | 1 | 3 | 0 |
+----+----------------+--------+--------+
What I want:
+----+----------------+--------+--------+
| ID | PeopleStatusID | CodeID | Result |
+----+----------------+--------+--------+
| 1 | 1 | 2 | 1 |
| 2 | 1 | 1 | 0 |
| 3 | 1 | 3 | 0 |
+----+----------------+--------+--------+
Update: I managed to solve it by going about it in a more straight forward way - insert all rows first, then update rows where necessary.
In the last part, you could use a row number to remove duplicates:
-- Returns one row per (PeopleStatusID, CodeID) for the people statuses in @peopleStatus.
-- When a pair occurs more than once, the Result = 1 row wins; remaining ties go to the lowest ID.
-- NOTE(review): relies on @peopleStatus being declared earlier in the same batch.
;WITH RankedCodes AS (
    SELECT [psc].ID
         , [psc].PeopleStatusID
         , [psc].CodeID
         , [psc].Result
         , ROW_NUMBER() OVER (PARTITION BY [psc].PeopleStatusID, [psc].CodeID
                              ORDER BY [psc].Result DESC, [psc].ID) AS RowNum
    FROM PeopleStatusCodes [psc]
    INNER JOIN @peopleStatus [temp]
        ON [temp].peopleStatusID = [psc].PeopleStatusID
)
SELECT ID, PeopleStatusID, CodeID, Result
FROM RankedCodes
WHERE RowNum = 1
I managed to solve it by going about it a different way. Instead of inserting one row with Result = 1 followed by the remaining rows, I inserted ALL rows with default Result = 0. I then Updated the row that matched the data to have Result = 1.
--Inserts a row for every Code into PeopleStatusCodes
-- Relies on @peopleStatus and @data being declared and filled earlier in the same batch.

--Inserts a Result = 0 row for every Code into PeopleStatusCodes, in one set-based statement.
--Pairs that already exist are skipped, so re-running does not create duplicates.
INSERT INTO [dbo].[PeopleStatusCodes] (
    [PeopleStatusID]
    ,[CodeID]
    ,[Result]
)
SELECT
    [temp].[peopleStatusID]
    ,[c].[ID]
    ,0
FROM @peopleStatus [temp]
CROSS JOIN Codes [c]
WHERE NOT EXISTS (
    SELECT 1
    FROM [dbo].[PeopleStatusCodes] [psc]
    WHERE [psc].[PeopleStatusID] = [temp].[peopleStatusID]
    AND [psc].[CodeID] = [c].[ID]
)

--Flags the row matching the data with Result = 1, if it is not flagged already.
--The join on @peopleStatus limits the update to this person's rows; without it the
--matching code would be flagged for every PeopleStatusID in the table.
UPDATE [psc]
SET [Result] = 1
FROM [dbo].[PeopleStatusCodes] [psc]
INNER JOIN @peopleStatus [temp]
    ON [temp].[peopleStatusID] = [psc].[PeopleStatusID]
INNER JOIN Codes [c]
    ON [c].[ID] = [psc].[CodeID]
INNER JOIN @data [d]
    ON [c].[CodeNumber] + ' - ' + [c].[Name] = [d].[Codename]
WHERE [psc].[Result] <> 1

SQL Server : Bulk insert a Datatable into 2 tables

Consider this datatable :
word wordCount documentId
---------- ------- ---------------
Ball 10 1
School 11 1
Car 4 1
Machine 3 1
House 1 2
Tree 5 2
Ball 4 2
I want to insert these data into two tables with this structure :
Table WordDictionary
(
Id int,
Word nvarchar(50),
DocumentId int
)
Table WordDetails
(
Id int,
WordId int,
WordCount int
)
FOREIGN KEY (WordId) REFERENCES WordDictionary(Id)
But because I have thousands of records in initial table, I have to do this just in one transaction (batch query) for example using bulk insert can help me doing this purpose.
But the question here is how I can separate this data into these two tables WordDictionary and WordDetails.
For more details :
Final result must be like this :
Table WordDictionary:
Id word
---------- -------
1 Ball
2 School
3 Car
4 Machine
5 House
6 Tree
and table WordDetails :
Id wordId WordCount DocumentId
---------- ------- ----------- ------------
1 1 10 1
2 2 11 1
3 3 4 1
4 4 3 1
5 5 1 2
6 6 5 2
7 1 4 2
Notice :
The words in the source can be duplicated so I must check word existence in table WordDictionary before any insert record in these tables and if a word is found in table WordDictionary, the just found Word ID must be inserted into table WordDetails (please see Word Ball)
Finally the 1 M$ problem is: this insertion must be done as fast as possible.
If you're looking to just load the tables the first time, without any updates to them over time, you could potentially do it this way (note that SELECT ... INTO creates the table it loads into, so the target tables must not already exist):
You can put all of the distinct words from the datatable into the WordDictionary table first:
-- Creates WordDictionary with one row per distinct word.
-- An Id is generated here because the WordDetails load joins back on WordDictionary.Id;
-- Ids are assigned in alphabetical word order.
SELECT
    CAST(ROW_NUMBER() OVER (ORDER BY words.word) AS INT) AS Id,
    words.word
INTO WordDictionary
FROM (
    SELECT DISTINCT word
    FROM datatable
) AS words;
Then after you populate your WordDictionary you can then use the ID values from it and the rest of the information from datatable to load your WordDetails table:
-- Creates WordDetails: one row per source row, carrying the dictionary Id of its word.
SELECT
    WD.Id         AS wordId,
    DT.wordCount  AS WordCount,
    DT.documentId AS DocumentId
INTO WordDetails
FROM WordDictionary AS WD
INNER JOIN datatable AS DT
    ON DT.word = WD.word
There is a small discrepancy between the declared table schema and your example data, which the setup below resolves:
1) Setup
-- Source table holding the initial data: one row per (word, document) with its count.
-- drop table DocumentWordData
CREATE TABLE DocumentWordData
(
    Word       NVARCHAR(50),
    WordCount  INT,
    DocumentId INT
);
GO
-- Result tables, with the extras the question left out: identity columns,
-- primary key constraints and a working foreign key definition.
-- drop table WordDictionary
CREATE TABLE WordDictionary
(
    Id   INT IDENTITY(1, 1),
    Word NVARCHAR(50),
    CONSTRAINT PK_WordDictionary PRIMARY KEY (Id)
);
GO
-- drop table WordDetails
CREATE TABLE WordDetails
(
    Id         INT IDENTITY(1, 1),
    WordId     INT,
    WordCount  INT,
    DocumentId INT,
    CONSTRAINT PK_WordDetails PRIMARY KEY (Id),
    -- References the primary key of WordDictionary.
    CONSTRAINT FK_WordDetails_Word FOREIGN KEY (WordId) REFERENCES WordDictionary (Id)
);
GO
2) The actual script to put data in the last two tables
-- With XACT_ABORT on, a failure anywhere in the block rolls the whole transaction back automatically.
SET XACT_ABORT ON;
BEGIN TRANSACTION;

-- Step 1: the dictionary is every distinct word in the source.
INSERT INTO WordDictionary (Word)
SELECT DWD.Word
FROM DocumentWordData AS DWD
GROUP BY DWD.Word;

-- Step 2: details are the source rows, joined to the dictionary to pick up each word's Id.
INSERT INTO WordDetails (WordId, WordCount, DocumentId)
SELECT
    W.Id,
    DWD.WordCount,
    DWD.DocumentId
FROM WordDictionary AS W
INNER JOIN DocumentWordData AS DWD
    ON DWD.Word = W.Word;

COMMIT TRANSACTION;

-- Quick look at the results.
SELECT * FROM WordDictionary;
SELECT * FROM WordDetails;
I expect this script to run very fast, if you do not have a very large number of records (millions at most).
This is the query. I'm using temp table to be able to test.
if you use the 2 CTEs, you'll be able to generate the final result
1.Setting up a sample data for test.
-- Scratch copy of the question's sample data.
CREATE TABLE #original
(
    word       VARCHAR(10),
    wordCount  INT,
    documentId INT
);

INSERT INTO #original (word, wordCount, documentId)
VALUES
    ('Ball',    10, 1),
    ('School',  11, 1),
    ('Car',      4, 1),
    ('Machine',  3, 1),
    ('House',    1, 2),
    ('Tree',     5, 2),
    ('Ball',     4, 2);
2. Use cte1 and cte2. In your real database, you need to replace #original with the actual table name you have all initial records.
-- Step 1: one dictionary row per distinct word, numbered alphabetically.
;with cte1 as (
    select ROW_NUMBER() over (order by word) as Id, word
    from #original
    group by word
)
select Id, word
into #WordDictionary
from cte1;

-- Step 2: one detail row per source row, carrying the dictionary Id of its word.
-- documentId and wordCount are tie-breakers so that a word occurring in several
-- documents is numbered the same way on every run.
;with cte2 as (
    select ROW_NUMBER() over (order by o.word, o.documentId, o.wordCount) as Id,
           wd.Id as wordId,
           o.word, o.wordCount, o.documentId
    from #WordDictionary as wd
    inner join #original as o on o.word = wd.word
)
select Id, wordId, word, wordCount, documentId
into #WordDetails
from cte2;

-- Ordered explicitly: a SELECT without ORDER BY has no guaranteed row order.
select Id, wordId, word, wordCount, documentId
from #WordDetails
order by Id;
This will be data in #WordDetails
+----+--------+---------+-----------+------------+
| Id | wordId | word | wordCount | documentId |
+----+--------+---------+-----------+------------+
| 1 | 1 | Ball | 10 | 1 |
| 2 | 1 | Ball | 4 | 2 |
| 3 | 2 | Car | 4 | 1 |
| 4 | 3 | House | 1 | 2 |
| 5 | 4 | Machine | 3 | 1 |
| 6 | 5 | School | 11 | 1 |
| 7 | 6 | Tree | 5 | 2 |
+----+--------+---------+-----------+------------+

Resources