Consecutive number for specific value in the field - sql-server

I'm using SQL Server 2017.
Table:
-- Sample table for the question: one row per record, keyed by REC_ID.
CREATE TABLE T1
(
    REC_ID      decimal(28,6) NOT NULL,
    BUSINESS_NR decimal(10,6) NULL,
    Description varchar(20)   NULL
);

-- Ten sample rows spread over two business numbers.
INSERT INTO T1 (REC_ID, BUSINESS_NR, Description)
VALUES
    (312, 1, 'Created'),
    (314, 1, 'Adjustment'),
    (356, 2, 'Created'),
    (388, 1, 'NoChange'),
    (565, 2, 'Adjustment'),
    (701, 2, 'NoChange'),
    (769, 1, 'Adjustment'),
    (832, 2, 'Adjustment'),
    (992, 2, 'Closed'),
    (995, 1, 'Closed');
Question:
I would like to give each 'Adjustment' a consecutive number per Business_NR.
Example
SELECT *
FROM T1 .......
ORDER BY Business_NR ASC, REC_ID ASC
Output:
+--------+-------------+-------------+-----------------+
| REC_ID | BUSINESS_NR | Description | Adjustment Count|
+--------+-------------+-------------+-----------------+
| 312 | 1 | Created | |
| 314 | 1 | Adjustment | 1 |
| 388 | 1 | NoChange | |
| 769 | 1 | Adjustment | 2 |
| 995 | 1 | Closed | |
| 356 | 2 | Created | |
| 565 | 2 | Adjustment | 1 |
| 701 | 2 | NoChange | |
| 832 | 2 | Adjustment | 2 |
| 992 | 2 | Closed | |
+--------+-------------+-------------+-----------------+
Info: the REC_ID is unique and consecutive for each table entry.
I have no useful attempt to show and therefore have not added any query samples.
SQL FIDDLE: LINK

Seems like you just want a ROW_NUMBER in a CASE expression:
-- Number the 'Adjustment' rows 1, 2, ... per BUSINESS_NR in REC_ID order; every other row gets NULL.
CASE
    WHEN [Description] = 'Adjustment'
        THEN ROW_NUMBER() OVER (PARTITION BY BUSINESS_NR, [Description] ORDER BY REC_ID ASC)
END AS AdjustmentCount

Related

SQL Server Lag by partitioned group

I have a table of data as follows:
+----+-------+----------+
| id | value | group_id |
+----+-------+----------+
| 1 | -200 | 0 |
| 2 | -620 | 0 |
| 3 | -310 | 0 |
| 4 | 400 | 1 |
| 5 | 300 | 1 |
| 6 | 100 | 1 |
| 7 | -200 | 2 |
| 8 | -400 | 2 |
| 9 | -500 | 2 |
+----+-------+----------+
What I would like to do is produce a 4th column that, for each record, shows the last value of the preceding group_id.
So the result I want is as follows:
+----+-------+----------+----------------+
| id | value | group_id | LastValByGroup |
+----+-------+----------+----------------+
| 1 | -200 | 0 | 0 |
| 2 | -620 | 0 | 0 |
| 3 | -310 | 0 | 0 |
| 4 | 400 | 1 | -310 |
| 5 | 300 | 1 | -310 |
| 6 | 100 | 1 | -310 |
| 7 | -200 | 2 | 100 |
| 8 | -400 | 2 | 100 |
| 9 | -500 | 2 | 100 |
+----+-------+----------+----------------+
What I have done so far is in 2 parts. First I use the LAST_VALUE function to get the last Value in each group. Then I have tried to use the LAG function to get the last value from the previous group. Unfortunately the second part of my code isn't working as desired.
Here is my code:
-- Repro script for the question: builds the sample rows in a temp table and runs the
-- two-step attempt (LAST_VALUE per group, then LAG) whose output is shown below it.
CREATE TABLE #temp
(
id int identity(1,1),
value int,
group_id int
)
INSERT #temp VALUES(-200,0)
INSERT #temp VALUES(-620,0)
INSERT #temp VALUES(-310,0)
INSERT #temp VALUES(400,1)
INSERT #temp VALUES(300,1)
INSERT #temp VALUES(100,1)
-- NOTE(review): these three rows use group_id 3, whereas the sample table in the
-- question text lists group_id 2 for ids 7-9.
INSERT #temp VALUES(-200,3)
INSERT #temp VALUES(-400,3)
INSERT #temp VALUES(-500,3)
-- Step 1 (cte): the frame spans the whole partition, so every row receives the value
-- of the highest-id row of its own group_id.
;WITH cte AS
(
SELECT
*,
LastValByGroup = LAST_VALUE(Value) OVER(Partition By group_id ORDER BY id
RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
FROM
#temp
), lagged AS
-- Step 2 (lagged): LAG is partitioned by group_id, so it reads the previous row of the
-- SAME group (default 0 for the group's first row), not a row of the preceding group.
(
SELECT
*,
LaggedLastValByGroup = LAG(LastValByGroup,1,0) OVER(Partition By group_id ORDER BY id)
FROM
cte
)
SELECT * FROM lagged ORDER BY id
-- Clean up the temp table created above.
DROP TABLE #temp
And this is the result I get:
+----+-------+----------+----------------+----------------------+
| id | value | group_id | LastValByGroup | LaggedLastValByGroup |
+----+-------+----------+----------------+----------------------+
| 1 | -200 | 0 | -310 | 0 |
| 2 | -620 | 0 | -310 | -310 |
| 3 | -310 | 0 | -310 | -310 |
| 4 | 400 | 1 | 100 | 0 |
| 5 | 300 | 1 | 100 | 100 |
| 6 | 100 | 1 | 100 | 100 |
| 7 | -200 | 3 | -500 | 0 |
| 8 | -400 | 3 | -500 | -500 |
| 9 | -500 | 3 | -500 | -500 |
+----+-------+----------+----------------+----------------------+
Any help is much appreciated.
Thanks
You can use FIRST_VALUE as in the following query to get the desired result.
-- For every row, return the last value (highest id) of the preceding group, or 0 when
-- there is no preceding group.
-- Reads #temp, the table created in the question.
-- Groups are numbered densely first, so "preceding group" still works when group_id
-- values have gaps (the sample data uses 0, 1, 3); matching on group_id + 1 would not.
;WITH ranked AS
(
    SELECT
        id,
        [value],
        group_id,
        DENSE_RANK() OVER (ORDER BY group_id) AS group_seq
    FROM #temp
)
SELECT DISTINCT
    cur.id,
    cur.[value],
    cur.group_id,
    -- Each current row is paired with all rows of the preceding group; FIRST_VALUE
    -- ordered by id DESC picks that group's last value. DISTINCT collapses the pairs.
    COALESCE(
        FIRST_VALUE(prev.[value]) OVER (PARTITION BY prev.group_seq ORDER BY prev.id DESC),
        0
    ) AS LastValByGroup
FROM ranked AS cur
LEFT JOIN ranked AS prev
    ON prev.group_seq + 1 = cur.group_seq
ORDER BY cur.id
Please find the db<>fiddle here.

Divide selected value by count(*)

I have a Microsoft SQL Server with the following tables:
Projects
BookedHours (with fk_Project = Projects.ID)
Products
ProjectsToProducts (n:m with fk_Projects = Projects.ID and fk_Products = Products.ID)
I now want to select how many hours are booked to which product per month. The problem is, that one project can have multiple products (that's why I need the n:m table).
If I do the following, it will count the hours twice if a project has two products.
-- Booked hours per product and month for 2020.
-- A project linked to several products contributes its full hours to each of them.
SELECT prod.ID         AS fk_Product,
       MONTH(bh.Datum) AS Monat,
       SUM(bh.Hours)   AS Stunden
FROM tbl_BookedHours AS bh
    INNER JOIN tbl_Projects AS prj
        ON bh.fk_Project = prj.ID
    INNER JOIN tbl_ProjectProduct AS lnk
        ON lnk.fk_Project = prj.ID
    INNER JOIN tbl_Products AS prod
        ON lnk.fk_Product = prod.ID
WHERE YEAR(bh.Datum) = 2020
GROUP BY prod.ID, MONTH(bh.Datum)
ORDER BY prod.ID, MONTH(bh.Datum)
I can get the number of products for each project with this SQL:
-- Number of products linked to each project.
-- The grouping column must be the selected fk_Project.
SELECT fk_Project, COUNT(*) AS Cnt
FROM tbl_ProjectProduct
GROUP BY fk_Project
But how can I now divide the hours for each project by its individual factor and add it all up per product and month?
I could do it in my C# program, or I could use a cursor and iterate through all projects, but I think there should be a more elegant way.
Edit with sample data:
|----------------| |----------------| |------------------------------|
| tbl_Projects | | tbl_Products | | tbl_ProjectProduct |
|----------------| |----------------| |------------------------------|
| ID | Name | | ID | Name | | ID | fk_Project | fk_Product |
|----+-----------| |----+-----------| |------------------------------|
| 1 | Project 1 | | 1 | Product 1 | | 1 | 1 | 1 |
| 2 | Project 2 | | 2 | Product 2 | | 2 | 1 | 2 |
| 3 | Project 3 | | 3 | Product 3 | | 3 | 2 | 1 |
| 4 | Project 4 | | 4 | Product 4 | | 4 | 3 | 3 |
|----------------| |----------------| | 5 | 4 | 1 |
| 6 | 4 | 2 |
| 7 | 4 | 4 |
|------------------------------|
|--------------------------------------|
| tbl_BookedHours |
|--------------------------------------|
| ID | fk_Project | Hours | Date |
|--------------------------------------|
| 1 | 1 | 10 | 2020-01-15 |
| 2 | 1 | 20 | 2020-01-20 |
| 3 | 2 | 10 | 2020-01-15 |
| 4 | 3 | 30 | 2020-01-18 |
| 5 | 2 | 20 | 2020-01-20 |
| 6 | 4 | 30 | 2020-01-25 |
| 7 | 1 | 10 | 2020-02-15 |
| 8 | 1 | 20 | 2020-02-20 |
| 9 | 2 | 10 | 2020-02-15 |
| 10 | 3 | 30 | 2020-03-18 |
| 11 | 2 | 20 | 2020-03-20 |
| 12 | 4 | 30 | 2020-03-25 |
|--------------------------------------|
The Result should be:
|----------------------------|
| fk_Product | Month | Hours |
|----------------------------|
| 1 | 1 | 55 |
| 2 | 1 | 25 |
| 3 | 1 | 30 |
| 4 | 1 | 10 |
| 1 | 2 | 25 |
| 2 | 2 | 15 |
| 1 | 3 | 30 |
| 2 | 3 | 10 |
| 3 | 3 | 30 |
| 4 | 3 | 10 |
|----------------------------|
For example, booking no. 1 has to be divided by 2 (because Project 1 has two products), with one half of the amount added to Product 1 and the other half to Product 2 (both in January). Booking no. 4 should not be divided, because Project 3 has only one product. Booking no. 12, for example, has to be divided by 3.
So that in total the Hours in the end add up to the same total.
I hope it's clearer now.
*** EDIT 2***
-- Self-contained repro using table variables (names must start with @).
-- The final query is the asker's original and deliberately unchanged in logic:
-- it shows the double counting described in the question.
DECLARE @tbl_Projects TABLE (ID INT, [Name] VARCHAR(MAX))
INSERT INTO @tbl_Projects VALUES
(1,'Project 1'),
(2,'Project 2'),
(3,'Project 3'),
(4,'Project 4')
DECLARE @tbl_Products TABLE (ID INT, [Name] VARCHAR(MAX))
INSERT INTO @tbl_Products VALUES
(1,'Product 1'),
(2,'Product 2'),
(3,'Product 3'),
(4,'Product 4')
-- n:m link between projects and products.
DECLARE @tbl_ProjectProduct TABLE (ID INT, fk_Project int, fk_Product int)
INSERT INTO @tbl_ProjectProduct VALUES
(1,1,1),
(2,1,2),
(3,2,1),
(4,3,3),
(5,4,1),
(6,4,2),
(7,4,4)
DECLARE @tbl_BookedHours TABLE (ID INT, fk_Project int, Hours int, [Date] Date)
INSERT INTO @tbl_BookedHours VALUES
(1,1,10,'2020-01-15'),
(2,1,20,'2020-01-20'),
(3,2,10,'2020-01-15'),
(4,3,30,'2020-01-18'),
(5,2,20,'2020-01-20'),
(6,4,30,'2020-01-25'),
(7,1,10,'2020-02-15'),
(8,1,20,'2020-02-20'),
(9,2,10,'2020-02-15'),
(10,3,30,'2020-03-18'),
(11,2,20,'2020-03-20'),
(12,4,30,'2020-03-25')
-- Joining through the link table repeats each booking once per linked product,
-- so SUM(B.Hours) counts a project's hours in full for every one of its products.
SELECT P.ID AS fk_Product, MONTH(B.Date) AS Month, SUM(B.Hours) AS SumHours
FROM @tbl_BookedHours AS B INNER JOIN @tbl_Projects AS M on B.fk_Project = M.ID
INNER JOIN @tbl_ProjectProduct AS PP ON PP.fk_Project = M.ID
INNER JOIN @tbl_Products AS P ON PP.fk_Product = P.ID
GROUP BY P.ID,MONTH(B.Date)
ORDER BY P.ID, MONTH(B.Date)
This gives me the wrong result, because it Counts the hours for both products:
| fk_Product | Month | SumHours |
|-------------------------------|
| 1 | 1 | 90 |
| 1 | 2 | 40 |
| 1 | 3 | 50 |
| 2 | 1 | 60 |
| 2 | 2 | 30 |
| 2 | 3 | 30 |
| 3 | 1 | 30 |
| 3 | 3 | 30 |
| 4 | 1 | 30 |
| 4 | 3 | 30 |
|-------------------------------|
Consider the following query. I modified your table variables to temp tables so it was easier to debug.
-- Booked hours per product and month, with each booking split evenly across the
-- products of its project so the overall total is preserved.
;WITH product_counts AS
(
    -- Number of products linked to each project: the divisor for that project's hours.
    SELECT fk_Project, COUNT(fk_Product) AS CNT
    FROM #tbl_ProjectProduct
    GROUP BY fk_Project
)
,split_hours AS
(
    -- Hours and CNT are both INT; cast before dividing, because INT / INT truncates
    -- (e.g. 25 / 2 = 12) and the shares would no longer add up to the booked total.
    SELECT
        bh.[Date],
        pc.fk_Project,
        CAST(bh.Hours AS DECIMAL(18, 4)) / pc.CNT AS NewHours
    FROM #tbl_BookedHours AS bh
    INNER JOIN product_counts AS pc
        ON bh.fk_Project = pc.fk_Project
)
SELECT
    prod.ID          AS fk_Product,
    MONTH(sh.[Date]) AS MN,
    SUM(sh.NewHours) AS HRS
FROM split_hours AS sh
INNER JOIN #tbl_Projects AS prj
    ON sh.fk_Project = prj.ID
INNER JOIN #tbl_ProjectProduct AS lnk
    ON lnk.fk_Project = prj.ID
INNER JOIN #tbl_Products AS prod
    ON prod.ID = lnk.fk_Product
GROUP BY prod.ID, MONTH(sh.[Date])
-- Month first, then product, matching the layout of the expected result.
ORDER BY MONTH(sh.[Date]), prod.ID

Getting Top 10 based on column value

I have a query that outputs a long list of work-order counts per name, together with the total per name, sorted by total, name and count:
-- Work-order count per (Name, Emergency) plus the total per Name, largest totals first.
;WITH cte AS (
    SELECT [Name],
           [Emergency],
           COUNT([Emergency]) AS [CountItem]
    FROM tableA
    GROUP BY [Name], [Emergency]
)
SELECT [Name],
       [Emergency],
       -- The CTE exposes the count as [CountItem]; alias it to the [Count] heading shown in the output.
       [CountItem] AS [Count],
       SUM([CountItem]) OVER (PARTITION BY [Name]) AS Total
FROM cte
ORDER BY Total DESC, [Name], [CountItem] DESC
but I only want to get the top 10 Names with the highest total like the one below:
+-------+-------------------------------+-------+-------+
| Name | Emergency | Count | Total |
+-------+-------------------------------+-------+-------+
| PLB | No | 7 | 15 |
| PLB | No Hot Water | 4 | 15 |
| PLB | Resident Locked Out | 2 | 15 |
| PLB | Overflowing Tub | 1 | 15 |
| PLB | No Heat | 1 | 15 |
| GG | Broken Lock - Exterior | 6 | 6 |
| BOA | Broken Lock - Exterior | 2 | 4 |
| BOA | Garage Door not working | 1 | 4 |
| BOA | Resident Locked Out | 1 | 4 |
| 15777 | Smoke Alarm not working | 3 | 3 |
| FP | No air conditioning | 2 | 3 |
| FP | Flood | 1 | 3 |
| KB | No electrical power | 2 | 3 |
| KB | No | 1 | 3 |
| MEM | Noise Complaint | 3 | 3 |
| ANG | Parking Issue | 2 | 2 |
| ALL | Smoke Alarm not working | 2 | 2 |
| AAS | No air conditioning | 1 | 2 |
| AAS | Toilet - Clogged (1 Bathroom) | 1 | 2 |
+-------+-------------------------------+-------+-------+
Note: I'm not after unique values. As you can see from the example above it gets the top 10 names from a very long table.
What I want to happen is assign a row id for each name so all PLB above will have a row id of 1, GG = 2, BOA = 3, ...
So on my final select I will only add the where clause where row id <= 10. I already tried ROW_NUMBER() OVER(PARTITION BY Name ORDER BY Name) but it's assigning 1 to every unique Name it encounters.
You may try this:
-- All rows belonging to the 10 names with the highest total work-order count.
;WITH cte AS (
    -- Count per (Name, Emergency).
    SELECT [Name],
           [Emergency],
           COUNT([Emergency]) AS [CountItem]
    FROM tableA
    GROUP BY [Name], [Emergency]
),
ct AS (
    -- Add the per-name total; [CountItem] is exposed as [Count] for the final output.
    SELECT [Name],
           [Emergency],
           [CountItem] AS [Count],
           SUM([CountItem]) OVER (PARTITION BY [Name]) AS Total
    FROM cte
),
ctname AS (
    -- DENSE_RANK gives every row of one name the same rank. Total must sort DESC so
    -- rank 1 is the highest total; [Name] breaks ties between names with equal totals.
    SELECT DENSE_RANK() OVER (ORDER BY Total DESC, [Name]) AS RankName,
           [Name],
           [Emergency],
           [Count],
           Total
    FROM ct
)
SELECT RankName, [Name], [Emergency], [Count], Total
FROM ctname
WHERE RankName < 11
ORDER BY RankName, [Count] DESC

What's an efficient way to count "previous" rows in SQL?

Hard to phrase the title for this one.
I have a table of data which contains a row per invoice. For example:
| Invoice ID | Customer Key | Date | Value | Something |
| ---------- | ------------ | ---------- | ------| --------- |
| 1 | A | 08/02/2019 | 100 | 1 |
| 2 | B | 07/02/2019 | 14 | 0 |
| 3 | A | 06/02/2019 | 234 | 1 |
| 4 | A | 05/02/2019 | 74 | 1 |
| 5 | B | 04/02/2019 | 11 | 1 |
| 6 | A | 03/02/2019 | 12 | 0 |
I need to add another column that counts the number of previous rows per CustomerKey, but only if "Something" is equal to 1, so that it returns this:
| Invoice ID | Customer Key | Date | Value | Something | Count |
| ---------- | ------------ | ---------- | ------| --------- | ----- |
| 1 | A | 08/02/2019 | 100 | 1 | 2 |
| 2 | B | 07/02/2019 | 14 | 0 | 1 |
| 3 | A | 06/02/2019 | 234 | 1 | 1 |
| 4 | A | 05/02/2019 | 74 | 1 | 0 |
| 5 | B | 04/02/2019 | 11 | 1 | 0 |
| 6 | A | 03/02/2019 | 12 | 0 | 0 |
I know I can do this using either a CTE like this...
(
-- Correlated scalar subquery: for the outer row t, counts rows of the same customer
-- with a strictly earlier [Date] and Something = 1.
-- NOTE(review): "table" is a placeholder for the invoice table, and t is presumably
-- the alias of the outer query's row -- neither is defined in this fragment.
select
count(*)
from table
where
[Customer Key] = t.[Customer Key]
and [Date] < t.[Date]
and Something = 1
)
But I have a lot of data and that's pretty slow. I know I can also use cross apply to achieve the same thing, but as far as I can tell that's not any better performing than just using a CTE.
So; is there a more efficient means of achieving this, or do I just suck it up?
EDIT: I originally posted this without the requirement that only rows where Something = 1 are counted. Mea culpa - I asked it in a hurry. Unfortunately I think that this means I can't use row_number() over (partition by [Customer Key])
Assuming you're using SQL Server 2012+ you can use Window Functions:
-- Number of EARLIER rows (by [Date]) of the same customer that have Something = 1.
-- The frame ends at 1 PRECEDING, so the current row is never counted. Subtracting 1
-- from a frame that includes the current row is only right when that row itself has
-- Something = 1; otherwise it under-counts (and gives -1 on a customer's first row).
-- COUNT over an empty frame is 0, so a customer's first row yields 0.
-- NOTE(review): ROWS framing treats rows with identical [Date] as ordered arbitrarily;
-- add a tie-breaker (e.g. [Invoice ID]) to the ORDER BY if dates can repeat per customer.
COUNT(CASE WHEN Something = 1 THEN 1 END) OVER (
    PARTITION BY [Customer Key]
    ORDER BY [Date]
    ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
) AS [Count]
Old answer before new required logic:
-- Superseded version (no Something filter): running count of the customer's rows up to
-- and including the current one, minus one.
COUNT(CustomerKey) OVER (
    PARTITION BY CustomerKey
    ORDER BY [Date]
    ROWS UNBOUNDED PRECEDING
) - 1 AS [Count]
If you're not using 2012 an alternative is to use ROW_NUMBER
-- Position of the row among the customer's rows ordered by [Date], minus one, i.e. the
-- number of rows placed before it. No Something = 1 filter is applied here.
ROW_NUMBER() OVER (PARTITION BY CustomerKey ORDER BY [Date]) - 1 AS Count

Using Common Table Expressions for multi-time windows

I have run this query in SQL Server as:
-- Top 3 research categories per author and paper year (2005-2014), ranked by paper count.
-- CTE: one row per (author, research category, category id, paper year) with the paper
-- count and the alphabetically first research area of that group.
WITH CTE AS
(
SELECT AIP.aid [Author_ID],
MIN(CAST(P.abstract_research_area as VARCHAR(100))) [Research_Area],
CAST(RC.research_category as VARCHAR(100)) [Research_Category],
P.abstract_research_area_category_id [Category_ID],
COUNT(*) [Paper_Count],
P.p_year [Paper_Year]
FROM author_individual_papers AIP
JOIN sub_aminer_paper P ON AIP.pid = P.pid
JOIN research_categories RC ON P.abstract_research_area_category_id = RC.category_id
-- NOTE(review): the year filter reads AIP.p_year while grouping uses P.p_year;
-- this assumes both tables carry the same year for a paper -- confirm.
WHERE P.abstract_research_area_category_id IS NOT NULL AND
AIP.aid IN (SELECT Author_ID FROM Authors) AND AIP.p_year BETWEEN 2005 AND 2014
GROUP BY AIP.aid,
CAST(RC.research_category as VARCHAR(100)),
P.abstract_research_area_category_id,
P.p_year
),
-- CTE_1: number the rows within each (author, year), highest paper count first;
-- ties are broken alphabetically by research area.
CTE_1 AS
(
SELECT *, ROW_NUMBER()
OVER(
PARTITION BY Author_ID, Paper_Year
ORDER BY Paper_Count DESC, Research_Area ASC
) AS Rank
FROM CTE
)
-- Keep the three best-ranked rows of every (author, year).
SELECT *
FROM CTE_1
WHERE Rank <= 3
which returns this output:
+-----------+------------------------+-------------------+-------------+-------------+------------+------+
| Author_ID | Research_Area | Research_Category | Category_ID | Paper_Count | Paper_Year | Rank |
+-----------+------------------------+-------------------+-------------+-------------+------------+------+
| 677 | feature extraction | Data Mining | 8 | 1 | 2005 | 1 |
| 677 | image annotation | Image Processing | 11 | 1 | 2005 | 2 |
| 677 | retrieval model | Info retrieval | 12 | 1 | 2005 | 3 |
| 677 | semantic | Prog Languages | 19 | 1 | 2007 | 1 |
| 677 | feature extraction | Data Mining | 8 | 1 | 2009 | 1 |
| 677 | image annotation | Image Processing | 11 | 1 | 2011 | 1 |
| 677 | semantic | Prog Languages | 19 | 1 | 2012 | 1 |
| 677 | video sequence | Computation Math | 5 | 2 | 2013 | 1 |
| 1359 | adversary model | Analysis of Algo | 1 | 2 | 2005 | 1 |
| 1359 | ensemble method | Machine Learning | 14 | 2 | 2005 | 2 |
| 1359 | image representation | Image Processing | 11 | 2 | 2005 | 3 |
| 1359 | adversary model | Analysis of Algo | 1 | 7 | 2006 | 1 |
| 1359 | concurrency control | Signal Processing | 17 | 5 | 2006 | 2 |
| 1359 | information system | Info retrieval | 12 | 2 | 2006 | 3 |
| 1359 | algorithm analysis | Analysis of Algo | 1 | 3 | 2007 | 1 |
| 1359 | markov model | Prob & Statistics | 18 | 2 | 2007 | 2 |
| 1359 | real time systems | Signal Processing | 17 | 2 | 2007 | 3 |
| 1359 | point based model | Computation Math | 5 | 3 | 2008 | 1 |
| 1359 | discriminant analysis | Analysis of Algo | 1 | 2 | 2008 | 2 |
| 1359 | fuzzy logic systems | Artif Intelligence| 2 | 2 | 2008 | 3 |
| ... | ... | ... | ... | ... | ... | ... |
| ... | ... | ... | ... | ... | ... | ... |
| ... | ... | ... | ... | ... | ... | ... |
+-----------+------------------------+-------------------+-------------+-------------+------------+------+
This shows the top 3 rows for each Author_ID in every Paper_Year between 2005 and 2014, ordered by Paper_Count descending. An Author_ID that has papers in each of the 10 years will therefore have 30 rows.
I want to display the top 3 rows for each Author_ID not for every Paper_Year individually but for each two-year Paper_Interval, i.e. 2005-06, 2007-08, 2009-10, 2011-12, 2013-14.
The desired/expected* OUTPUT is:
* If an author has no paper in one of the years of an interval, that year should still appear in the interval label; e.g. Author_ID = 677 has no paper in 2006, but the interval should still be displayed as 2005-2006.
+-----------+---------------------+-------------+-------------+----------------+------+
| Author_ID | Research_Category | Category_ID | Paper_Count | Paper_Interval | Rank |
+-----------+---------------------+-------------+-------------+----------------+------+
| 677 | Data Mining | 8 | 1 | 2005-06 | 1 |
| 677 | Image Processing | 11 | 1 | 2005-06 | 2 |
| 677 | Info retrieval | 12 | 1 | 2005-06 | 3 |
| 677 | Prog Languages | 19 | 1 | 2007-08 | 1 |
| 677 | Data Mining | 8 | 1 | 2009-10 | 1 |
| 677 | Image Processing | 11 | 1 | 2011-12 | 1 |
| 677 | Prog Languages | 19 | 1 | 2011-12 | 2 |
| 677 | Computation Math | 5 | 2 | 2013-14 | 1 |
| 1359 | Analysis of Algo | 1 | 9 | 2005-06 | 1 |
| 1359 | Signal Processing | 17 | 5 | 2005-06 | 2 |
| 1359 | Machine Learning | 14 | 2 | 2005-06 | 3 |
| 1359 | Analysis of Algo | 1 | 5 | 2007-08 | 1 |
| 1359 | Prob & Statistics | 5 | 3 | 2007-08 | 2 |
| 1359 | Artif Intelligence | 2 | 2 | 2007-08 | 3 |
| ... | ... | ... | ... | ... | ... |
| ... | ... | ... | ... | ... | ... |
| ... | ... | ... | ... | ... | ... |
+-----------+---------------------+-------------+-------------+----------------+------+
whereas if I make 2 years interval for each Author_ID then each author will have 15 rows max if having papers in each Paper_Interval (5 Intervals).
Moreover, Research_Category with the first highest total Paper_Count in a single Paper_Interval will come at Rank = 1 and vice versa. If there is a match in Paper_Count as in this case:
For Author_ID = 1359 and Paper_Interval = 2005-06
In terms of highest total Paper_Count,
First Highest Total Paper_Count = 9 for Category_ID = 1 will be at Rank = 1
Second Highest Total Paper_Count = 5 for Category_ID = 17 will be at Rank = 2
Third Highest Total
There is a match in terms of Third Highest Total Paper_Count i.e.
Research_Area | Category_ID | Paper_Count | Paper_Interval
-----------------------------------------------------------------
ensemble method | 14 | 2 | 2005-06
image representation | 11 | 2 | 2005-06
information system | 12 | 2 | 2005-06
Now, in this case we will choose alphabetically (Research_Area) for Rank = 3 which comes Category_ID = 14.
The question is: how can we modify this query to get output in desired form for 5 intervals (i.e. Paper_Interval) for each Author_ID?
ADDENDUM
I have added 3 tables (used in the query) with sample data in .csv format in under-mentioned links as:
CREATE TABLE author_individual_papers
-- Link between authors (aid) and papers (pid); the queries above join it to
-- sub_aminer_paper on pid and filter on p_year.
CREATE TABLE dbo.author_individual_papers
(
    id          int IDENTITY(1,1) NOT NULL,
    aid         int NULL,
    pid         int NULL,
    p_year      int NULL,
    p_venue_vid int NULL
)
Table link with sample data (only for Author_ID 677 & 1359)
author_individual_papers
CREATE TABLE sub_aminer_paper
-- One row per paper, with its year, research area text and research-category id.
CREATE TABLE dbo.sub_aminer_paper
(
    pid                                int          NULL,
    p_year                             int          NULL,
    abstract_research_area             varchar(max) NULL,
    abstract_research_area_category_id int          NULL
)
Table link with sample data (only for Author_ID 677 & 1359)
sub_aminer_paper
CREATE TABLE research_categories
-- Lookup of research categories, joined on category_id by the queries above.
CREATE TABLE dbo.research_categories
(
    category_id       int           NOT NULL,
    research_category nvarchar(max) NULL
)
Table link with data
research_categories
The desired/expected result is already mentioned above in the question.
Try this; hopefully I got your requirement right.
-- Top 3 research categories per author and two-year interval.
-- Local variables must start with @ in T-SQL.
DECLARE @year_start INT
DECLARE @year_end INT
SET @year_start = 2005
SET @year_end = 2014
; WITH
CTE AS
(
    -- One row per (author, research category, category id, interval).
    SELECT
        AIP.aid [Author_ID],
        MIN(CAST(P.abstract_research_area as VARCHAR(100))) [Research_Area],
        CAST(RC.research_category as VARCHAR(100)) [Research_Category],
        P.abstract_research_area_category_id [Category_ID],
        COUNT(*) [Paper_Count],
        -- Integer division maps consecutive year pairs to 1, 2, 3, ...
        -- (2005 and 2006 -> 1, 2007 and 2008 -> 2, ...).
        (p.p_year - @year_start + 2) / 2 [Interval_No],
        -- Label built from the years actually present in the group, so an interval
        -- with papers in only one year is labelled e.g. '2005-2005'.
        CAST(MIN(P.p_year) as VARCHAR(4)) + '-' + CAST(MAX(P.p_year) as VARCHAR(4)) [Interval]
    FROM
        author_individual_papers AIP
    JOIN
        sub_aminer_paper P
        ON AIP.pid = P.pid
    JOIN research_categories RC
        ON P.abstract_research_area_category_id = RC.category_id
    WHERE
        P.abstract_research_area_category_id IS NOT NULL
    AND
        AIP.aid IN (SELECT Author_ID FROM Authors)
    AND
        AIP.p_year BETWEEN @year_start AND @year_end
    GROUP BY
        AIP.aid,
        CAST(RC.research_category as VARCHAR(100)),
        P.abstract_research_area_category_id,
        (p.p_year - @year_start + 2) / 2
),
CTE_1 AS
(
    -- Rank within each (author, interval): highest paper count first, ties broken
    -- alphabetically by research area.
    SELECT *,
        ROW_NUMBER()
        OVER(
            PARTITION BY Author_ID, [Interval_No]
            ORDER BY Paper_Count DESC, Research_Area ASC
        ) AS Rank
    FROM CTE
)
SELECT *
FROM CTE_1
WHERE Rank <= 3
EDIT : Updated Query
-- Top 3 research categories per author and two-year interval, with interval labels
-- taken from a generated year list so they do not depend on which years have papers.
-- Local variables must start with @ in T-SQL.
DECLARE @year_start INT
DECLARE @year_end INT
SET @year_start = 2005
SET @year_end = 2014
; WITH
p_year AS
(
    -- Recursive CTE: one row per year from @year_start to @year_end.
    SELECT p_year = @year_start
    UNION ALL
    SELECT p_year = p_year + 1
    FROM p_year
    WHERE p_year < @year_end
),
Interval AS
(
    -- Map every year to its interval number and a 'first-last' label computed over
    -- all generated years of that interval.
    SELECT p_year, Interval_No,
        Interval = CAST(MIN(p_year) OVER (PARTITION BY Interval_No) AS VARCHAR(4)) + '-' + CAST(MAX(p_year) OVER (PARTITION BY Interval_No) AS VARCHAR(4))
    FROM
    (
        -- Integer division maps consecutive year pairs to 1, 2, 3, ...
        SELECT p_year, (p_year - @year_start + 2) / 2 AS Interval_No
        FROM p_year
    ) AS D
),
CTE AS
(
    -- One row per (author, research category, category id, interval).
    SELECT
        AIP.aid [Author_ID],
        MIN(CAST(P.abstract_research_area as VARCHAR(100))) [Research_Area],
        CAST(RC.research_category as VARCHAR(100)) [Research_Category],
        P.abstract_research_area_category_id [Category_ID],
        COUNT(*) [Paper_Count],
        I.Interval_No,
        I.Interval
    FROM
        author_individual_papers AIP
    JOIN
        sub_aminer_paper P
        ON AIP.pid = P.pid
    JOIN research_categories RC
        ON P.abstract_research_area_category_id = RC.category_id
    JOIN Interval I
        ON P.p_year = I.p_year
    WHERE
        P.abstract_research_area_category_id IS NOT NULL
    AND
        AIP.aid IN (SELECT Author_ID FROM Authors)
    AND
        AIP.p_year BETWEEN @year_start AND @year_end
    GROUP BY
        AIP.aid,
        CAST(RC.research_category as VARCHAR(100)),
        P.abstract_research_area_category_id,
        I.Interval_No, I.Interval
),
CTE_1 AS
(
    -- Rank within each (author, interval): highest paper count first, ties broken
    -- alphabetically by research area.
    SELECT *,
        ROW_NUMBER()
        OVER
        (
            PARTITION BY Author_ID, [Interval_No]
            ORDER BY Paper_Count DESC, Research_Area ASC
        ) AS Rank
    FROM CTE
)
SELECT *
FROM CTE_1
WHERE Rank <= 3
ORDER BY Author_ID, Interval, Rank

Resources