Faster execution of non nulls for a column - sql-server

I need to get percentage of nulls for a given column in a table. The table contains close to 368081344 records as of now in table. Number of records will increase by 20 million each day. Below is the query am using.
SELECT (COUNT_BIG(column)/ count_big(*)) * 100
from <table>
Then, I perform 100 - above output to fetch the required output
Please let me know best possible solution which can yield faster result

Have you tried the below method :
DECLARE #T TABLE
(
Id INT
)
;WITH CTE
AS
(
SELECT
SeqNo = 1,
NULL "Val"
UNION ALL
SELECT
SeqNo = SeqNo+1,
Val
FROM CTE
WHERE SeqNo<100
)
INSERT INTO #T(Id)
SELECT Val FROM CTE
UNION ALL
SELECT SeqNo FROM CTE
SELECT
TotCount = COUNT(1),
ValCount = SUM(CASE WHEN Id IS NULL THEN 0 ELSE 1 END),
NullCount = SUM(CASE WHEN Id IS NOT NULL THEN 0 ELSE 1 END),
NullPercent = (CAST(SUM(CASE WHEN Id IS NOT NULL THEN 0 ELSE 1 END) AS FLOAT)/CAST(COUNT(1) AS FLOAT))*100
FROM #T

Partial answer only. Not sure how to get the count for a specific column
You can speed up the total row count using this query.
SELECT P.ROWS
FROM SYS.OBJECTS AS O INNER JOIN SYS.PARTITIONS AS P
ON O.OBJECT_ID = P.OBJECT_ID
WHERE O.NAME = 'PARENT' AND
P.INDEX_ID < 2
ORDER BY O.NAME

Related

T-SQL Test for checking duplicates in a column and returning if it passes or fails check

I am running a data quality check on multiple tables and columns in a database.
If the column contains at least one duplciate, it fails the test and PassFail returns 0. Likewise if it contains no duplicates, it passses the test and PassFail returns 1.
The 1s and 0s from PassFail is then fed to an average to calculate an overall data quality score.
SELECT
'[Plant.Asset]' AS TableName
,'[ASSETNUM]' AS ColumnName
,COUNT(1) AS TotalRows
,0 AS PassFail
FROM Plant.Asset a
INNER JOIN Plant.Loc AS B ON A.LOCATION = B.LOCATION
GROUP BY
A.ASSETNUM
HAVING
COUNT(A.ASSETNUM) > 1
The script returns correctly when there are duplicates, however when there are no duplicates, the table returns nothing.
If there are duplciates the output is
TableName ColumnName TotalRows PassFail
Plant.Asset ASSETNUM 1234 0
If there are no duplicates, the output is currently blank
TableName ColumnName TotalRows PassFail
If there are no duplicates, the output should be
TableName ColumnName TotalRows PassFail
Plant.Asset ASSETNUM 0 1
You want case expression :
SELECT 'Plant.Asset' AS [TableName], A.ASSETNUM AS [ColumnName], COUNT(1) AS TotalRows,
(CASE WHEN COUNT(1) > 1 THEN 0 ELSE 1 END) AS PassFail
FROM Plant.Asset a INNER JOIN
Plant.Loc AS B
ON A.LOCATION = B.LOCATION
GROUP BY A.ASSETNUM;
You've tagged this with Stored Procedure so I assume this SELECT is part of a stored procedure. A quick and easy way to handle this would be to INSERT the results of your query into a table variable or temp table:
INSERT INTO #ResultTable (TableName, ColumnName, TotalRows, PassFail)
SELECT
'[Plant.Asset]' AS TableName
,'[ASSETNUM]' AS ColumnName
,COUNT(1) AS TotalRows
,0 AS PassFail
FROM Plant.Asset a
INNER JOIN Plant.Loc AS B ON A.LOCATION = B.LOCATION
GROUP BY
A.ASSETNUM
HAVING
COUNT(A.ASSETNUM) > 1;
Then check that table and either return it or your default result, since you have a static result that you want to return if there are no duplicates:
IF (SELECT COUNT(*) FROM #ResultTable > 0)
SELECT * FROM #ResultTable;
ELSE
SELECT
'[Plant.Asset]' AS TableName
,'[ASSETNUM]' AS ColumnName
,0 AS TotalRows
,1 AS PassFail

SQL Server Overall Total in a group by

In my SQL Server Query, I am trying to count the number of employees per site. This works, but when I try to add in a percentage of total, it still groups by Site so it is inaccurate.
Is there an easier way to achieve this?
I am using this Query to create a view.
select Site.SiteName,
sum(case when Employee.ActiveStatus = 'Yes' then 1 else 0 end) as
"NumberOfEmployees",
CONVERT(decimal(6,2),(sum(case when Employee.ActiveStatus = 'Yes' then 1
else 0 end))/(convert(decimal(6,2),COUNT(EmployeeID)))) as PercentageOfEmps
from Employee
left join Site
on(Employee.SiteID=Site.SiteID)
GROUP BY Site.SiteName;
GO
You could use subquery:
select
Site.SiteName,
NumberOfEmployees = sum(case when Employee.ActiveStatus = 'Yes' then 1 else 0 end),
PercentageOfEmps = CONVERT(decimal(6,2),(sum(case when Employee.ActiveStatus = 'Yes' then 1
else 0 end))/(SELECT COUNT(EmployeeID) FROM Employee)
from Employee
left join Site
on Employee.SiteID=Site.SiteID
GROUP BY Site.SiteName;
I can't provide an answer for your scenario, as I don't have any sample data to use, therefore I've provided a small dataset.
One method is to use a CTE/Subquery to get a total number and then include the total in the GROUP BY. This method avoids 2 scans of the table:
WITH VTE AS(
SELECT *
FROM (VALUES(1,'Steve',1),
(2,'Jayne',1),
(3,'Greg',2),
(4,'Sarah',3)) V(EmpID, EmpName, SiteID)),
CTE AS(
SELECT V.EmpID,
V.EmpName,
V.SiteID,
COUNT(V.EmpID) OVER () AS TotalCount
FROM VTE V)
SELECT C.SiteID,
COUNT(C.EmpID) AS Employees,
COUNT(C.EmpID) / (C.TotalCount *1.0) AS Perc
FROM CTE C
GROUP BY C.SiteID,
C.TotalCount;
This script should help-
SELECT
Site.SiteName,
COUNT(EmployeeID) AS [NumberOfEmployees],
((COUNT(EmployeeID)*1.0)/(SELECT COUNT(*) FROM Employee WHERE ActiveStatus = 'Yes'))*100.00 as PercentageOfEmps
FROM Employee
INNER JOIN Site
ON Employee.SiteID = Site.SiteID
WHERE Employee.ActiveStatus = 'Yes'
GROUP BY Site.SiteName;
Data creation script
declare #Employee Table(EmployeeID int ,ActiveStatus nvarchar(20) ,SiteID int)
declare #Site Table(SiteName nvarchar(20) ,SiteID int)
insert into #Employee values(1,'Yes',101),(2,'Yes',101),(3,'Yes',102),(4,'Yes',102),
(5,'Yes',101)
insert into #Site values('Site1',101)
insert into #Site values('Site2',102)
//real script to get the %percentage
;with cte as
(
select s.SiteName,sum(case when e.ActiveStatus = 'Yes' then 1 else 0 end) as "NumberOfEmployees"
from #Employee e
left join #Site s
on(e.SiteID=s.SiteID)
GROUP BY s.SiteName
),
cte_sum as
(select sum(NumberOfEmployees) as total from cte )
select c.*, convert (decimal(6,2),c.NumberOfEmployees)/convert (decimal(6,2),cs.total)*100 from cte_sum cs, cte c;

Create a stored procedure to aggregate rows

Having a transaction table with the following rows:
Id UserId PlatformId TransactionTypeId
-------------------------------------------------
0 1 3 1
1 1 1 2
2 2 3 2
3 3 2 1
4 2 3 1
How do I write a stored procedure that can aggregate the rows into a new table with the following format?
Id UserId Platforms TransactionTypeId
-------------------------------------------------
0 1 {"p3":1,"p1":1} {"t1":1,"t2":1}
1 2 {"p3":2} {"t2":1,"t1":1}
3 3 {"p2":1} {"t1":1}
So the rows are gouped by User, count each platform/transactionType and store as key/value json string.
Ref: My previous related question
You could use GROUP BY and FOR JSON:
SELECT MIN(ID) AS ID, UserId, MIN(sub.x) AS Platforms, MIN(sub2.x) AS Transactions
FROM tab t
OUTER APPLY (SELECT CONCAT('p', platformId) AS platform, cnt
FROM (SELECT PlatformId, COUNT(*) AS cnt
FROM tab t2 WHERE t2.UserId = t.UserId
GROUP BY PlatformId) s
FOR JSON AUTO) sub(x)
OUTER APPLY (SELECT CONCAT('t', TransactiontypeId) AS Transactions, cnt
FROM (SELECT TransactiontypeId, COUNT(*) AS cnt
FROM tab t2 WHERE t2.UserId = t.UserId
GROUP BY TransactiontypeId) s
FOR JSON AUTO) sub2(x)
GROUP BY UserId;
DBFiddle Demo
Result is a bit different(array of key-value) but please treat it as starting point.
Your sample JSON is not really a json, but since you want it that way:
SELECT u.UserId, plt.pValue, ttyp.ttValue
FROM Users AS [u]
CROSS APPLY (
SELECT '{'+STUFF( (SELECT ',"'+pn.pName+'":'+LTRIM(STR(pn.pCount))
FROM (SELECT p.Name AS pName, COUNT(*) AS pCount
FROM transactions t
left JOIN Platforms p ON p.PlatformID = t.PlatformId
WHERE t.UserId = u.UserId
GROUP BY p.PlatformId, p.Name
) pn
FOR XML PATH('')),1,1,'')+'}'
) plt(pValue)
CROSS APPLY (
SELECT '{'+STUFF( (SELECT ',"'+tty.ttName+'":'+LTRIM(STR(tty.ttCount))
FROM (SELECT tt.Name AS ttName, COUNT(*) AS ttCount
FROM transactions t
left JOIN dbo.TransactionType tt ON tt.TransactionTypeId = t.TransactionTypeID
WHERE t.UserId = u.UserId
GROUP BY tt.TransactionTypeId, tt.Name
) tty
FOR XML PATH('')),1,1,'')+'}'
) ttyp(ttValue)
WHERE EXISTS (SELECT * FROM transactions t WHERE u.UserId = t.UserId)
ORDER BY UserId;
DBFiddle Sample

Using max(col) with count in sub-query SQL Server

I am putting together a query in SQL Server but having issues with the sub-query
I wish to use the max(loadid) and count the number of records the query returns.
So for example my last loadid is 400 and the amount of records with 400 is 2300, so I would my recor_count column should display 2300. I have tried various ways below but am getting errors.
select count (loadid)
from t1
where loadid = (select max(loadid) from t1) record_count;
(select top 1 LOADID, count(*)
from t1
group by loadid
order by count(*) desc) as Record_Count
Showing loadid and number of matching rows with the use of grouping, ordering by count and limiting the output to 1 row with top.
select top 1 loadid, count(*) as cnt
from t1
group by loadid
order by cnt desc
This may be easier to achieve with a window function in the inner query:
SELECT COUNT(*)
FROM (SELECT RANK() OVER (ORDER BY loadid DESC) AS rk
FROM t1) t
WHERE rk = 1
Another simplest way to achieve the result :
Set Nocount On;
Declare #Test Table
(
Id Int
)
Insert Into #Test(Id) Values
(397),(398),(399),(400)
Declare #Abc Table
(
Id Int
,Value Varchar(100)
)
INsert Into #Abc(Id,Value) Values
(398,'')
,(400,'')
,(397,'')
,(400,'')
,(400,'')
Select a.Id
,Count(a.Value) As RecordCount
From #Abc As a
Join
(
Select Max(t.Id) As Id
From #Test As t
) As v On a.Id = v.Id
Group By a.Id

Sum of missing data

The below query displays sites against the total orders within last week.
But if there is no order for a given site in last week, i should still see the site with a sum of zero.
At the moment its only giving me four sites, thats because no order has been made in the last week for those sites.
select SITE
,SUM(Case When OrderDate >= dateadd(dd,(datediff(dd,-53690,getdate()-1)/7)*7,-53690)
Then 1
Else 0
End) as COMPLETED
from
(
SELECT DISTINCT ORDERS.SITE, ORDERS.ORDERDATE FROM ORDERS
INNER JOIN PHONEDATA AS P
ON ORDERS.RECID = P.OrderID
where SITE IN ('SITE1','SITE2','SITE3','SITE4','SITE5','SITE6','SITE7')
) X
GROUP BY SITE
order by SITE
RESULT:
Site---------------------Completed
SITE1-----------------------2
SITE2-----------------------2
SITE3-----------------------2
SITE4-----------------------2
EXPECTED RESULT:
Site---------------------Completed
SITE1-----------------------2
SITE2-----------------------2
SITE3-----------------------2
SITE4-----------------------2
SITE5-----------------------0
SITE6-----------------------0
SITE7-----------------------0
updated:
select SITE
,SUM(Case When OrderDate >= dateadd(dd,(datediff(dd,-53690,getdate()-1)/7)*7,-53690)
Then 1
Else 0
End) as COMPLETED
from
(
SELECT DISTINCT ORDERS.SITE, ORDERS.ORDERDATE FROM ORDERS
where SITE IN ('SITE1','SITE2','SITE3','SITE4','SITE5','SITE6','SITE7')
) X
GROUP BY SITE
order by SITE
I have now removed the inner join with phone data table, so i am now getting the missing sites. but the reason i avoided this approach is because if i only rely on the orders table the orderdate time field is inserted few times for a given order, and the final order makes it to the phonedata table, so now i get more values in completed count but it should only consider the latest value for each day for each site
result of update :
Site---------------------Completed
SITE1-----------------------5
SITE2-----------------------5
SITE3-----------------------5
SITE4-----------------------5
SITE5-----------------------0
SITE6-----------------------0
SITE7-----------------------0
expected
Site---------------------Completed
SITE1-----------------------2
SITE2-----------------------2
SITE3-----------------------2
SITE4-----------------------2
SITE5-----------------------0
SITE6-----------------------0
SITE7-----------------------0
If there are no rows in the table with the sites that have no orders, how can it return any rows to count? Perhaps you have a table with all the possible sites that can be joined to? Or create a temp table with the site values. You could then left join the orders table to this. i.e.
create table #sites (site varchar(25));
insert into #sites values ('SITE1','SITE2','SITE3','SITE4','SITE5','SITE6','SITE7');
...
from
(
SELECT DISTINCT ORDERS.SITE, ORDERS.ORDERDATE FROM
#sites s left join ORDERS on orders.site = s.site
INNER JOIN PHONEDATA AS P
ON ORDERS.RECID = P.OrderID
) X
...
Try using a left join instead of the inner join. It is probably not getting rows from the phone data table:
select SITE
,SUM(Case When OrderDate >= dateadd(dd,(datediff(dd,-53690,getdate()-1)/7)*7,-53690)
Then 1
Else 0
End) as COMPLETED
from
(
SELECT DISTINCT ORDERS.SITE, ORDERS.ORDERDATE FROM ORDERS
Left JOIN PHONEDATA AS P
ON ORDERS.RECID = P.OrderID
where SITE IN ('SITE1','SITE2','SITE3','SITE4','SITE5','SITE6','SITE7')
) X
GROUP BY SITE
order by SITE
It'd be best to start with a "Site" table and then left join to your results. This example mimics the behavior, and can be used as a hack-workaround.
DECLARE #table TABLE
(
site VARCHAR(10) ,
Completed TINYINT
)
INSERT INTO #table
( site, Completed )
VALUES ( 'SITE1', 0 ),
( 'SITE2', 0 ),
( 'SITE3', 0 ),
( 'SITE4', 0 ),
( 'SITE5', 0 ),
( 'SITE6', 0 ),
( 'SITE7', 0 )
WITH cte
AS ( SELECT SITE ,
SUM(CASE WHEN OrderDate >= DATEADD(dd,( DATEDIFF(dd, -53690, GETDATE() - 1) / 7 ) * 7, -53690)
THEN 1
ELSE 0
END) AS COMPLETED
FROM ( SELECT DISTINCT
ORDERS.SITE ,
ORDERS.ORDERDATE
FROM ORDERS
INNER JOIN PHONEDATA AS P ON ORDERS.RECID = P.OrderID
WHERE SITE IN ( 'SITE1', 'SITE2', 'SITE3',
'SITE4', 'SITE5', 'SITE6',
'SITE7' )
)
GROUP BY SITE
)
SELECT t.site ,
t.completed + cte.COMPLETED
FROM #table t
LEFT OUTER JOIN cte ON t.site = cte.Site
ORDER BY t.site

Resources