Trying to make a slope calculation work with SQL syntaxes - netezza

I'm a relatively new SQL programmer, I'm trying to make the below code work in SQL. The code is for calculating slopes for a given set of data, following the exact same logic as EXCEL SLOPE function. The problem right now is that count are not allowed since aggregations are being nested. But if I create a subquery for counts and sums, i would have to group on x and y, otherwise I wouldn't have x and y in my outer query to calculate on.
-- Sample data: four (X, Y) points used for the slope calculation.
CREATE TABLE TEST (X FLOAT, Y FLOAT);
INSERT INTO TEST (X, Y) VALUES (1,4.10242258729964);
INSERT INTO TEST (X, Y) VALUES (2,4.57708865242591);
INSERT INTO TEST (X, Y) VALUES (3,5.16785670619896);
INSERT INTO TEST (X, Y) VALUES (4,6.88149559336059);
-- Attempted slope query. sum(x)/count(x) and sum(y)/count(y) sit inside the
-- outer sum(...) calls; this is the nested aggregation the question reports
-- as not being allowed.
select sum((x-sum(x)/count(x))^2)/sum(((x-sum(x)/count(x))*(y-sum(y)/count(y))))
from TEST

You can calculate the slope from sum(x * x) and sum(x * y) and avg(x) and avg(y) and n:
-- One-row summary of the table: the means of x and y, the raw sums of x*x and
-- x*y, and the number of non-NULL x values. These feed the slope formula.
SELECT
    AVG(x)     AS mx,
    SUM(x * x) AS sx2,
    SUM(x * y) AS sxy,
    AVG(y)     AS my,
    COUNT(x)   AS n
FROM test;
Then you can use:
-- Least-squares slope of y on x, built from a single pass of aggregates:
--   slope = (sum(x*y) - n*mean(x)*mean(y)) / (sum(x*x) - n*mean(x)^2)
SELECT
    (stats.sxy - stats.n * stats.mx * stats.my)
        -- NULLIF turns a zero denominator (all x equal) into NULL instead of
        -- a division-by-zero error.
        / NULLIF(stats.sx2 - stats.n * stats.mx * stats.mx, 0) AS slope
FROM
(
    SELECT
        AVG(x)     AS mx,
        SUM(x * x) AS sx2,
        SUM(x * y) AS sxy,
        AVG(y)     AS my,
        COUNT(x)   AS n
    FROM test
) AS stats;  -- many engines reject a derived table that has no alias

Here a working version of the SQL you created:
-- Same ratio as the question's query (squared x deviations over the x*y
-- deviation products), with the averages computed once in a one-row derived
-- table and attached to every row of TEST.
SELECT
    SUM((t.x - average.avgx) * (t.x - average.avgx))
        / SUM((t.x - average.avgx) * (t.y - average.avgy))
FROM TEST AS t
CROSS JOIN
(
    SELECT
        SUM(X) / COUNT(X) AS avgx,
        SUM(Y) / COUNT(Y) AS avgy
    FROM TEST
) AS average;
I looked up the excel slope function and it is defined a bit differently:
-- Slope as the answer describes Excel's SLOPE: the sum of (x - mean x)*(y - mean y)
-- divided by the sum of (x - mean x)^2.
SELECT
    SUM((pts.x - means.avgx) * (pts.y - means.avgy))
        / SUM((pts.x - means.avgx) * (pts.x - means.avgx))
FROM TEST AS pts
CROSS JOIN
(
    -- One row holding the two column averages.
    SELECT
        SUM(X) / COUNT(X) AS avgx,
        SUM(Y) / COUNT(Y) AS avgy
    FROM TEST
) AS means;
Hope this is what you needed :)

I usually do something along these lines (I kept the syntax as simple as possible to avoid anything that might be problematic, like CTEs):
-- Least-squares slope of y on x from running sums, computed in one scan:
--   slope = (N*SUM_XY - SUM_X*SUM_Y) / (N*SUM_X2 - SUM_X*SUM_X)
CREATE TABLE #test (x FLOAT, y FLOAT);
INSERT INTO #test (x, y) VALUES (1, 4.10242258729964);
INSERT INTO #test (x, y) VALUES (2, 4.57708865242591);
INSERT INTO #test (x, y) VALUES (3, 5.16785670619896);
INSERT INTO #test (x, y) VALUES (4, 6.88149559336059);
SELECT
    (a.N * a.SUM_XY - a.SUM_X * a.SUM_Y)
        -- A zero denominator (fewer than two distinct x values) yields NULL
        -- rather than a divide-by-zero error.
        / NULLIF(a.N * a.SUM_X2 - a.SUM_X * a.SUM_X, 0) AS slope
FROM
(
    SELECT
        COUNT(*)   AS N,
        SUM(x)     AS SUM_X,
        SUM(x * x) AS SUM_X2,
        SUM(y)     AS SUM_Y,
        SUM(x * y) AS SUM_XY
    FROM #test
) AS a;
Just ran this, then noticed you had another answer from "SQL Hacks". I ran both versions and they get the exact same answer, but the other version is shorter :D

Related

How to use recursive CTE to add resolution to a data set

I'm attempting to create a recursive CTE statement that adds blank rows in between data points; these rows will later be filled in by interpolation. I'm a beginner with SQL and this is my first time using CTEs, and I am having some difficulty finding the proper way to do this.
I've attempted a few slight variations on the code provided below after some research, but I haven't gained a good enough understanding to see my issue yet. The following code should simulate sparse sampling by taking an observation every 4 hours from the sample data set, and the second portion should add rows with their respective x values every 0.1 of an hour, which will later be filled with interpolated values derived from a cubic spline.
--Sample Data
-- One temperature reading per hour for hours 0 through 47 (48 rows).
-- hour is declared as integer; temperature as double precision.
create table #temperatures (hour integer, temperature double precision);
insert into #temperatures (hour, temperature) values
(0,18.5),
(1,16.9),
(2,15.3),
(3,14.1),
(4,13.8),
(5,14.7),
(6,14.7),
(7,13.5),
(8,12.2),
(9,11.4),
(10,10.9),
(11,10.5),
(12,12.3),
(13,16.4),
(14,22.3),
(15,27.2),
(16,31.1),
(17,34),
(18,35.6),
(19,33.1),
(20,25.1),
(21,21.3),
(22,22.3),
(23,20.3),
(24,18.4),
(25,16.8),
(26,15.6),
(27,15.4),
(28,14.7),
(29,14.1),
(30,14.2),
(31,14),
(32,13.9),
(33,13.9),
(34,13.6),
(35,13.1),
(36,15),
(37,18.2),
(38,21.8),
(39,24.1),
(40,25.7),
(41,29.9),
(42,28.9),
(43,31.7),
(44,29.4),
(45,30.7),
(46,29.9),
(47,27);
--1
-- Sparse sample: keep the readings whose hour is a multiple of 4 (at most 12
-- rows) and store them in #xy as (x, y).
SELECT TOP 12
    CAST(hour AS double precision) AS x,
    temperature AS y
INTO #xy
FROM #temperatures
WHERE CAST(hour AS integer) % 4 = 0;

-- Copy just the x values of the sample into #x_series under the name x_input.
SELECT [x] AS [x_input]
INTO #x_series
FROM #xy;
--2
-- Attempt at a 0.1-step x series built recursively from #xy. The question
-- reports that this part produces a syntax error.
-- NOTE(review): a comma directly follows "with recursive", before the first
-- CTE name; this looks like one source of the error. Also confirm that the
-- target engine accepts the "recursive" keyword at all.
with recursive
, x_series(input_x) as (
select
min(x)
from
#xy
union all
select
input_x + 0.1
from
x_series
where
-- NOTE(review): nothing named "x" is defined as a table in this script; the
-- series is seeded from #xy, so #xy is presumably what was intended here.
input_x + 0.1 < (select max(x) from x)
)
, x_coordinate as (
select
input_x
, max(x) over(order by input_x) as previous_x
from
x_series
left join
-- NOTE(review): the joined table is #xy, but the predicate qualifies the
-- column as xy.x.
#xy on abs(x_series.input_x - xy.x) < 0.001
)
-- NOTE(review): no statement follows the CTE list; a WITH clause normally
-- needs a SELECT (or other statement) that consumes it.
The first CTE works as expected and produces a list of 12 rows (a sample every 4 hours for two days), but the second produces a syntax error. The expected output would be something like
(4,13.8), (4.1,null/0), (4.2,null/0),....., (8,12.2)
I dont think you need recursive.
What about this:
SQL DEMO
-- Build a 0.1-step grid from 4.0 to 48.0 out of the integers 40..480 and
-- attach the sparse sample (#xy) wherever a grid value matches a sampled x.
SELECT
    step.number * 1.0 / 10 AS n,
    s.x,
    s.y
FROM master..spt_values AS step
LEFT JOIN #xy AS s
    ON step.number * 1.0 / 10 = s.x
-- spt_values repeats numbers across its different "type" groups; the 'P'
-- rows list each integer once, so DISTINCT is no longer needed to hide
-- duplicates.
WHERE step.type = 'P'
  AND step.number BETWEEN 40 AND 480
-- Interpolation needs the grid in order; without ORDER BY the row order is
-- not guaranteed.
ORDER BY n;
This 480 is based on the two days you mention.
OUTPUT
You don't even need the temporary table:
-- Same 0.1-step grid (4.0 to 48.0), joined straight to #temperatures. Only
-- readings whose hour is a multiple of 4 are attached; every other grid row
-- gets a NULL temperature.
SELECT
    step.number * 1.0 / 10 AS n,
    t.temperature
FROM master..spt_values AS step
LEFT JOIN #temperatures AS t
    ON step.number * 1.0 / 10 = t.hour
   AND t.hour % 4 = 0
-- Restrict to the 'P' rows, which list each integer once; this replaces the
-- DISTINCT that was hiding duplicate numbers from other type groups.
WHERE step.type = 'P'
  AND step.number BETWEEN 40 AND 480
ORDER BY n;
I don't think you need a recursive CTE here. I think a solution like this would be a better approach. Modify accordingly.
-- Insert a NULL-temperature placeholder row for every 0.1-hour step up to the
-- last recorded hour, skipping steps that already have a row.
-- NOTE: hour must be a fractional type (e.g. FLOAT) for the 0.1 steps to be
-- stored as such; the sample table declares it as integer.
DECLARE @max_value FLOAT =
    (SELECT MAX(hour) FROM #temperatures) * 10;  -- number of 0.1 steps to generate

INSERT INTO #temperatures (hour, temperature)
SELECT X.N / 10, NULL
FROM (
    -- Row numbers 1, 2, 3, ... taken from a cross join that is merely "large enough".
    SELECT CAST(ROW_NUMBER() OVER (ORDER BY t1.number) AS FLOAT) AS N
    FROM master..spt_values AS t1
    CROSS JOIN master..spt_values AS t2
) AS X
WHERE X.N <= @max_value
  -- Compare the generated hour (N / 10), not the step index N, with the
  -- existing hours. NOT EXISTS also stays correct if hour ever contains NULL.
  AND NOT EXISTS (
        SELECT 1
        FROM #temperatures AS t
        WHERE t.hour = X.N / 10
      );
Use the temp table #xy produced in --1 you have, the following will give you a x series:
-- Generate x values starting at MIN(x) of #xy and increasing by 0.1, stopping
-- before MAX(x) is reached.
;WITH x_series (input_x)
AS
(
    -- Anchor: the smallest sampled x.
    SELECT MIN(x) AS input_x
    FROM #xy
    UNION ALL
    -- Recursive step: advance by 0.1 while still below the largest sampled x.
    SELECT input_x + 0.1
    FROM x_series
    WHERE input_x + 0.1 < (SELECT MAX(x) FROM #xy)
)
SELECT input_x
FROM x_series
-- SQL Server aborts a recursive CTE after 100 levels by default, and the
-- sample range (0 to 44 in 0.1 steps) needs roughly 440. The WHERE clause
-- above guarantees termination, so the limit can be lifted safely.
OPTION (MAXRECURSION 0);

Consuming values from 2 columns

I am trying to figure out how to make a sort of "consumption" query where an INT value column (X) is subtracted from another INT column (Y) until it reaches 0, then stop. The column DesiredResult and DesiredResultExplanation are here only for reference to the math being performed. This takes place in DESC date order (future consuming back to the present)
My initial approach was to use window functionality, but the problem is once the value (Y) reaches 0, it needs to stop performing a running total. Had similar issues using a CTE as well.
If changing the table structure will help at all, this can be done.
Version: SQL Server 2014 or higher
Thanks!
-- Sample data for the "consumption" problem. DesiredResult and
-- DesiredResultExplanation exist only to show the expected arithmetic.
-- A temp table is used because "DECLARE #test TABLE" is not valid T-SQL
-- (table variables must start with @); the #test name is kept as is.
CREATE TABLE #test
(
    ID INT IDENTITY (1,1)
    ,PeriodDate DATE
    ,X INT
    ,Y INT
    ,DesiredResult INT
    ,DesiredResultExplanation VARCHAR(100)
);
INSERT INTO #test (PeriodDate, X, Y, DesiredResult, DesiredResultExplanation)
VALUES ('2017-05-01', 100, 0, 100, 'Nothing left to subtract. Value is unchanged');
INSERT INTO #test (PeriodDate, X, Y, DesiredResult, DesiredResultExplanation)
VALUES ('2017-05-08', 200, 0, 200, 'Nothing left to subtract. Value is unchanged');
INSERT INTO #test (PeriodDate, X, Y, DesiredResult, DesiredResultExplanation)
VALUES ('2017-05-15', 300, 0, 100, '300 - 200 = 100 (Orig -1100 has been consumed)');
INSERT INTO #test (PeriodDate, X, Y, DesiredResult, DesiredResultExplanation)
VALUES ('2017-05-22', 400, 0, -200, '400 - 600 = -200 ');
INSERT INTO #test (PeriodDate, X, Y, DesiredResult, DesiredResultExplanation)
VALUES ('2017-05-29', 500, -1100, -600, '500 - 1100 = -600');
SELECT *
FROM #test
ORDER BY PeriodDate DESC;
DEMO
-- Consume startY against X, walking from the latest PeriodDate backwards.
-- Rows reached while startY is not yet used up show the running total minus
-- startY; rows after that keep their own X.
WITH cte AS (
    SELECT *,
           -- Running total of X from the newest period to the current one.
           SUM(X) OVER (ORDER BY PeriodDate DESC) AS accumulated
    FROM #test
), parameter AS (
    -- Amount to consume.
    SELECT 1100 AS startY
)
SELECT *,
       CASE
           -- Still consuming: the running total has not exceeded startY.
           WHEN accumulated <= startY
               THEN accumulated - startY
           -- The row where startY runs out: the total before this row was
           -- still below startY. The default of 0 covers the first row, where
           -- LAG would otherwise be NULL and a first-row X larger than startY
           -- would be returned unconsumed.
           WHEN LAG(accumulated, 1, 0) OVER (ORDER BY PeriodDate DESC) < startY
               THEN accumulated - startY
           ELSE X
       END AS newDesire
FROM cte
CROSS JOIN parameter
ORDER BY PeriodDate DESC;
OUTPUT
EDIT: You can change the LAG condition with
WHEN accumulated - X < startY

Should I use a cursor for this?

I have a table with three fields. Group number, X-coord and Y-coord. There can be from 0 to about 10 rows within each group number.
What I want to do is calculate the maximum and minimum distance between points within each group. Obviously, this will only give you a value if there are 2 or more rows within that group.
Output should consist of fields: group number, minDistance, maxDistance.
Is a cursor a good solution for this?
(Coordinates are in WGS84 and I have a working formula for calculating distances)
My reasoning for using a cursor is that I cannot avoid doing a cross join for each group and then applying the formula for each result of the cross join.
I wouldn't use a cursor in your situation; I would instead use an inline table-valued user-defined function that takes the required group number as an argument and calculates the minimum and maximum distance for that group inside the function.
Please note the calculation algorithm inside the function is much simpler than what you may have.
-- Demo table: points (X, Y) tagged with a group number.
CREATE TABLE dist (groupId INT, X INT, Y INT);
INSERT INTO dist (groupId, X, Y) VALUES (1,14,20),(1,11,20),(1,10,22),(1,12,24),(1,11,28),(1,19,78);
INSERT INTO dist (groupId, X, Y) VALUES (2,10,20),(2,11,20),(2,10,22),(2,12,24),(2,11,28),(2,17,52);
GO  -- client batch separator: CREATE FUNCTION must be the first statement in its batch

-- Returns one row with the smallest and largest straight-line distance
-- between two different points of the given group.
--   @groupId : group whose points are compared
-- Both columns are NULL when the group has fewer than two points.
CREATE FUNCTION dbo.getMinMaxDistanceForGroup (@groupId INT)
RETURNS TABLE AS RETURN
    WITH pts AS (
        -- Number the group's points so each unordered pair is visited once
        -- and a point is never paired with itself (which would force
        -- MinDistance to 0).
        SELECT X, Y, ROW_NUMBER() OVER (ORDER BY X, Y) AS rn
        FROM dist
        WHERE groupId = @groupId
    )
    SELECT
        MIN(SQRT(SQUARE(b.X - a.X) + SQUARE(b.Y - a.Y))) AS MinDistance,
        MAX(SQRT(SQUARE(b.X - a.X) + SQUARE(b.Y - a.Y))) AS MaxDistance
    FROM pts AS a
    INNER JOIN pts AS b
        ON a.rn < b.rn;
GO

-- One output row per group number.
SELECT g.groupId, d.MinDistance, d.MaxDistance
FROM (SELECT DISTINCT groupId FROM dist) AS g
OUTER APPLY dbo.getMinMaxDistanceForGroup(g.groupId) AS d;

Creating test data for calculation using RAND()

I attempted to populate a table with two columns of random FLOATs, but every row generated was identical.
-- Attempt to generate 5,000,000 rows of random (x, y) pairs with a recursive CTE.
;WITH CTE (x, y) AS (
-- Anchor member: a single row holding two RAND() values.
SELECT RAND(), RAND()
UNION ALL
-- Recursive member: re-selects the previous row's x and y unchanged, so every
-- generated row repeats the anchor's values.
SELECT x, y FROM CTE
)
--INSERT INTO CalculationTestData (x, y)
-- The recursion has no stopping condition of its own; TOP bounds how many
-- rows are taken from it.
SELECT TOP 5000000 x, y
FROM CTE
OPTION (MAXRECURSION 0)
I can accomplish what I need just fine by not using the CTE, but this has piqued my curiosity.
Is there a way to do this quickly?
I know quickly is a relative term, by it, I mean approximately how quickly it would take to execute the above.
What do you expect the CTE to do other than repeat the rows? Your recursion is just selecting them again:
SELECT RAND(), RAND() -- SELECT 9 , 10
UNION ALL
SELECT x, y -- SELECT 9 , 10
what you want to do is more like this
SELECT RAND(), RAND()
UNION ALL
SELECT RAND(), RAND() -- but the problem is that this 'row' will be duplicated
so you need to seed and reseed for each row giving you something like
SELECT RAND(CAST(NEWID() AS VARBINARY)),
RAND(CAST(NEWID() AS VARBINARY))
UNION ALL
SELECT RAND(CAST(NEWID() AS VARBINARY)),
RAND(CAST(NEWID() AS VARBINARY))
using NEWID() as the seed is one way there may well be others that are more efficient etc
Try this instead of rand(): it will give a random positive whole number on each entry. I had the same issue with rand() recently
ABS(Checksum(NewID()))
Float:
cast(ABS(Checksum(NewID()) ) as float)
To be Clear:
-- Recursive CTE producing a fresh pair of random non-negative values per row.
;WITH CTE (x, y) AS (
    -- Anchor member. The cast to float happens before ABS so that the
    -- smallest possible CHECKSUM value cannot overflow int.
    SELECT ABS(CAST(CHECKSUM(NEWID()) AS float)),
           ABS(CAST(CHECKSUM(NEWID()) AS float))
    UNION ALL
    -- Recursive member: generate new values here. Selecting x, y from CTE
    -- would only copy the anchor row again and again.
    SELECT ABS(CAST(CHECKSUM(NEWID()) AS float)),
           ABS(CAST(CHECKSUM(NEWID()) AS float))
    FROM CTE
)
Did not give a random entry on each line?

Sql query using database function

i have DataBase function that calculate distance by coordinates
-- distance(lat1, lon1, lat2, lon2): approximate distance between two
-- coordinate pairs, computed as the length of the hypotenuse of the scaled
-- latitude and longitude differences.
-- NOTE(review): 69.1 presumably converts degrees to miles and 57.3 degrees to
-- radians - confirm the intended units.
CREATE OR REPLACE FUNCTION distance(lat1 FLOAT, lon1 FLOAT, lat2 FLOAT, lon2 FLOAT) RETURNS FLOAT AS $$
DECLARE
    -- Latitude difference, scaled.
    north_south float := 69.1 * (lat2 - lat1);
    -- Longitude difference, scaled and shrunk by the cosine of the starting latitude.
    east_west float := 69.1 * (lon2 - lon1) * cos(lat1 / 57.3);
BEGIN
    RETURN sqrt(north_south * north_south + east_west * east_west);
END;
$$ LANGUAGE plpgsql;
now i have 2 tables (with one to one relation) one called
person with columns --> Personid, name, lastname
and one
location with columns --> Personid,latitude,longtitude.
Now I am trying to get the persons whose distance is less than 5, using the distance function.
i start with this following query to get the id first
-- Locations whose distance from the reference point (123, 456) is below 0.5.
SELECT
    loc.id,
    loc.latitude,
    loc.longitude
FROM location AS loc
WHERE distance(123, 456, loc.latitude, loc.longitude) < 0.5;
but don't know how to get the all persons with the id from the above query.
how can i do that?
thanks in advance.
Is this what your are looking for?
-- Every person joined to their location row, limited to locations whose
-- distance from the reference point (123, 456) is below 0.5.
SELECT
    p.*,
    loc.id,
    loc.latitude,
    loc.longitude
FROM location AS loc
INNER JOIN person AS p
    ON p.Personid = loc.Personid
WHERE distance(123, 456, loc.latitude, loc.longitude) < 0.5;

Resources