!!!UPDATE!!! Using the vertex shader to generate quads via DrawInstanced() calls definitely reduced CPU overhead and increased quads drawn per second. But there was much more performance to be found by using a combination of instanced drawing via a vertex shader that generates a point list, and a geometry shader that generates quads based on those points.
Thanks to #Soonts for not only recommending a faster way, but also for reminding me of conditional moves and unrolling loops.
Here is the geometry shader I created for sprites with 2D rotation:
cbuffer CB_PROJ {
matrix camera;
};
/* Reduced packet size -- 256x256 max atlas segments
-------------------
FLOAT3 Sprite location // 12 bytes
FLOAT Rotation // 16 bytes
FLOAT2 Scale // 24 bytes
UINT // 28 bytes
Fixed8p00 Texture X segment
Fixed8p00 Texture X total segments
Fixed8p00 Texture Y segment
Fixed8p00 Texture Y total segments
.Following vertex data is only processed by the vertex shader.
UINT // 32 bytes
Fixed3p00 Squadron generation method
Fixed7p00 Sprite stride
Fixed8p14 X/Y distance between sprites
*/
struct VOut {
float3 position : POSITION;
float3 r_s : NORMAL;
uint bits : BLENDINDICES;
};
struct GOut {
float4 pos : SV_Position;
float3 position : POSITION;
float3 n : NORMAL;
float2 tex : TEXCOORD;
uint pID : SV_PrimitiveID;
};
[maxvertexcount(4)]
void main(point VOut gin[1], uint pID : SV_PrimitiveID, inout TriangleStream<GOut> triStream) {
GOut output;
const uint bits = gin[0].bits;
const uint ySegs = (bits & 0x0FF000000) >> 24u;
const uint _yOS = (bits & 0x000FF0000) >> 16u;
const float yOS = 1.0f - float(_yOS) / float(ySegs);
const float yOSd = rcp(float(ySegs));
const uint xSegs = (bits & 0x00000FF00) >> 8u;
const uint _xOS = (bits & 0x0000000FF);
const float xOS = float(_xOS) / float(xSegs);
const float xOSd = rcp(float(xSegs));
float2 v;
output.pID = pID;
output.n = float3( 0.0f, 0.0f, -1.0f );
output.position = gin[0].position; // Translate
v.x = -gin[0].r_s.y; v.y = -gin[0].r_s.z; // Scale
output.tex = float2(xOS, yOS);
output.position.x += v.x * cos(gin[0].r_s.x) - v.y * sin(gin[0].r_s.x); // Rotate
output.position.y += v.x * sin(gin[0].r_s.x) + v.y * cos(gin[0].r_s.x);
output.pos = mul(float4(output.position, 1.0f), camera); // Transform
triStream.Append(output);
output.position = gin[0].position;
v.x = -gin[0].r_s.y; v.y = gin[0].r_s.z;
output.tex = float2(xOS, yOS - yOSd);
output.position.x += v.x * cos(gin[0].r_s.x) - v.y * sin(gin[0].r_s.x);
output.position.y += v.x * sin(gin[0].r_s.x) + v.y * cos(gin[0].r_s.x);
output.pos = mul(float4(output.position, 1.0f), camera);
triStream.Append(output);
output.position = gin[0].position;
v.x = gin[0].r_s.y; v.y = -gin[0].r_s.z;
output.tex = float2(xOS + xOSd, yOS);
output.position.x += v.x * cos(gin[0].r_s.x) - v.y * sin(gin[0].r_s.x);
output.position.y += v.y * sin(gin[0].r_s.x) + v.y * cos(gin[0].r_s.x);
output.pos = mul(float4(output.position, 1.0f), camera);
triStream.Append(output);
output.position = gin[0].position;
v.x = gin[0].r_s.y; v.y = gin[0].r_s.z;
output.tex = float2(xOS + xOSd, yOS - yOSd);
output.position.x += v.x * cos(gin[0].r_s.x) - v.y * sin(gin[0].r_s.x);
output.position.y += v.y * sin(gin[0].r_s.x) + v.y * cos(gin[0].r_s.x);
output.pos = mul(float4(output.position, 1.0f), camera);
triStream.Append(output);
}
!!!ORIGINAL TEXT!!!
Last time I was coding, I had barely started learning Direct3D9c. Currently I'm hitting about 30K single-texture quads lit with 15 lights at about 450fps. I haven't learned instancing or geometry shading at all yet, and I'm trying to prioritise the order I learn things in for my needs, so I've only taken glances at them.
My first thought was to reduce the amount of vertex data being shunted to the GPU, so I changed the vertex structure to a FLOAT2 (for texture coords) and an UINT (for indexing), relying on 4x float3 constants in the vertex shader to define the corners of the quads.
I figured I could reduce the size of the vertex data further, and reduced each vertex unit to a single UINT containing a 2bit index (to reference the real vertexes of the quad), and 2x 15bit fixed-point numbers (yes, I'm showing my age but fixed-point still has it's value) representing offsets into atlas textures.
So far, so good, but I know bugger all about Direct3D11 and HLSL so I've been wondering if there's a faster way.
Here's the current state of my vertex shader:
cbuffer CB_PROJ
{
matrix model;
matrix modelViewProj;
};
struct VOut
{
float3 position : POSITION;
float3 n : NORMAL;
float2 texcoord : TEXCOORD;
float4 pos : SV_Position;
};
static const float3 position[4] = { -0.5f, 0.0f,-0.5f,-0.5f, 0.0f, 0.5f, 0.5f, 0.0f,-0.5f, 0.5f, 0.0f, 0.5f };
// Index bitpattern: YYYYYYYYYYYYYYYXXXXXXXXXXXXXXXVV
//
// 00-01 . uint2b == Vertex index (0-3)
// 02-17 . fixed1p14 == X offset into atlas texture(s)
// 18-31 . fixed1p14 == Y offset into atlas texture(s)
//
VOut main(uint bitField : BLENDINDICES) {
VOut output;
const uint i = bitField & 0x03u;
const uint xStep = (bitField >> 2) & 0x7FFFu;
const uint yStep = (bitField >> 17);
const float xDelta = float(xStep) * 0.00006103515625f;
const float yDelta = float(yStep) * 0.00006103515625f;
const float2 texCoord = float2(xDelta, yDelta);
output.position = (float3) mul(float4(position[i], 1.0f), model);
output.n = mul(float3(0.0f, 1.0f, 0.0f), (float3x3) model);
output.texcoord = texCoord;
output.pos = mul(float4(output.position, 1.0f), modelViewProj);
return output;
}
My pixel shader for completeness:
Texture2D Texture : register(t0);
SamplerState Sampler : register(s0);
struct LIGHT {
float4 lightPos; // .w == range
float4 lightCol; // .a == flags
};
cbuffer cbLight {
LIGHT l[16] : register(b0); // 256 bytes
}
static const float3 ambient = { 0.15f, 0.15f, 0.15f };
float4 main(float3 position : POSITION, float3 n : NORMAL, float2 TexCoord : TEXCOORD) : SV_Target
{
const float4 Texel = Texture.Sample(Sampler, TexCoord);
if (Texel.a < 0.707106f) discard; // My source images have their alpha values inverted.
float3 result = { 0.0f, 0.0f, 0.0f };
for (uint xx = 0 ; xx < 16 && l[xx].lightCol.a != 0xFFFFFFFF; xx++)
{
const float3 lCol = l[xx].lightCol.rgb;
const float range = l[xx].lightPos.w;
const float3 vToL = l[xx].lightPos.xyz - position;
const float distToL = length(vToL);
if (distToL < range * 2.0f)
{
const float att = min(1.0f, (distToL / range + distToL / (range * range)) * 0.5f);
const float3 lum = Texel.rgb * saturate(dot(vToL / distToL, n)) * lCol;
result += lum * (1.0f - att);
}
}
return float4(ambient * Texel.rgb + result, Texel.a);
}
And the rather busy looking C function to generate the vertex data (all non-relevant functions removed):
al16 struct CLASS_PRIMITIVES {
ID3D11Buffer* pVB = { NULL, NULL }, * pIB = { NULL, NULL };
const UINT strideV1 = sizeof(VERTEX1);
void CreateQuadSet1(ui32 xSegs, ui32 ySegs) {
al16 VERTEX1* vBuf;
al16 D3D11_BUFFER_DESC bd = {};
D3D11_SUBRESOURCE_DATA srd = {};
ui32 index = 0, totalVerts = xSegs * ySegs * 4;
if (pVB) return;
vBuf = (VERTEX1*)_aligned_malloc(strideV1 * totalVerts, 16);
for (ui32 yy = ySegs; yy; yy--)
for (ui32 xx = 0; xx < xSegs; xx++) {
double dyStep2 = 16384.0 / double(ySegs); double dyStep1 = dyStep2 * double(yy); dyStep2 *= double(yy - 1);
ui32 yStep1 = dyStep1;
yStep1 <<= 17;
ui32 yStep2 = dyStep2;
yStep2 <<= 17;
vBuf[index].b = 0 + (ui32(double(16384.0 / double(xSegs) * double(xx))) << 2) + yStep1;
index++;
vBuf[index].b = 1 + (ui32(double(16384.0 / double(xSegs) * double(xx))) << 2) + yStep2;
index++;
vBuf[index].b = 2 + (ui32(double(16384.0 / double(xSegs) * double(xx + 1))) << 2) + yStep1;
index++;
vBuf[index].b = 3 + (ui32(double(16384.0 / double(xSegs) * double(xx + 1))) << 2) + yStep2;
index++;
}
bd.Usage = D3D11_USAGE_IMMUTABLE;
bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
bd.CPUAccessFlags = 0;
bd.ByteWidth = strideV1 * totalVerts;
bd.StructureByteStride = strideV1;
srd.pSysMem = vBuf;
hr = dev->CreateBuffer(&bd, &srd, &pVB);
if (hr != S_OK) ThrowError();
_aligned_free(vBuf);
};
void DrawQuadFromSet1(ui32 offset) {
offset *= sizeof(VERTEX1) * 4;
devcon->IASetVertexBuffers(0, 1, &pVB, &strideV1, &offset);
devcon->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
devcon->Draw(4, 0);
};
void DestroyQuadSet() {
if (pVB) pVB->Release();
};
It's all functioning as it should, but it just seems like I'm resorting to hacks to achieve my goal. Surely there's a faster way? Using DrawIndexed() consistently dropped the frame-rate by 1% so I switched back to non-indexed Draw calls.
reducing vertex data down to 32bits per vertex is as far as the GPU will allow
You seem to think that vertex buffer sizes are what's holding you back. Make no mistake here, they are not. You have many gigs of VRAM to work with, use them if it will make your code faster. Specifically, anything you're unpacking in your shaders that could otherwise be stored explicitly in your vertex buffer should probably be stored in your vertex buffer.
I am wondering if anyone has experience with using geometry shaders to auto-generate quads
I'll stop you right there, geometry shaders are very inefficient in most driver implementations, even today. They just aren't used that much so nobody bothered to optimize them.
One quick thing that jumps at me is that you're allocating and freeing your system-side vertex array every frame. Building it is fine, but cache the array, C memory allocation is about as slow as anything is going to get. A quick profiling should have shown you that.
Your next biggest problem is that you have a lot of branching in your pixel shader. Use standard functions (like clamp or mix) or blending to let the math cancel out instead of checking for ranges or fully transparent values. Branching will absolutely kill performance.
And lastly, make sure you have the correct hints and usage on your buffers. You don't show them, but they should be set to whatever the equivalent of GL_STREAM_DRAW is, and you need to ensure you don't corrupt the in-flight parts of your vertex buffer. Future frames will render at the same time as the current one as long as you don't invalidate their data by overwriting their vertex buffer, so instead use a round-robin scheme to allow as many vertices as possible to survive (again, use memory for performance). Personally I allocate a very large vertex buffer (5x the data a frame needs) and write it sequentially until I reach the end, at which point I orphan the whole thing and re-allocate it and start from the beginning again.
I think your code is CPU bound. While your approach has very small vertices, you have non-trivial API overhead.
A better approach is rendering all quads with a single draw call. I would probably use instancing for that.
Assuming you want arbitrary per-quad size, position, and orientation in 3D space, here’s one possible approach. Untested.
Vertex buffer elements:
struct sInstanceData
{
// Center of the quad in 3D space
XMFLOAT3 center;
// XY coordinates of the sprite in the atlas
uint16_t spriteX, spriteY;
// Local XY vectors of the quad in 3D space
// length of the vectors = half width/height of the quad
XMFLOAT3 plusX, plusY;
};
Input layout:
D3D11_INPUT_ELEMENT_DESC desc[ 4 ];
desc[ 0 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadCenter", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 1 ] = D3D11_INPUT_ELEMENT_DESC{ "SpriteIndex", 0, DXGI_FORMAT_R16G16_UINT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 2 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusX", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 3 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusY", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
Vertex shader:
cbuffer Constants
{
matrix viewProj;
// Pass [ 1.0 / xSegs, 1.0 / ySegs ] in that field
float2 texcoordMul;
};
struct VOut
{
float3 position : POSITION;
float3 n : NORMAL;
float2 texcoord : TEXCOORD;
float4 pos : SV_Position;
};
VOut main( uint index: SV_VertexID,
float3 center : QuadCenter, uint2 texcoords : SpriteIndex,
float3 plusX : QuadPlusX, float3 plusY : QuadPlusY )
{
VOut result;
float3 pos = center;
int2 uv = ( int2 )texcoords;
// No branches are generated in release builds;
// only conditional moves are there
if( index & 1 )
{
pos += plusX;
uv.x++;
}
else
pos -= plusX;
if( index & 2 )
{
pos += plusY;
uv.y++;
}
else
pos -= plusY;
result.position = pos;
result.n = normalize( cross( plusX, plusY ) );
result.texcoord = ( ( float2 )uv ) * texcoordMul;
result.pos = mul( float4( pos, 1.0f ), viewProj );
return result;
}
Rendering:
UINT stride = sizeof( sInstanceData );
UINT off = 0;
context->IASetVertexBuffers( 0, 1, &vb, &stride, &off );
context->IASetPrimitiveTopology( D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP );
context->DrawInstanced( 4, countQuads, 0, 0 );
I've been trying to add this post-processing (taken from sebastian lague video which I am trying to convert from unity to threejs) effect that when a ray hits the ocean on my mesh (the blue):
it is colored white (just like in his video):
and everywhere else the original color is returned. But for the life of me can't seem to figure out the problem, I assume my ray origin or direction might be wrong but nothing seems to work, Here's the code that I pass to the ray Sphere intersection function and the function itself.
vec2 raySphere(vec3 centre, float radius, vec3 rayOrigin, vec3 rayDir) {
vec3 offset = rayOrigin - centre;
float a = 1.0; // set to dot(rayDir, rayDir) instead of rayDir may not be normalized
float b = 2.0 * dot(offset, rayDir);
float c = dot(offset, offset) - radius * radius;
float discriminant = b*b-4.0*a*c;
// No intersection: discriminant < 0
// 1 intersection: discriminant == 0
// 2 intersection: discriminant > 0
if(discriminant > 0.0) {
float s = sqrt(discriminant);
float dstToSphereNear = max(0.0, (-b - s) / (2.0 * a));
float dstToSphereFar = (-b + s) / (2.0 * a);
if (dstToSphereFar >= 0.0) {
return vec2(dstToSphereNear, dstToSphereFar-dstToSphereNear);
}
}
return vec2(99999999, 0.0);
}
vec4 ro = inverse(modelMatrix) * vec4(cameraPosition, 1.0);
vec3 rd = normalize(position - ro.xyz);
vec3 oceanCentre = vec3(0.0, 0.0, 0.0);
float oceanRadius = 32.0;
vec2 hitInfo = raySphere(oceanCentre, oceanRadius, ro.xyz, rd);
float dstToOcean = hitInfo.x;
float dstThroughOcean = hitInfo.y;
vec3 rayOceanIntersectPos = ro.xyz + rd * dstToOcean - oceanCentre;
// dst that view ray travels through ocean (before hitting terrain / exiting ocean)
float oceanViewDepth = min(dstThroughOcean, depth - dstToOcean);
vec4 oceanCol;
float alpha;
if(oceanViewDepth > 0.0) {
gl_FragColor = vec4(vec3(1.0), .1);
}
gl_FragColor = texture2D(tDiffuse, vUv);
Can someone help point out where I might be messing up?
Oh wow, we're in the same place while we're stuck at making these shaders. I checked your ray intersectors have small problems. But here is the cases:
What we want if case 3 happens like on your example, so the intersection are in count the problem probably come from no depth correction by doing this:
Make sure your sphere intersection max depth same as the camera.
I do suspect if the last line is the problem, try do this:
vec3 col; // Declare the color
vec2 o = sphere(ro, rd, vec3(0), 1.0); // Ocean Depth.
float oceanViewDepth = min(o.y - o.x, t - o.x);
if(depth > 0.0 && tmax > depth) {
col = originalCol;
}
if(oceanViewDepth > 0.0) {
col = vec3(1);
}
gl_FragColor = vec4(col, 1.0);
If that doesn't work for you I have some finished example for you to checkout at shadertoy
I am currently working on a basic raytracing program using C, and i have managed to so some simple shapes ex, sphere/box/plane/cone/..., and i also did some shading to them using phong illumination.
But my question is that i can get a hang of how i can ray trace a Hemisphere , like is there a set equation that define the Hemisphere if so enlighten me on it because i couldn't find any , or is there a set method to do it that i couldn't figure out.
I have also tried to tried to cut the sphere with a plane and only show the only the top half but it didn't work (I am still new to all this so my understanding may be wrong).
Edit: Ok, I am sorry because i am really new to all this but here is what i have tryied.
#include "raytacing.h"
t_env *init_sphere(t_env *e)
{
//sphere position and radius
e->sph.posi.x = 0;
e->sph.posi.y = 0;
e->sph.posi.z = -1;
e->sph.rad = 0;
e->sph.color = (t_color){255, 255, 128);
return (e);
}
t_env *init_plane(t_env *e)
{
//plane position
e->plane.posi.x = 0;
e->olane.posi.y = -0.5;
e->plane.posi.z = 0;
//plane normal
e->plane.norm.x = 0;
e->olane.norm.y = 1;
e->plane.norm.z = 0;
e->plane.color = (t_color){0, 255, 0);
return (e);
}
double inter_plane(t_env *e, double *t) //calculating plane intersection
{
t_vect dist;
double norm;
norm = dot(e->plane.normal, e->r.direction);
if (fabs(norm) > 1e-6)
{
dist = vect_sub(e->plane.posi, e->r.start);
e->t0 = dot(dist, e->plane.normal) / norm;
if (e->t0 < *t && e->t0 > 1e-6)
{
*t = e->t0;
return (1);
}
else
return (0);
}
return (0);
}
double inter_sph(t_env *e, double *t) //calculating sphere intersection
{
double delta;
double sqrtd;
t_vect dist;
e->a = dot(e->r.direction, e->r.direction);
dist = vect_sub(e->r.start, e->sph.posi);
e->b = 2 * dot(dist, e->r.direction);
e->c = dot(dist, dist) - e->sph.rad * e->sph.rad;
delta = e->b * e->b - 4 * e->a * e->c;
if (delta < 0)
return (0);
sqrtd = sqrt(delta);
e->t0 = (-e->b + sqrtd) / (2 * e->a);
e->t1 = (-e->b - sqrtd) / (2 * e->a);
if (e->t0 > e->t1)
e->t0 = e->t1;
if ((e->t0 > 1e-6) && (e->t0 < *t))
{
*t = e->t0;
return (1);
}
else
return (0);
}
double inter_hemisphere(t_env *e) //calculating hemisphere intersection
{
t_vect hit_normal;
if (inter_sph(e, &e->t) == 1)
{
hit_normal = vect_add(e->r.start, vect_scalaire(e->t, e->r.direction));
hit_normal = vect_normalize(hit_normal);
if (inter_plane(e, &(e->t)) == 1)
{
if (dot(e->plane.normal, hit_normal) < 0)
return (1);
return (0);
}
}
return (0);
}
the e->t is . supposed to be the closest distance to the camera so that i get an exact display of close and far objects
And here i tried to apply what Spektre said and got some thing displayed and look like something like this:
And when i try to rotate it i get this:
Edit2 : After using Spektre Method I got a functional Intersection of a Hemisphere and the intersection look something like this.
double inter_hemisphere(t_env *e, double *t)
{
double delta;
double sqrtd;
t_vect dist;
e->a = dot(e->r.direction, e->r.direction);
dist = vect_sub(e->r.start, e->sph.posi);
e->b = 2 * dot(dist, e->r.direction);
e->c = dot(dist, dist) - e->sph.rad * e->sph.rad;
delta = e->b * e->b - 4 * e->a * e->c;
if (delta < 0)
return (0);
sqrtd = sqrt(delta);
e->t0 = (-e->b + sqrtd) / (2 * e->a);
e->t1 = (-e->b - sqrtd) / (2 * e->a);
t_vect v2;
v2 = vect_add(e->r.start, vect_sub(vect_scalaire(e->t0, e->r.direction), e->sph.posi));
if (dot(e->plane.normal, v2) > 0.0)
e->t0 =-1.0;
v2 = vect_add(e->r.start, vect_sub(vect_scalaire(e->t1, e->r.direction), e->sph.posi));
if (dot(e->plane.normal, v2) > 0.0)
e->t1 =-1.0;
if (e->t0 < 0.0)
e->t0 = e->t1;
if (e->t1 < 0.0)
e->t1 = e->t0;
double tt;
tt = fmin(e->t0, e->t1);
if (tt <= 0.0)
tt = fmax(e->t0, e->t1);
if (tt > 1e-6 && tt < e->t)
{
*t = tt;
return (1);
}
return (0);
}
And here is the Result:
The simplest way is to cut your sphere by a plane.
If you have plane normal than any direction (point on sphere - sphere center) with the same direction to normal is cut off. Simply by this condition:
dot(point on sphere - sphere center , plane normal ) > 0.0
But do not forget to test both intersections of ray and sphere as the closest one can be on the other side of plane ...
I tried to implement this into mine GLSL Ray tracer:
Reflection and refraction impossible without recursive ray tracing?
And come up with this updated fragment shaders:
Vertex (no change):
//------------------------------------------------------------------
#version 420 core
//------------------------------------------------------------------
uniform float aspect;
uniform float focal_length;
uniform mat4x4 tm_eye;
layout(location=0) in vec2 pos;
out smooth vec2 txt_pos; // frag position on screen <-1,+1> for debug prints
out smooth vec3 ray_pos; // ray start position
out smooth vec3 ray_dir; // ray start direction
//------------------------------------------------------------------
void main(void)
{
vec4 p;
txt_pos=pos;
// perspective projection
p=tm_eye*vec4(pos.x/aspect,pos.y,0.0,1.0);
ray_pos=p.xyz;
p-=tm_eye*vec4(0.0,0.0,-focal_length,1.0);
ray_dir=normalize(p.xyz);
gl_Position=vec4(pos,0.0,1.0);
}
//------------------------------------------------------------------
Fragment (added hemispheres):
//------------------------------------------------------------------
#version 420 core
//------------------------------------------------------------------
// Ray tracer ver: 1.000
//------------------------------------------------------------------
in smooth vec3 ray_pos; // ray start position
in smooth vec3 ray_dir; // ray start direction
uniform float n0; // refractive index of camera origin
uniform int fac_siz; // square texture x,y resolution size
uniform int fac_num; // number of valid floats in texture
uniform sampler2D fac_txr; // scene mesh data texture
out layout(location=0) vec4 frag_col;
//---------------------------------------------------------------------------
#define _reflect
#define _refract
//---------------------------------------------------------------------------
void main(void)
{
const vec3 light_dir=normalize(vec3(0.1,0.1,1.0));
const float light_iamb=0.1; // dot offset
const float light_idir=0.5; // directional light amplitude
const vec3 back_col=vec3(0.2,0.2,0.2); // background color
const float _zero=1e-6; // to avoid intrsection with start point of ray
const int _fac_triangles =0; // r,g,b,a, n, triangle count, { x0,y0,z0,x1,y1,z1,x2,y2,z2 }
const int _fac_spheres =1; // r,g,b,a, n, sphere count, { x,y,z,r }
const int _fac_hemispheres=2; // r,g,b,a, n, hemisphere count,{ x,y,z,r,nx,ny,nz }
// ray scene intersection
struct _ray
{
dvec3 pos,dir,nor;
vec3 col;
float refl,refr;// reflection,refraction intensity coeficients
float n0,n1; // refaction index (start,end)
double l; // ray length
int lvl,i0,i1; // recursion level, reflect, refract
};
const int _lvls=4;
const int _rays=(1<<_lvls)-1;
_ray ray[_rays]; int rays;
dvec3 v0,v1,v2,pos;
vec3 c;
float refr,refl,n1;
double tt,t,a;
int i0,ii,num,id;
// fac texture access
vec2 st; int i,j; float ds=1.0/float(fac_siz-1);
#define fac_get texture(fac_txr,st).r; st.s+=ds; i++; j++; if (j==fac_siz) { j=0; st.s=0.0; st.t+=ds; }
// enque start ray
ray[0].pos=ray_pos;
ray[0].dir=normalize(ray_dir);
ray[0].nor=vec3(0.0,0.0,0.0);
ray[0].refl=0.0;
ray[0].refr=0.0;
ray[0].n0=n0;
ray[0].n1=1.0;
ray[0].l =0.0;
ray[0].lvl=0;
ray[0].i0=-1;
ray[0].i1=-1;
rays=1;
// loop all enqued rays
for (i0=0;i0<rays;i0++)
{
// loop through all objects
// find closest forward intersection between them and ray[i0]
// strore it to ray[i0].(nor,col)
// strore it to pos,n1
t=tt=-1.0; ii=1; ray[i0].l=0.0;
ray[i0].col=back_col;
pos=ray[i0].pos; n1=n0;
for (st=vec2(0.0,0.0),i=j=0;i<fac_num;)
{
c.r=fac_get; // RGBA
c.g=fac_get;
c.b=fac_get;
refl=fac_get;
refr=fac_get;
n1=fac_get; // refraction index
a=fac_get; id=int(a); // object type
a=fac_get; num=int(a); // face count
if (id==_fac_triangles)
for (;num>0;num--)
{
v0.x=fac_get; v0.y=fac_get; v0.z=fac_get;
v1.x=fac_get; v1.y=fac_get; v1.z=fac_get;
v2.x=fac_get; v2.y=fac_get; v2.z=fac_get;
dvec3 e1,e2,n,p,q,r;
double t,u,v,det,idet;
//compute ray triangle intersection
e1=v1-v0;
e2=v2-v0;
// Calculate planes normal vector
p=cross(ray[i0].dir,e2);
det=dot(e1,p);
// Ray is parallel to plane
if (abs(det)<1e-8) continue;
idet=1.0/det;
r=ray[i0].pos-v0;
u=dot(r,p)*idet;
if ((u<0.0)||(u>1.0)) continue;
q=cross(r,e1);
v=dot(ray[i0].dir,q)*idet;
if ((v<0.0)||(u+v>1.0)) continue;
t=dot(e2,q)*idet;
if ((t>_zero)&&((t<=tt)||(ii!=0)))
{
ii=0; tt=t;
// store color,n ...
ray[i0].col=c;
ray[i0].refl=refl;
ray[i0].refr=refr;
// barycentric interpolate position
t=1.0-u-v;
pos=(v0*t)+(v1*u)+(v2*v);
// compute normal (store as dir for now)
e1=v1-v0;
e2=v2-v1;
ray[i0].nor=cross(e1,e2);
}
}
if (id==_fac_spheres)
for (;num>0;num--)
{
float r;
v0.x=fac_get; v0.y=fac_get; v0.z=fac_get; r=fac_get;
// compute l0 length of ray(p0,dp) to intersection with sphere(v0,r)
// where rr= r^-2
double aa,bb,cc,dd,l0,l1,rr;
dvec3 p0,dp;
p0=ray[i0].pos-v0; // set sphere center to (0,0,0)
dp=ray[i0].dir;
rr = 1.0/(r*r);
aa=2.0*rr*dot(dp,dp);
bb=2.0*rr*dot(p0,dp);
cc= rr*dot(p0,p0)-1.0;
dd=((bb*bb)-(2.0*aa*cc));
if (dd<0.0) continue;
dd=sqrt(dd);
l0=(-bb+dd)/aa;
l1=(-bb-dd)/aa;
if (l0<0.0) l0=l1;
if (l1<0.0) l1=l0;
t=min(l0,l1); if (t<=_zero) t=max(l0,l1);
if ((t>_zero)&&((t<=tt)||(ii!=0)))
{
ii=0; tt=t;
// store color,n ...
ray[i0].col=c;
ray[i0].refl=refl;
ray[i0].refr=refr;
// position,normal
pos=ray[i0].pos+(ray[i0].dir*t);
ray[i0].nor=pos-v0;
}
}
if (id==_fac_hemispheres)
for (;num>0;num--)
{
float r;
v0.x=fac_get; v0.y=fac_get; v0.z=fac_get; r=fac_get;
v1.x=fac_get; v1.y=fac_get; v1.z=fac_get;
// compute l0 length of ray(p0,dp) to intersection with sphere(v0,r)
// where rr= r^-2
double aa,bb,cc,dd,l0,l1,rr;
dvec3 p0,dp;
p0=ray[i0].pos-v0; // set sphere center to (0,0,0)
dp=ray[i0].dir;
rr = 1.0/(r*r);
aa=2.0*rr*dot(dp,dp);
bb=2.0*rr*dot(p0,dp);
cc= rr*dot(p0,p0)-1.0;
dd=((bb*bb)-(2.0*aa*cc));
if (dd<0.0) continue;
dd=sqrt(dd);
l0=(-bb+dd)/aa;
l1=(-bb-dd)/aa;
// test both hits-v0 against normal v1
v2=ray[i0].pos+(ray[i0].dir*l0)-v0; if (dot(v1,v2)>0.0) l0=-1.0;
v2=ray[i0].pos+(ray[i0].dir*l1)-v0; if (dot(v1,v2)>0.0) l1=-1.0;
if (l0<0.0) l0=l1;
if (l1<0.0) l1=l0;
t=min(l0,l1); if (t<=_zero) t=max(l0,l1);
if ((t>_zero)&&((t<=tt)||(ii!=0)))
{
ii=0; tt=t;
// store color,n ...
ray[i0].col=c;
ray[i0].refl=refl;
ray[i0].refr=refr;
// position,normal
pos=ray[i0].pos+(ray[i0].dir*t);
ray[i0].nor=pos-v0;
}
}
}
ray[i0].l=tt;
ray[i0].nor=normalize(ray[i0].nor);
// split ray from pos and ray[i0].nor
if ((ii==0)&&(ray[i0].lvl<_lvls-1))
{
t=dot(ray[i0].dir,ray[i0].nor);
// reflect
#ifdef _reflect
if ((ray[i0].refl>_zero)&&(t<_zero)) // do not reflect inside objects
{
ray[i0].i0=rays;
ray[rays]=ray[i0];
ray[rays].lvl++;
ray[rays].i0=-1;
ray[rays].i1=-1;
ray[rays].pos=pos;
ray[rays].dir=ray[rays].dir-(2.0*t*ray[rays].nor);
ray[rays].n0=ray[i0].n0;
ray[rays].n1=ray[i0].n0;
rays++;
}
#endif
// refract
#ifdef _refract
if (ray[i0].refr>_zero)
{
ray[i0].i1=rays;
ray[rays]=ray[i0];
ray[rays].lvl++;
ray[rays].i0=-1;
ray[rays].i1=-1;
ray[rays].pos=pos;
t=dot(ray[i0].dir,ray[i0].nor);
if (t>0.0) // exit object
{
ray[rays].n0=ray[i0].n0;
ray[rays].n1=n0;
if (i0==0) ray[i0].n1=n1;
v0=-ray[i0].nor; t=-t;
}
else{ // enter object
ray[rays].n0=n1;
ray[rays].n1=ray[i0].n0;
ray[i0 ].n1=n1;
v0=ray[i0].nor;
}
n1=ray[i0].n0/ray[i0].n1;
tt=1.0-(n1*n1*(1.0-t*t));
if (tt>=0.0)
{
ray[rays].dir=(ray[i0].dir*n1)-(v0*((n1*t)+sqrt(tt)));
rays++;
}
}
#endif
}
else if (i0>0) // ignore last ray if nothing hit
{
ray[i0]=ray[rays-1];
rays--; i0--;
}
}
// back track ray intersections and compute output color col
// lvl is sorted ascending so backtrack from end
for (i0=rays-1;i0>=0;i0--)
{
// directional + ambient light
t=abs(dot(ray[i0].nor,light_dir)*light_idir)+light_iamb;
t*=1.0-ray[i0].refl-ray[i0].refr;
ray[i0].col.rgb*=float(t);
// reflect
ii=ray[i0].i0;
if (ii>=0) ray[i0].col.rgb+=ray[ii].col.rgb*ray[i0].refl;
// refract
ii=ray[i0].i1;
if (ii>=0) ray[i0].col.rgb+=ray[ii].col.rgb*ray[i0].refr;
}
frag_col=vec4(ray[0].col,1.0);
}
//---------------------------------------------------------------------------
The Vertex shader just creates the Ray position and direction which is interpolated by GPU and then Fragment shader handles each ray (per pixel).
I use this scene:
// init mesh raytracer
ray.gl_init();
ray.beg();
// r g b rfl rfr n
ray.add_material(1.0,0.7,0.1,0.3,0.0,_n_glass); ray.add_hemisphere( 0.0, 0.0, 2.0,0.5, 0.0, 0.0, 1.0);
ray.add_material(1.0,1.0,1.0,0.3,0.0,_n_glass); ray.add_box ( 0.0, 0.0, 6.0,9.0,9.0,0.1);
ray.add_material(1.0,1.0,1.0,0.1,0.8,_n_glass); ray.add_sphere ( 0.0, 0.0, 0.5,0.5);
ray.add_material(1.0,0.1,0.1,0.3,0.0,_n_glass); ray.add_sphere (+2.0, 0.0, 2.0,0.5);
ray.add_material(0.1,1.0,0.1,0.3,0.0,_n_glass); ray.add_box (-2.0, 0.0, 2.0,0.5,0.5,0.5);
ray.add_material(0.1,0.1,1.0,0.3,0.0,_n_glass);
ray.add_tetrahedron
(
0.0, 0.0, 3.0,
-1.0,-1.0, 4.0,
+1.0,-1.0, 4.0,
0.0,+1.0, 4.0
);
ray.end();
containing single yellow hemisphere at (0.0, 0.0, 2.0) with radius r=0.5 and plane normal (0.0, 0.0, 1.0). Rotation of the object can by done simply by rotating the plane normal.
And this is preview:
As you can see hemisphere is working by just cutting with a plane ... The only important code from above for you is this (see the *** comments):
if (id==_fac_hemispheres) // *** ignore
for (;num>0;num--) // *** ignore
{
float r;
// *** here v0 is center, v1 is plane normal and r is radius
v0.x=fac_get; v0.y=fac_get; v0.z=fac_get; r=fac_get;
v1.x=fac_get; v1.y=fac_get; v1.z=fac_get;
// *** this is ray/ellipsoid intersection returning l0,l1 ray distances for both hits
// compute l0 length of ray(p0,dp) to intersection with sphere(v0,r)
// where rr= r^-2
double aa,bb,cc,dd,l0,l1,rr;
dvec3 p0,dp;
p0=ray[i0].pos-v0; // set sphere center to (0,0,0)
dp=ray[i0].dir;
rr = 1.0/(r*r);
aa=2.0*rr*dot(dp,dp);
bb=2.0*rr*dot(p0,dp);
cc= rr*dot(p0,p0)-1.0;
dd=((bb*bb)-(2.0*aa*cc));
if (dd<0.0) continue;
dd=sqrt(dd);
l0=(-bb+dd)/aa;
l1=(-bb-dd)/aa;
// *** this thro away hits on wrong side of plane
// test both hits-v0 against normal v1
v2=ray[i0].pos+(ray[i0].dir*l0)-v0; if (dot(v1,v2)>0.0) l0=-1.0;
v2=ray[i0].pos+(ray[i0].dir*l1)-v0; if (dot(v1,v2)>0.0) l1=-1.0;
// *** this is just using closer valid hit
if (l0<0.0) l0=l1;
if (l1<0.0) l1=l0;
t=min(l0,l1); if (t<=_zero) t=max(l0,l1);
if ((t>_zero)&&((t<=tt)||(ii!=0)))
{
ii=0; tt=t;
// store color,n ...
ray[i0].col=c;
ray[i0].refl=refl;
ray[i0].refr=refr;
// position,normal
pos=ray[i0].pos+(ray[i0].dir*t);
ray[i0].nor=pos-v0;
}
}
I used mine ray and ellipsoid intersection accuracy improvement as it returns both hits not just the first one.
If you cross check the spheres and hemispheres you will see I just added these two lines:
v2=ray[i0].pos+(ray[i0].dir*l0)-v0; if (dot(v1,v2)>0.0) l0=-1.0;
v2=ray[i0].pos+(ray[i0].dir*l1)-v0; if (dot(v1,v2)>0.0) l1=-1.0;
which just converts ray distances to hit positions and computing the condition mentioned above...
I found this copy on github to link to but I am using the one downloaded from sourceforge.
My question about the way they design their matrix operations seems really strange to me.
For example, if I create a 4×4 matrix and set it's scale. Then I would like to rotate that previous matrix using the vectormath matrix library seems to reset the matrix back to an identity matrix then applies the rotation which doesn't make any sense why this would happen.
Take a look at this function to make a rotation
static inline void vmathM4MakeRotationY( VmathMatrix4 *result, float radians )
{
float s, c;
s = sinf( radians );
c = cosf( radians );
vmathV4MakeFromElems( &result->col0, c, 0.0f, -s, 0.0f );
vmathV4MakeYAxis( &result->col1 );
vmathV4MakeFromElems( &result->col2, s, 0.0f, c, 0.0f );
vmathV4MakeWAxis( &result->col3 );
}
Does this library expect you to keep multiple matrices around use one to apply rotations then multiply?
edit
this is some previous matrix math code that I used to rotate a matrix, it looks like this.
mat4_s mat4_rotateX(mat4_s* out, float angle, mat4_s* inMat)
{
float s = sinf(angle),
c = cosf(angle),
a10 = inMat->m[4],
a11 = inMat->m[5],
a12 = inMat->m[6],
a13 = inMat->m[7],
a20 = inMat->m[8],
a21 = inMat->m[9],
a22 = inMat->m[10],
a23 = inMat->m[11];
if (!out->m) {
for(size_t i = 0; i < 16; i++)
{
out->m[i] = inMat->m[i];
}
} else if (inMat->m != out->m) { // If the source and destination differ, copy the unchanged rows
out->m[0] = inMat->m[0];
out->m[1] = inMat->m[1];
out->m[2] = inMat->m[2];
out->m[3] = inMat->m[3];
out->m[12] = inMat->m[12];
out->m[13] = inMat->m[13];
out->m[14] = inMat->m[14];
out->m[15] = inMat->m[15];
}
out->m[4] = a10 * c + a20 * s;
out->m[5] = a11 * c + a21 * s;
out->m[6] = a12 * c + a22 * s;
out->m[7] = a13 * c + a23 * s;
out->m[8] = a10 * -s + a20 * c;
out->m[9] = a11 * -s + a21 * c;
out->m[10] = a12 * -s + a22 * c;
out->m[11] = a13 * -s + a23 * c;
return *out;
}
this is the process that I have to take to get vectormath to do the same thing.
mat4* mat4_rotate_y(mat4* out, const float angle){
mat4 m;
mat4_identity_v(&m);
vmathM4MakeRotationY(m, angle);
mat4_multi(out, out, m);
return out;
}
the multiplication code is fairly standard but the vmathM4MakeRotationY looks like this:
static inline void vmathM4MakeRotationZ( VmathMatrix4 *result, float radians )
{
float s, c;
s = sinf( radians );
c = cosf( radians );
vmathV4MakeFromElems( &result->col0, c, s, 0.0f, 0.0f );
vmathV4MakeFromElems( &result->col1, -s, c, 0.0f, 0.0f );
vmathV4MakeZAxis( &result->col2 );
vmathV4MakeWAxis( &result->col3 );
}
just for completeness the vmathV4Make_Axis looks like this:
static inline void vmathV4MakeZAxis(VmathVector4 *result) {
vmathV4MakeFromElems(result, 0.0f, 0.0f, 1.0f, 0.0f);
}
vmathV4MakeFromElms looks like this:
static inline void vmathV4MakeFromElems(VmathVector4 *result, float _x,
float _y, float _z, float _w) {
result->x = _x;
result->y = _y;
result->z = _z;
result->w = _w;
}
This function seems to do "Initialize the matrix to be a rotation transform" instead of what you are expecting, "Add a rotation transform to the current transform".
Like you say, you can work around by storing the rotation in a separate temporary matrix and then multiplying:
VmathMatrix4 temp;
vmathM4MakeRotationY(&temp, 1.23);
vmathM4Mul(&mytransform, &mytransform, &temp);
That is pretty much what a hypothetical vmathM4ApplyRotationY() would have to do anyway.