Vulkan Ray Tracing - Not every primitive ID from the any-hit shader

EDIT: I added the .cpp file of my project; it is quite similar to the example in the repo from Sascha Willems.
I'm new to Vulkan and I'm trying to write a ray/triangle intersection pass. Sadly I did not find an example; maybe you know one?
I need these intersections to calculate the attenuation of rays.
So I wrote a raygen shader like this:
#version 460
#extension GL_NV_ray_tracing : require

#define debug 0

layout(binding = 0, set = 0) uniform accelerationStructureNV topLevelAS;
layout(binding = 1, set = 0, rgba8) uniform image2D image;
layout(binding = 2, set = 0) uniform CameraProperties
{
    mat4 viewInverse;
    mat4 projInverse;
} cam;
layout(binding = 3, set = 0) buffer detectorProperties
{
    double detectorValue[];
} detectors;
//layout(binding = 4, set = 0) buffer outputProperties
//{
//    double outputValues[];
//} outputData;
layout(binding = 5, set = 0) buffer debugProperties
{
    double debugValues[];
} debugData;

struct RayPayload {
    uint outputId;
    uint hitCounter;
};
layout(location = 0) rayPayloadNV RayPayload rayPayload;

void main()
{
    rayPayload.outputId = gl_LaunchIDNV.x * 18 + gl_LaunchIDNV.y * gl_LaunchSizeNV.x * 18;
    rayPayload.hitCounter = 0;

    vec3 origin = vec3(cam.viewInverse[0].x, cam.viewInverse[1].y, cam.viewInverse[2].z);

    uint rayId = uint(gl_LaunchIDNV.x + gl_LaunchSizeNV.x * gl_LaunchIDNV.y);
    uint targetXId = rayId;
    uint targetYId = rayId + gl_LaunchSizeNV.x * gl_LaunchSizeNV.y;
    uint targetZId = rayId + gl_LaunchSizeNV.x * gl_LaunchSizeNV.y * 2;
    vec3 target = vec3(detectors.detectorValue[targetXId], detectors.detectorValue[targetYId], detectors.detectorValue[targetZId]);
    vec3 direction = target.xyz - origin.xyz;

#ifdef debug
    uint debugId = rayPayload.outputId;
    debugData.debugValues[debugId + 0]  = gl_LaunchSizeNV.x;
    debugData.debugValues[debugId + 1]  = gl_LaunchSizeNV.y;
    debugData.debugValues[debugId + 2]  = gl_LaunchIDNV.x;
    debugData.debugValues[debugId + 3]  = gl_LaunchIDNV.y;
    debugData.debugValues[debugId + 4]  = targetXId;
    debugData.debugValues[debugId + 5]  = targetYId;
    debugData.debugValues[debugId + 6]  = targetZId;
    debugData.debugValues[debugId + 7]  = target.x;
    debugData.debugValues[debugId + 8]  = target.y;
    debugData.debugValues[debugId + 9]  = target.z;
    debugData.debugValues[debugId + 10] = origin.x;
    debugData.debugValues[debugId + 11] = origin.y;
    debugData.debugValues[debugId + 12] = origin.z;
    debugData.debugValues[debugId + 13] = direction.x;
    debugData.debugValues[debugId + 14] = direction.y;
    debugData.debugValues[debugId + 15] = direction.z;
    debugData.debugValues[debugId + 16] = rayId;
    debugData.debugValues[debugId + 17] = rayPayload.outputId;
#endif

    uint rayFlags = gl_RayFlagsNoneNV;
    uint cullMask = 0xff;
    float tmin = 0.00001;
    float tmax = 10000.0;
    traceNV(topLevelAS, rayFlags, cullMask, 0, 0, 0, origin.xyz, tmin, direction.xyz, tmax, 0);

    // uint outputId = gl_LaunchIDNV.x * 18 + gl_LaunchIDNV.y * gl_LaunchSizeNV.x * 18;
    // outputData.outputValues[outputId + hitCounter] = double(hitValue[hitCounter]);
    imageStore(image, ivec2(gl_LaunchIDNV.xy), vec4(rayPayload.hitCounter, 0, 0, 0.0));
}
With the any-hit shader I just want to write back the primitive ID, like this:
#version 460
#extension GL_NV_ray_tracing : require
#extension GL_EXT_nonuniform_qualifier : enable

layout(binding = 4, set = 0) buffer outputProperties
{
    float outputValues[];
} outputData;

struct RayPayload {
    uint outputId;
    uint hitCounter;
};
layout(location = 0) rayPayloadInNV RayPayload rayPayload;
hitAttributeNV vec3 attribs;

void main()
{
    // uint outputIdCurrentTriangle = rayPayload.outputId + rayPayload.hitCounter;
    uint outputIdCurrentTriangle = rayPayload.outputId + rayPayload.hitCounter++;
    outputData.outputValues[outputIdCurrentTriangle] = gl_PrimitiveID;
    // rayPayload.hitCounter++;
}
You can see the .cpp file here:
https://drive.google.com/file/d/1iTX3ATaP3pT7d4CEowo4IVnQxTOerxaD/view?usp=sharing
My problem is that I only find the surface triangles nearest to the source, even though the object is non-opaque (I tested this by setting the ray flag to cull non-opaque geometry).
Does anyone have the same problem, or an example that returns the primitive ID of every hit?
I have only found one example for any-hit shaders. Are any-hit shaders really used that rarely?
Thank you for your help!

If you want your any-hit shader to register intersections for all triangles in your scene, you should call ignoreIntersectionNV in the any-hit shader (see the GLSL_NV_ray_tracing spec).
With this, traversal carries on without modifying gl_RayTmaxNV and gl_HitKindNV, so the any-hit shader is triggered for every intersection along the ray.
With a simple any-hit shader like this:
#version 460
#extension GL_NV_ray_tracing : require
#extension GL_GOOGLE_include_directive : enable
#include "raypayload.glsl"

layout(binding = 1, set = 0, rgba8) uniform image2D image;
layout(location = 0) rayPayloadInNV RayPayload rayPayload;

void main()
{
    rayPayload.hitcount++;
    ignoreIntersectionNV();
}
The hitcount will be increased by one for each intersection; visualizing this with color coding by hitcount will yield something like this:
(screenshot omitted)
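One host-side detail worth double-checking as well (this is an assumption about the linked .cpp, which I have not inspected): any-hit shaders are skipped entirely for geometry that was flagged opaque when the acceleration structure was built. A minimal sketch of the relevant setup:
VkGeometryNV geometry = {};
geometry.sType = VK_STRUCTURE_TYPE_GEOMETRY_NV;
geometry.geometryType = VK_GEOMETRY_TYPE_TRIANGLES_NV;
// ... fill geometry.geometry.triangles with the vertex/index buffers ...
geometry.flags = 0; // do NOT set VK_GEOMETRY_OPAQUE_BIT_NV here, otherwise
                    // the any-hit shader is never invoked for this geometry
Alternatively, passing gl_RayFlagsNoOpaqueNV to traceNV forces all geometry to be treated as non-opaque for that ray.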
Related

SOLVED: Faster HLSL code? Wondering about lower CPU overhead when rendering quads in 3-space

!!!UPDATE!!! Using the vertex shader to generate quads via DrawInstanced() calls definitely reduced CPU overhead and increased quads drawn per second. But there was much more performance to be found by using a combination of instanced drawing via a vertex shader that generates a point list, and a geometry shader that generates quads based on those points.
Thanks to @Soonts for not only recommending a faster way, but also for reminding me of conditional moves and unrolling loops.
Here is the geometry shader I created for sprites with 2D rotation:
cbuffer CB_PROJ {
matrix camera;
};
/* Reduced packet size -- 256x256 max atlas segments
-------------------
FLOAT3 Sprite location // 12 bytes
FLOAT Rotation // 16 bytes
FLOAT2 Scale // 24 bytes
UINT // 28 bytes
Fixed8p00 Texture X segment
Fixed8p00 Texture X total segments
Fixed8p00 Texture Y segment
Fixed8p00 Texture Y total segments
Following vertex data is only processed by the vertex shader.
UINT // 32 bytes
Fixed3p00 Squadron generation method
Fixed7p00 Sprite stride
Fixed8p14 X/Y distance between sprites
*/
struct VOut {
float3 position : POSITION;
float3 r_s : NORMAL;
uint bits : BLENDINDICES;
};
struct GOut {
float4 pos : SV_Position;
float3 position : POSITION;
float3 n : NORMAL;
float2 tex : TEXCOORD;
uint pID : SV_PrimitiveID;
};
[maxvertexcount(4)]
void main(point VOut gin[1], uint pID : SV_PrimitiveID, inout TriangleStream<GOut> triStream) {
GOut output;
const uint bits = gin[0].bits;
const uint ySegs = (bits & 0x0FF000000) >> 24u;
const uint _yOS = (bits & 0x000FF0000) >> 16u;
const float yOS = 1.0f - float(_yOS) / float(ySegs);
const float yOSd = rcp(float(ySegs));
const uint xSegs = (bits & 0x00000FF00) >> 8u;
const uint _xOS = (bits & 0x0000000FF);
const float xOS = float(_xOS) / float(xSegs);
const float xOSd = rcp(float(xSegs));
float2 v;
output.pID = pID;
output.n = float3( 0.0f, 0.0f, -1.0f );
output.position = gin[0].position; // Translate
v.x = -gin[0].r_s.y; v.y = -gin[0].r_s.z; // Scale
output.tex = float2(xOS, yOS);
output.position.x += v.x * cos(gin[0].r_s.x) - v.y * sin(gin[0].r_s.x); // Rotate
output.position.y += v.x * sin(gin[0].r_s.x) + v.y * cos(gin[0].r_s.x);
output.pos = mul(float4(output.position, 1.0f), camera); // Transform
triStream.Append(output);
output.position = gin[0].position;
v.x = -gin[0].r_s.y; v.y = gin[0].r_s.z;
output.tex = float2(xOS, yOS - yOSd);
output.position.x += v.x * cos(gin[0].r_s.x) - v.y * sin(gin[0].r_s.x);
output.position.y += v.x * sin(gin[0].r_s.x) + v.y * cos(gin[0].r_s.x);
output.pos = mul(float4(output.position, 1.0f), camera);
triStream.Append(output);
output.position = gin[0].position;
v.x = gin[0].r_s.y; v.y = -gin[0].r_s.z;
output.tex = float2(xOS + xOSd, yOS);
output.position.x += v.x * cos(gin[0].r_s.x) - v.y * sin(gin[0].r_s.x);
output.position.y += v.x * sin(gin[0].r_s.x) + v.y * cos(gin[0].r_s.x);
output.pos = mul(float4(output.position, 1.0f), camera);
triStream.Append(output);
output.position = gin[0].position;
v.x = gin[0].r_s.y; v.y = gin[0].r_s.z;
output.tex = float2(xOS + xOSd, yOS - yOSd);
output.position.x += v.x * cos(gin[0].r_s.x) - v.y * sin(gin[0].r_s.x);
output.position.y += v.x * sin(gin[0].r_s.x) + v.y * cos(gin[0].r_s.x);
output.pos = mul(float4(output.position, 1.0f), camera);
triStream.Append(output);
}
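For reference, the host-side draw for this point-expansion path can be as simple as the following sketch (devcon, pVB, SPRITE_VERTEX and spriteCount are illustrative names, not from the project):
UINT stride = sizeof(SPRITE_VERTEX); // the packed 32-byte layout described above
UINT offset = 0;
devcon->IASetVertexBuffers(0, 1, &pVB, &stride, &offset);
devcon->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_POINTLIST);
devcon->Draw(spriteCount, 0); // the geometry shader expands each point into a quad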
!!!ORIGINAL TEXT!!!
Last time I was coding, I had barely started learning Direct3D9c. Currently I'm hitting about 30K single-texture quads lit with 15 lights at about 450fps. I haven't learned instancing or geometry shading at all yet, and I'm trying to prioritise the order I learn things in for my needs, so I've only taken glances at them.
My first thought was to reduce the amount of vertex data being shunted to the GPU, so I changed the vertex structure to a FLOAT2 (for texture coords) and a UINT (for indexing), relying on 4x float3 constants in the vertex shader to define the corners of the quads.
I figured I could reduce the size of the vertex data further, and reduced each vertex unit to a single UINT containing a 2-bit index (to reference the real vertices of the quad) and 2x 15-bit fixed-point numbers (yes, I'm showing my age, but fixed-point still has its value) representing offsets into atlas textures.
So far, so good, but I know bugger all about Direct3D11 and HLSL, so I've been wondering if there's a faster way.
Here's the current state of my vertex shader:
cbuffer CB_PROJ
{
matrix model;
matrix modelViewProj;
};
struct VOut
{
float3 position : POSITION;
float3 n : NORMAL;
float2 texcoord : TEXCOORD;
float4 pos : SV_Position;
};
static const float3 position[4] = { float3(-0.5f, 0.0f, -0.5f), float3(-0.5f, 0.0f, 0.5f),
                                    float3( 0.5f, 0.0f, -0.5f), float3( 0.5f, 0.0f, 0.5f) };
// Index bitpattern: YYYYYYYYYYYYYYYXXXXXXXXXXXXXXXVV
//
// 00-01 . uint2b == Vertex index (0-3)
// 02-16 . fixed1p14 == X offset into atlas texture(s)
// 17-31 . fixed1p14 == Y offset into atlas texture(s)
//
VOut main(uint bitField : BLENDINDICES) {
VOut output;
const uint i = bitField & 0x03u;
const uint xStep = (bitField >> 2) & 0x7FFFu;
const uint yStep = (bitField >> 17);
const float xDelta = float(xStep) * 0.00006103515625f;
const float yDelta = float(yStep) * 0.00006103515625f;
const float2 texCoord = float2(xDelta, yDelta);
output.position = (float3) mul(float4(position[i], 1.0f), model);
output.n = mul(float3(0.0f, 1.0f, 0.0f), (float3x3) model);
output.texcoord = texCoord;
output.pos = mul(float4(output.position, 1.0f), modelViewProj);
return output;
}
My pixel shader for completeness:
Texture2D Texture : register(t0);
SamplerState Sampler : register(s0);
struct LIGHT {
float4 lightPos; // .w == range
float4 lightCol; // .a == flags
};
cbuffer cbLight {
LIGHT l[16] : register(b0); // 512 bytes
}
static const float3 ambient = { 0.15f, 0.15f, 0.15f };
float4 main(float3 position : POSITION, float3 n : NORMAL, float2 TexCoord : TEXCOORD) : SV_Target
{
const float4 Texel = Texture.Sample(Sampler, TexCoord);
if (Texel.a < 0.707106f) discard; // My source images have their alpha values inverted.
float3 result = { 0.0f, 0.0f, 0.0f };
for (uint xx = 0 ; xx < 16 && l[xx].lightCol.a != 0xFFFFFFFF; xx++)
{
const float3 lCol = l[xx].lightCol.rgb;
const float range = l[xx].lightPos.w;
const float3 vToL = l[xx].lightPos.xyz - position;
const float distToL = length(vToL);
if (distToL < range * 2.0f)
{
const float att = min(1.0f, (distToL / range + distToL / (range * range)) * 0.5f);
const float3 lum = Texel.rgb * saturate(dot(vToL / distToL, n)) * lCol;
result += lum * (1.0f - att);
}
}
return float4(ambient * Texel.rgb + result, Texel.a);
}
And the rather busy-looking C function that generates the vertex data (all non-relevant functions removed):
al16 struct CLASS_PRIMITIVES {
ID3D11Buffer* pVB = { NULL, NULL }, * pIB = { NULL, NULL };
const UINT strideV1 = sizeof(VERTEX1);
void CreateQuadSet1(ui32 xSegs, ui32 ySegs) {
al16 VERTEX1* vBuf;
al16 D3D11_BUFFER_DESC bd = {};
D3D11_SUBRESOURCE_DATA srd = {};
ui32 index = 0, totalVerts = xSegs * ySegs * 4;
if (pVB) return;
vBuf = (VERTEX1*)_aligned_malloc(strideV1 * totalVerts, 16);
for (ui32 yy = ySegs; yy; yy--)
for (ui32 xx = 0; xx < xSegs; xx++) {
double dyStep2 = 16384.0 / double(ySegs); double dyStep1 = dyStep2 * double(yy); dyStep2 *= double(yy - 1);
ui32 yStep1 = dyStep1;
yStep1 <<= 17;
ui32 yStep2 = dyStep2;
yStep2 <<= 17;
vBuf[index].b = 0 + (ui32(double(16384.0 / double(xSegs) * double(xx))) << 2) + yStep1;
index++;
vBuf[index].b = 1 + (ui32(double(16384.0 / double(xSegs) * double(xx))) << 2) + yStep2;
index++;
vBuf[index].b = 2 + (ui32(double(16384.0 / double(xSegs) * double(xx + 1))) << 2) + yStep1;
index++;
vBuf[index].b = 3 + (ui32(double(16384.0 / double(xSegs) * double(xx + 1))) << 2) + yStep2;
index++;
}
bd.Usage = D3D11_USAGE_IMMUTABLE;
bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
bd.CPUAccessFlags = 0;
bd.ByteWidth = strideV1 * totalVerts;
bd.StructureByteStride = strideV1;
srd.pSysMem = vBuf;
hr = dev->CreateBuffer(&bd, &srd, &pVB);
if (hr != S_OK) ThrowError();
_aligned_free(vBuf);
};
void DrawQuadFromSet1(ui32 offset) {
offset *= sizeof(VERTEX1) * 4;
devcon->IASetVertexBuffers(0, 1, &pVB, &strideV1, &offset);
devcon->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
devcon->Draw(4, 0);
};
void DestroyQuadSet() {
if (pVB) pVB->Release();
};
It's all functioning as it should, but it just seems like I'm resorting to hacks to achieve my goal. Surely there's a faster way? Using DrawIndexed() consistently dropped the frame-rate by 1% so I switched back to non-indexed Draw calls.
"reducing vertex data down to 32bits per vertex is as far as the GPU will allow"
You seem to think that vertex buffer sizes are what's holding you back. Make no mistake here, they are not. You have many gigs of VRAM to work with, use them if it will make your code faster. Specifically, anything you're unpacking in your shaders that could otherwise be stored explicitly in your vertex buffer should probably be stored in your vertex buffer.
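To make that concrete, here is a sketch of what the same vertex could look like stored explicitly (the struct name and types are illustrative, not from the original code):
struct VERTEX_EXPLICIT {        // 20 bytes instead of 4, but zero unpacking work
    DirectX::XMFLOAT3 position; // quad corner, no lookup table in the shader
    DirectX::XMFLOAT2 texcoord; // atlas coordinates, no fixed-point decode
};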
"I am wondering if anyone has experience with using geometry shaders to auto-generate quads"
I'll stop you right there, geometry shaders are very inefficient in most driver implementations, even today. They just aren't used that much so nobody bothered to optimize them.
One quick thing that jumps out at me is that you're allocating and freeing your system-side vertex array every frame. Building it is fine, but cache the array; C memory allocation is about as slow as anything is going to get. A quick profiling run would have shown you that.
Your next biggest problem is that you have a lot of branching in your pixel shader. Use standard functions (like clamp or mix), or blending, to let the math cancel out instead of checking for ranges or fully transparent values. Branching will absolutely kill performance.
And lastly, make sure you have the correct hints and usage on your buffers. You don't show them, but they should be set to whatever the equivalent of GL_STREAM_DRAW is, and you need to ensure you don't corrupt the in-flight parts of your vertex buffer. Future frames will render at the same time as the current one as long as you don't invalidate their data by overwriting their vertex buffer, so use a round-robin scheme to let as many vertices as possible survive (again, spend memory to buy performance). Personally, I allocate a very large vertex buffer (5x the data a frame needs) and write it sequentially until I reach the end, at which point I orphan the whole thing, re-allocate it, and start from the beginning again.
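In D3D11 terms (the question is D3D11, even though the advice above is phrased with GL names), that round-robin scheme might look like the following sketch; it assumes pVB was created with D3D11_USAGE_DYNAMIC and D3D11_CPU_ACCESS_WRITE, and writeOffset, bufferSize, vertexData and bytesNeeded are illustrative bookkeeping variables:
D3D11_MAPPED_SUBRESOURCE mapped;
D3D11_MAP mapType = D3D11_MAP_WRITE_NO_OVERWRITE; // append behind in-flight data
if (writeOffset + bytesNeeded > bufferSize) {     // out of room:
    mapType = D3D11_MAP_WRITE_DISCARD;            // orphan the whole buffer
    writeOffset = 0;
}
devcon->Map(pVB, 0, mapType, 0, &mapped);
memcpy((char*)mapped.pData + writeOffset, vertexData, bytesNeeded);
devcon->Unmap(pVB, 0);
writeOffset += bytesNeeded; // draw using the old offset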
I think your code is CPU bound. While your approach has very small vertices, you have non-trivial API overhead.
A better approach is rendering all quads with a single draw call. I would probably use instancing for that.
Assuming you want arbitrary per-quad size, position, and orientation in 3D space, here’s one possible approach. Untested.
Vertex buffer elements:
struct sInstanceData
{
// Center of the quad in 3D space
XMFLOAT3 center;
// XY coordinates of the sprite in the atlas
uint16_t spriteX, spriteY;
// Local XY vectors of the quad in 3D space
// length of the vectors = half width/height of the quad
XMFLOAT3 plusX, plusY;
};
Input layout:
D3D11_INPUT_ELEMENT_DESC desc[ 4 ];
desc[ 0 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadCenter", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 1 ] = D3D11_INPUT_ELEMENT_DESC{ "SpriteIndex", 0, DXGI_FORMAT_R16G16_UINT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 2 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusX", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
desc[ 3 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusY", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 0 };
Vertex shader:
cbuffer Constants
{
matrix viewProj;
// Pass [ 1.0 / xSegs, 1.0 / ySegs ] in that field
float2 texcoordMul;
};
struct VOut
{
float3 position : POSITION;
float3 n : NORMAL;
float2 texcoord : TEXCOORD;
float4 pos : SV_Position;
};
VOut main( uint index: SV_VertexID,
float3 center : QuadCenter, uint2 texcoords : SpriteIndex,
float3 plusX : QuadPlusX, float3 plusY : QuadPlusY )
{
VOut result;
float3 pos = center;
int2 uv = ( int2 )texcoords;
// No branches are generated in release builds;
// only conditional moves are there
if( index & 1 )
{
pos += plusX;
uv.x++;
}
else
pos -= plusX;
if( index & 2 )
{
pos += plusY;
uv.y++;
}
else
pos -= plusY;
result.position = pos;
result.n = normalize( cross( plusX, plusY ) );
result.texcoord = ( ( float2 )uv ) * texcoordMul;
result.pos = mul( float4( pos, 1.0f ), viewProj );
return result;
}
Rendering:
UINT stride = sizeof( sInstanceData );
UINT off = 0;
context->IASetVertexBuffers( 0, 1, &vb, &stride, &off );
context->IASetPrimitiveTopology( D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP );
context->DrawInstanced( 4, countQuads, 0, 0 );
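Creating the instance buffer itself is not shown above; a sketch, assuming the per-quad data sits in a std::vector<sInstanceData> named quads:
D3D11_BUFFER_DESC bd = {};
bd.ByteWidth = UINT( sizeof( sInstanceData ) * quads.size() );
bd.Usage = D3D11_USAGE_DYNAMIC; // re-filled per frame if the quads move
bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
D3D11_SUBRESOURCE_DATA srd = { quads.data(), 0, 0 };
HRESULT hr = dev->CreateBuffer( &bd, &srd, &vb );
countQuads in the DrawInstanced call above is then UINT( quads.size() ).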

OpenGL: When applying my own lookAt function the screen is blank

I'm writing my own version of gluLookAt. I believe I have the correct implementation; however, the matrix that gets returned is often full of NaNs.
Here is my function:
mat4 look_at(vec4 eye, vec4 at, vec4 up)
{
    vec4 vpn = vec4_sub(eye, at);
    vec4 n = vec4_norm(vpn);
    vec4 u = vec4_norm(vec4_cross(up, n));
    vec4 v = vec4_norm(vec4_cross(n, u));
    vec4 p = { eye.x, eye.y, eye.z, 1 };
    mat4 m;
    m.x.x = u.x; m.y.x = u.y; m.z.x = u.z; m.w.x = u.w;
    m.x.y = v.x; m.y.y = v.y; m.z.y = v.z; m.w.y = v.w;
    m.x.z = n.x; m.y.z = n.y; m.z.z = n.z; m.w.z = n.w;
    m.x.w = p.x; m.y.w = p.y; m.z.w = p.z; m.w.w = p.w;
    mat4 m_t = mat4_trans(m);
    mat4 m_t_inv = mat4_inv(m_t);
    return m_t_inv;
}
I am currently trying to look at the top of a cube I made. The cube is 1x1x1 and is centered at the origin. I am setting the model_view like so:
vec4 e = {0, 1, 0, 1};
vec4 a = {0, 0, 0, 0};
vec4 u = {0, 0, 1, 0};
model_view = look_at(e, a, u);
I believe I have the parameters correct; I want to look down at the origin from y = 1.
Does the issue appear to be in my function, or have I misunderstood the parameters?
The fourth component of the direction vectors u, v and n has to be zero. In your case the fourth component of vpn is not zero, because eye is {0, 1, 0, 1}.
I recommend doing the computation of u, v and n with vec3 rather than vec4. However, you can also fix the issue by setting vpn[3] = 0:
vec4 vpn = vec4_sub(eye, at);
vpn[3] = 0;
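For completeness, here is the whole function with that one-line fix applied (a sketch reusing the question's own helpers, and assuming vec4 supports [] indexing as in the snippet above):
mat4 look_at(vec4 eye, vec4 at, vec4 up)
{
    vec4 vpn = vec4_sub(eye, at);
    vpn[3] = 0; /* direction vectors must have a zero fourth component */
    vec4 n = vec4_norm(vpn);
    vec4 u = vec4_norm(vec4_cross(up, n));
    vec4 v = vec4_norm(vec4_cross(n, u));
    vec4 p = { eye.x, eye.y, eye.z, 1 };
    mat4 m;
    m.x.x = u.x; m.y.x = u.y; m.z.x = u.z; m.w.x = u.w;
    m.x.y = v.x; m.y.y = v.y; m.z.y = v.z; m.w.y = v.w;
    m.x.z = n.x; m.y.z = n.y; m.z.z = n.z; m.w.z = n.w;
    m.x.w = p.x; m.y.w = p.y; m.z.w = p.z; m.w.w = p.w;
    return mat4_inv(mat4_trans(m));
}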

Bitmap font rendering issue

Learning OpenGL here. Trying to write a bitmap font rendering system.
I'm using Hiero to generate the font file (Angel Code font format) and atlas (.fnt and .png files).
I first parse the font file, reading in the font and character data. That part is easy (I verified that the parsed results are indeed all correct).
// (typedefs) u32: unsigned int, r32: float, string: char*
struct font_character
{
u32 Id;
r32 X, Y;
r32 Width, Height;
r32 XOffset, YOffset;
r32 XAdvance;
};
struct font_set
{
string Name;
u32 Atlas;
r32 Size;
u32 Stretch;
u32 Smooth;
u32 AntiAliasing;
u32 Padding[4];
u32 Spacing[2];
u32 LineHeight;
u32 BaseLine;
r32 Width, Height;
u32 CharacterCount;
font_character Characters[128];
};
// Parsing related codes...
font_set FontLoad(string FilePath)
{
font_set Result = {};
string Content = ReadFile(FilePath);
if (Content)
{
List(string) FontSettings;
ListAlloc(FontSettings, 1024);
bool DoneParsing = false;
while(!DoneParsing)
{
token Token = GetToken(Content);
switch(Token.Type)
{
case TokenType_EOF:
DoneParsing = true;
break;
case TokenType_Unknown:
Assert(!"Unknown token in font file");
break;
case TokenType_Number:
case TokenType_String:
case TokenType_Identifier:
ListPush(FontSettings, Token.Content);
break;
}
}
for (int i = 0, Count = ListCount(FontSettings); i < Count; i += 2)
{
string SettingKey = FontSettings[i];
string SettingValue = FontSettings[i + 1];
if (StringEqual(SettingKey, "face"))
Result.Name = SettingValue;
else if (StringEqual(SettingKey, "size"))
Result.Size = atoi(SettingValue);
else if (StringEqual(SettingKey, "stretchH"))
Result.Stretch = atoi(SettingValue);
else if (StringEqual(SettingKey, "smooth"))
Result.Smooth = atoi(SettingValue);
else if (StringEqual(SettingKey, "aa"))
Result.AntiAliasing = atoi(SettingValue);
else if (StringEqual(SettingKey, "lineHeight"))
Result.LineHeight = atoi(SettingValue);
else if (StringEqual(SettingKey, "base"))
Result.BaseLine = atoi(SettingValue);
else if (StringEqual(SettingKey, "scaleW"))
Result.Width = atoi(SettingValue);
else if (StringEqual(SettingKey, "scaleH"))
Result.Height = atoi(SettingValue);
else if (StringEqual(SettingKey, "spacing"))
{
// Ascii(48) = Decimal(0)
Result.Spacing[0] = SettingValue[0] - 48;
Result.Spacing[1] = SettingValue[2] - 48;
}
else if (StringEqual(SettingKey, "padding"))
{
Result.Padding[0] = SettingValue[0] - 48;
Result.Padding[1] = SettingValue[2] - 48;
Result.Padding[2] = SettingValue[4] - 48;
Result.Padding[3] = SettingValue[6] - 48;
}
else if (StringEqual(SettingKey, "char"))
{
font_character Character;
// Although there are 10 pairs of data, we're gonna skip the last two because we don't care about them
For(u32, PairIndex, 8)
{
string CharKey = FontSettings[(i + 1) + PairIndex * 2];
string CharValue = FontSettings[(i + 2) + PairIndex * 2];
if (StringEqual(CharKey, "id"))
Character.Id = atoi(CharValue);
else if (StringEqual(CharKey, "x"))
Character.X = atoi(CharValue);
else if (StringEqual(CharKey, "y"))
Character.Y = atoi(CharValue);
else if (StringEqual(CharKey, "width"))
Character.Width = atoi(CharValue);
else if (StringEqual(CharKey, "height"))
Character.Height = atoi(CharValue);
else if (StringEqual(CharKey, "xoffset"))
Character.XOffset = atoi(CharValue);
else if (StringEqual(CharKey, "yoffset"))
Character.YOffset = atoi(CharValue);
else if (StringEqual(CharKey, "xadvance"))
Character.XAdvance = atoi(CharValue);
}
Result.Characters[Result.CharacterCount++] = Character;
i += 19;
}
else i--;
}
}
// Load texture
char TexturePath[256];
sprintf(TexturePath, "%s.png", FilePath);
Result.Atlas = TextureLoad(TexturePath); // loads texture from file via stbi_load, does glGenTexture, configures texture parameters etc.
return (Result);
}
Then we get to rendering, which is where I'm struggling a bit. My understanding is that I need to use the character data I got from the font file to build quads that I can render to the screen.
First, here's my shaders. Vertex shader:
#version 330 core
layout (location = 0) in vec2 VertPos;
layout (location = 1) in vec2 VertUV;
out vec2 FragUV;
uniform mat4 Projection;
void main()
{
gl_Position = Projection * vec4(VertPos, 0, 1);
FragUV = VertUV;
}
And fragment shader:
#version 330 core
out vec4 FinalColor;
in vec2 FragUV;
uniform sampler2D FontAtlas;
uniform vec3 Color;
void main()
{
FinalColor = vec4(Color, texture(FontAtlas, FragUV).a);
}
Here's how I load the font and render:
struct font_renderer
{
string Text;
v3 Color;
r32 CurrentX;
u32 VAO, VBO;
u32 Initialized;
u32 Shader;
};
// In an initialization function
font_set Font = FontLoad("res/fonts/Courier New.fnt");
u32 FontShader = ShaderLoadFromFile("Font.vert", "Font.frag");
// In an update/render function
font_renderer FontRenderer = {};
FontRenderer.Text = "A";
FontRenderer.Color = V3(1, 0, 0);
FontRenderer.CurrentX = 20;
FontRenderer.Shader = FontShader;
FontRender(&FontRenderer, &Font);
Rendering function (just trying to get something on the screen)
void FontRender(font_renderer *Renderer, font_set *Font)
{
u32 NumChars = StringLength(Renderer->Text);
u32 Size = NumChars * 12;
if (!Renderer->Initialized)
{
glGenBuffers(1, &Renderer->VBO);
glBindBuffer(GL_ARRAY_BUFFER, Renderer->VBO);
glBufferData(GL_ARRAY_BUFFER, Size * 2, 0, GL_STATIC_DRAW);
glGenVertexArrays(1, &Renderer->VAO);
glBindVertexArray(Renderer->VAO);
glEnableVertexAttribArray(0);
glVertexAttribPointer(0, 2, GL_FLOAT, 0, 0, 0);
glEnableVertexAttribArray(1);
glVertexAttribPointer(1, 2, GL_FLOAT, 0, 0, 0);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindVertexArray(0);
Renderer->Initialized = 1;
}
/*r32 *VertPos = Calloc(Size, r32);
r32 *VertUV = Calloc(Size, r32);*/
// Temporary code, just trying to render a single character on screen
r32 VertPos[12]; // we need a quad <=> 2 triangles <=> 6 vertices <=> 12 floats
r32 VertUV[12]; // same for UVs
For(u32, i, NumChars) // for loop macro
{
font_character Character = Font->Characters[Renderer->Text[i]]; // assuming that 'Characters' are ordered correctly
r32 X = Character.X;
r32 Y = Character.Y;
r32 XOffset = Character.XOffset;
r32 YOffset = Character.YOffset;
r32 XAdvance = Character.XAdvance;
r32 Width = Character.Width;
r32 Height = Character.Height;
// Triangle 1 (clock-wise winding order)
{
// Top Left
VertPos[i] = Renderer->CurrentX + XOffset;
VertPos[i + 1] = YOffset;
// Bottom Left
VertPos[i + 2] = Renderer->CurrentX + XOffset;
VertPos[i + 3] = YOffset + Height;
// Bottom Right
VertPos[i + 4] = Renderer->CurrentX + XOffset + Width;
VertPos[i + 5] = YOffset + Height;
}
// Triangle 2
{
// Bottom Right
VertPos[i + 6] = VertPos[i + 4];
VertPos[i + 7] = VertPos[i + 5];
// Top Right
VertPos[i + 8] = Renderer->CurrentX + XOffset + Width;
VertPos[i + 9] = YOffset;
// Top Left
VertPos[i + 10] = VertPos[i];
VertPos[i + 11] = VertPos[i + 1];
}
// UV 1
{
// Top left
VertUV[i] = X / Font->Width;
VertUV[i + 1] = Y / Font->Height;
// Bottom left
VertUV[i + 2] = X / Font->Width;
VertUV[i + 3] = (Y + Height) / Font->Height;
// Bottom right
VertUV[i + 4] = (X + Width) / Font->Width;
VertUV[i + 5] = (Y + Height) / Font->Height;
}
// UV 2
{
// Bottom right
VertUV[i + 6] = VertUV[i + 4];
VertUV[i + 7] = VertUV[i + 5];
// Top right
VertUV[i + 8] = (X + Width) / Font->Width;
VertUV[i + 9] = Y / Font->Height;
// Top left
VertUV[i + 10] = VertUV[i];
VertUV[i + 11] = VertUV[i + 1];
}
}
glBindBuffer(GL_ARRAY_BUFFER, Renderer->VBO);
u32 Offset = 0;
glBufferSubData(GL_ARRAY_BUFFER, Offset, Size, VertPos);
Offset += Size;
glBufferSubData(GL_ARRAY_BUFFER, Offset, Size, VertUV);
m4 FontProjection = Orthographic(0, 800, 600, 0, -1, +1);
glDisable(GL_DEPTH_TEST);
ShaderUse(Renderer->Shader);
glBindVertexArray(Renderer->VAO);
TextureBind(Font->Atlas);
ShaderSetV3(Renderer->Shader, "Color", Renderer->Color);
ShaderSetM4(Renderer->Shader, "Projection", &FontProjection);
glDrawArrays(GL_TRIANGLES, 0, NumChars * 6);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindVertexArray(0);
}
As you can see, I'm really just trying to get a single character on screen. I'm not even taking the font size into consideration; I'm keeping it simple. I choose the vertices going TopLeft -> BottomLeft -> BottomRight -> TopRight -> TopLeft which, correct me if I'm wrong, is a clockwise winding order. I have two slots in my vertex shader: position and UV. I'm specifying the buffer data via the call to glBufferSubData.
Running the program, I don't get any rendering output, just a blank screen. I'm pretty sure I'm missing something obvious or doing something stupid, but I can't see it. What am I doing wrong?
Note that the texture, font data and shader are loaded all correctly.
Appreciate any help.
Your code seems to have a number of problems (even ignoring the horrible C-in-C++ coding style you employ). I haven't found all of them, so I can't say I found the one that you ran into.
But I did find these:
u32 Size = NumChars * 12;
You use Size * 2 as the size for your buffer. So each character in your string takes up 24 bytes; 12 for your positions and 12 for your texture coordinates.
But your positions and texcoords are floats. 4 bytes per float * 2 floats per position * 6 positions per character = 48 bytes per character.
So your size calculation is very off. You can verify this by checking sizeof(VertPos) and comparing that to your Size variable. You'll find that they're very different.
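A corrected calculation, counting bytes rather than floats, might look like this (sketch):
u32 Size = NumChars * 12 * sizeof(r32); // 12 floats = 48 bytes per character, per attribute stream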
glVertexAttribPointer(0, 2, GL_FLOAT, 0, 0, 0);
glVertexAttribPointer(1, 2, GL_FLOAT, 0, 0, 0);
This says that your position and texture coordinate both have the same offset. That would mean that your position and texcoord in the shader will get the same value. I'm pretty sure that's not what you want.
Your texture coordinate array comes from a different part of the buffer. So you need to apply an appropriate offset. Assuming you have correctly computed Size, that would look like this:
glVertexAttribPointer(1, 2, GL_FLOAT, 0, 0, reinterpret_cast<void*>(Size));
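Putting the two fixes together, the buffer setup might look like this sketch (Size is the corrected byte count from above; swapping GL_STATIC_DRAW for GL_STREAM_DRAW is my own substitution, since the buffer is re-filled on every call):
glBindBuffer(GL_ARRAY_BUFFER, Renderer->VBO);
glBufferData(GL_ARRAY_BUFFER, Size * 2, 0, GL_STREAM_DRAW);
glEnableVertexAttribArray(0);
glVertexAttribPointer(0, 2, GL_FLOAT, 0, 0, 0);
glEnableVertexAttribArray(1);
glVertexAttribPointer(1, 2, GL_FLOAT, 0, 0, reinterpret_cast<void*>(Size));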

How to cope with WebGL missing glBlendEquation(GL_MAX)

Here's my current C code that does what I'd like to do, but it relies on glBlendEquation(GL_MAX), which is unavailable in WebGL. What I want is to render a wiggly, fuzzy line. I could use a Gaussian blur, but it would have to have a VERY large radius (16 pixels) and I expect it would be REALLY slow.
Note: I've removed some GL state management code and a couple of other things for clarity, but the code should work as is.
Existing C code:
static const char *pnt_vtx_shader =
"#version 110\n"
"varying vec2 uv;\n"
"void main() {\n"
" uv = (gl_MultiTexCoord0.st - 1.0f);\n"
" gl_Position = gl_Vertex;\n"
"}";
static const char *pnt_shader_src =
"#version 110\n"
"varying vec2 uv;\n"
"void main() {\n"
" gl_FragColor = vec4(exp(-4.5f*0.5f*log2(dot(uv,uv)+1.0f)));\n"
"}";
GLuint shader_prog ;
int samp;
float pw, ph;
float sco_verts[128*8*4];
int sco_ind[128*3*6];
void init(int width, int height, int num_samp)
{
pw = 0.5f*fmaxf(1.0f/24, 8.0f/width), ph = 0.5f*fmaxf(1.0f/24, 8.0f/height);
samp = num_samp;
// helper function, compiles and links the shader, prints out any errors
shader_prog = compile_program(pnt_vtx_shader, pnt_shader_src);
for(int i=0; i<samp; i++) {
sco_verts[(i*8+0)*4+0] = 0; sco_verts[(i*8+0)*4+1] = 2;
sco_verts[(i*8+1)*4+0] = 0; sco_verts[(i*8+1)*4+1] = 0;
sco_verts[(i*8+2)*4+0] = 1; sco_verts[(i*8+2)*4+1] = 2;
sco_verts[(i*8+3)*4+0] = 1; sco_verts[(i*8+3)*4+1] = 0;
sco_verts[(i*8+4)*4+0] = 1; sco_verts[(i*8+4)*4+1] = 2;
sco_verts[(i*8+5)*4+0] = 1; sco_verts[(i*8+5)*4+1] = 0;
sco_verts[(i*8+6)*4+0] = 2; sco_verts[(i*8+6)*4+1] = 2;
sco_verts[(i*8+7)*4+0] = 2; sco_verts[(i*8+7)*4+1] = 0;
}
for(int i=0; i<samp; i++) {
sco_ind[(i*6+0)*3+0] = i*8+0; sco_ind[(i*6+0)*3+1] = i*8+1; sco_ind[(i*6+0)*3+2] = i*8+3;
sco_ind[(i*6+1)*3+0] = i*8+0; sco_ind[(i*6+1)*3+1] = i*8+3; sco_ind[(i*6+1)*3+2] = i*8+2;
sco_ind[(i*6+2)*3+0] = i*8+2; sco_ind[(i*6+2)*3+1] = i*8+4; sco_ind[(i*6+2)*3+2] = i*8+5;
sco_ind[(i*6+3)*3+0] = i*8+2; sco_ind[(i*6+3)*3+1] = i*8+5; sco_ind[(i*6+3)*3+2] = i*8+3;
sco_ind[(i*6+4)*3+0] = i*8+4; sco_ind[(i*6+4)*3+1] = i*8+6; sco_ind[(i*6+4)*3+2] = i*8+7;
sco_ind[(i*6+5)*3+0] = i*8+4; sco_ind[(i*6+5)*3+1] = i*8+7; sco_ind[(i*6+5)*3+2] = i*8+5;
}
}
// getsamp does some averaging over samples
static float getsamp(const float *data, int len, int i, int w) {
float sum = 0, err = 0;
int l = IMAX(i-w, 0);
int u = IMIN(i+w, len);
for(int i = l; i < u; i++)
sum+= data[i];
return sum / (2*w);
}
// R holds a rotation matrix... it's the transpose of what you would give GL though
// because of reasons :P (I wrote code that did all the stuff from this program in
// software first and the GL version shares a bunch of code with that one)
// data is audio samples, [-1, 1], the length of the array is in len
void render_scope(float R[3][3], const float *data, int len)
{
// do the rotate/project ourselves because the GL matrix won't do the right
// thing if we just send it our vertices; we want our tris to always be
// parallel to the view plane, because we're actually drawing a fuzzy line
// not a 3D object
// also it makes it easier to match the software implementation
float px, py;
{
float s = getsamp(data, len, 0, len/96);
s=copysignf(log2f(fabsf(s)*3+1)/2, s);
float xt = -0.5f, yt = 0.2f*s, zt = 0.0f;
float x = R[0][0]*xt + R[1][0]*yt + R[2][0]*zt;
float y = R[0][1]*xt + R[1][1]*yt + R[2][1]*zt;
float z = R[0][2]*xt + R[1][2]*yt + R[2][2]*zt;
const float zvd = 1/(z+2);
px=x*zvd*4/3; py=y*zvd*4/3;
}
for(int i=0; i<samp; i++) {
float s = getsamp(data, len, (i+1)*len/(samp), len/96);
s=copysignf(log2f(fabsf(s)*3+1)/2, s);
float xt = (i+1 - (samp)/2.0f)*(1.0f/(samp)), yt = 0.2f*s, zt = 0.0f;
float x = R[0][0]*xt + R[1][0]*yt + R[2][0]*zt;
float y = R[0][1]*xt + R[1][1]*yt + R[2][1]*zt;
float z = R[0][2]*xt + R[1][2]*yt + R[2][2]*zt;
const float zvd = 1/(z+2);
x=x*zvd*4/3; y=y*zvd*4/3;
const float dx=x-px, dy=y-py;
const float d = 1/hypotf(dx, dy);
const float tx=dx*d*pw, ty=dy*d*pw;
const float nx=-dy*d*pw, ny=dx*d*ph;
sco_verts[(i*8+0)*4+2] = px-nx-tx; sco_verts[(i*8+0)*4+3] = py-ny-ty;
sco_verts[(i*8+1)*4+2] = px+nx-tx; sco_verts[(i*8+1)*4+3] = py+ny-ty;
sco_verts[(i*8+2)*4+2] = px-nx ; sco_verts[(i*8+2)*4+3] = py-ny;
sco_verts[(i*8+3)*4+2] = px+nx ; sco_verts[(i*8+3)*4+3] = py+ny;
sco_verts[(i*8+4)*4+2] = x-nx ; sco_verts[(i*8+4)*4+3] = y-ny;
sco_verts[(i*8+5)*4+2] = x+nx ; sco_verts[(i*8+5)*4+3] = y+ny;
sco_verts[(i*8+6)*4+2] = x-nx+tx; sco_verts[(i*8+6)*4+3] = y-ny+ty;
sco_verts[(i*8+7)*4+2] = x+nx+tx; sco_verts[(i*8+7)*4+3] = y+ny+ty;
px=x,py=y;
}
glEnable(GL_BLEND);
glBlendEquation(GL_MAX);
glUseProgram(shader_prog);
glColor4f(1.0f, 1.0f, 1.0f, 1.0f);
glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_TEXTURE_COORD_ARRAY);
glTexCoordPointer(2, GL_FLOAT, sizeof(float)*4, sco_verts);
glVertexPointer(2, GL_FLOAT, sizeof(float)*4, sco_verts + 2);
glDrawElements(GL_TRIANGLES, samp*3*6, GL_UNSIGNED_INT, sco_ind);
}
Here's a screenshot from a test app. I'm not sure the line width is right in this screenshot, but it gives the idea; I'd also be using way more points, so the lines would be smoother.
(screenshot omitted)
Decode from VP8 video frame to RGB

In my application, we need to display video frames on the screen. I use libvpx to decode video from WebM, but the frames are decoded to a YUV format (VPX_IMG_FMT_I420, according to the documentation). I need the output format to be RGB, and the documentation says the image struct supports an RGB format (VPX_IMG_FMT_RGB24). I have a formula for translating YUV -> RGB:
R = Y + 1.13983 * (V - 128);
G = Y - 0.39465 * (U - 128) - 0.58060 * (V - 128);
B = Y + 2.03211 * (U - 128);
But I think that is too many conversions (VP8 -> YUV -> RGB). Is there a way to set the output frame format of the decode function?
If you can afford to use Intel's IPP library, here is a CPU-friendly piece of code that you can try in your project:
unsigned char* mpRGBBuffer = NULL;

void VPXImageToRGB24(vpx_image_t* pImage, bool isUsingBGR)
{
    const unsigned int rgbBufferSize = pImage->d_w * pImage->d_h * 3;
    if (!mpRGBBuffer) mpRGBBuffer = new unsigned char[rgbBufferSize]; // allocate (or reuse) your raw RGB buffer
    const IppiSize sz = { (int)pImage->d_w, (int)pImage->d_h };
    const Ipp8u* src[3] = { pImage->planes[VPX_PLANE_Y], pImage->planes[VPX_PLANE_U], pImage->planes[VPX_PLANE_V] };
    int srcStep[3] = { pImage->stride[VPX_PLANE_Y], pImage->stride[VPX_PLANE_U], pImage->stride[VPX_PLANE_V] };
    if (isUsingBGR) ippiYCbCr420ToBGR_8u_P3C3R(src, srcStep, mpRGBBuffer, pImage->d_w * 3, sz);
    else ippiYCbCr420ToRGB_8u_P3C3R(src, srcStep, mpRGBBuffer, pImage->d_w * 3, sz);
}
If you don't want to use IPP, here is a link to a working piece of code that could really be useful. I tested it and it works 100%, but I'm not sure about the CPU cost.
Here is the code from the link above (in case link fails...)
inline int clamp8(int v)
{
    return std::min(std::max(v, 0), 255);
}

Image VP8Decoder::convertYV12toRGB(const vpx_image_t* img)
{
    Image rgbImg(img->d_w, img->d_h);
    std::vector<uint8_t>& data = rgbImg.data;
    uint8_t* yPlane = img->planes[VPX_PLANE_Y];
    uint8_t* uPlane = img->planes[VPX_PLANE_U];
    uint8_t* vPlane = img->planes[VPX_PLANE_V];
    int i = 0;
    for (unsigned int imgY = 0; imgY < img->d_h; imgY++) {
        for (unsigned int imgX = 0; imgX < img->d_w; imgX++) {
            int y = yPlane[imgY * img->stride[VPX_PLANE_Y] + imgX];
            int u = uPlane[(imgY / 2) * img->stride[VPX_PLANE_U] + (imgX / 2)];
            int v = vPlane[(imgY / 2) * img->stride[VPX_PLANE_V] + (imgX / 2)];
            int c = y - 16;
            int d = (u - 128);
            int e = (v - 128);
            // TODO: adjust colors ?
            int r = clamp8((298 * c + 409 * e + 128) >> 8);
            int g = clamp8((298 * c - 100 * d - 208 * e + 128) >> 8);
            int b = clamp8((298 * c + 516 * d + 128) >> 8);
            // TODO: cast instead of clamp8
            data[i + 0] = static_cast<uint8_t>(r);
            data[i + 1] = static_cast<uint8_t>(g);
            data[i + 2] = static_cast<uint8_t>(b);
            i += 3;
        }
    }
    return rgbImg;
}
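For context, here is a sketch of how this conversion might be driven after decoding; codec is an assumed, already-initialized vpx_codec_ctx_t, and decoder an instance of VP8Decoder:
vpx_codec_iter_t iter = NULL;
vpx_image_t* img = NULL;
while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
{
    Image rgb = decoder.convertYV12toRGB(img);
    // hand rgb off to your display path...
}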
