Skip to content

Commit

Permalink
Next Event Estimation for wavefront pt.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Dec 19, 2024
1 parent 299bc27 commit f8098a2
Show file tree
Hide file tree
Showing 3 changed files with 325 additions and 55 deletions.
25 changes: 15 additions & 10 deletions tiny_bvh_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections, * triD
// View pyramid for a pinhole camera
struct RenderData
{
bvhvec4 eye = bvhvec4( 0, 30, 0, 0 );
bvhvec4 view = bvhvec4( -1, 0, 0, 0 );
bvhvec4 C, p0, p1, p2;
bvhvec4 eye = bvhvec4( 0, 30, 0, 0 ), view = bvhvec4( -1, 0, 0, 0 ), C, p0, p1, p2;
uint32_t frameIdx, dummy1, dummy2, dummy3;
} rd;

Expand All @@ -46,6 +44,15 @@ void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, in
for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++)
*(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? c : b[3], b += 4;
}
void AddQuad( const bvhvec3 pos, const float w, const float d, int c )
{
bvhvec4* data = (bvhvec4*)tinybvh::malloc64( (triCount + 2) * 48 );
if (tris) memcpy( data + 6, tris, triCount * 48 ), tinybvh::free64( tris );
data[0] = bvhvec3( -w, 0, -d ), data[1] = bvhvec3( w, 0, -d );
data[2] = bvhvec3( w, 0, d ), data[3] = bvhvec3( -w, 0, -d ), tris = data;
data[4] = bvhvec3( w, 0, d ), data[5] = bvhvec3( -w, 0, d ), triCount += 2;
for( int i = 0; i < 6; i++ ) data[i] = 0.5f * data[i] + pos, data[i].w = *(float*)&c;
}

Buffer* cwbvhNodes = 0;
Buffer* cwbvhTris = 0;
Expand Down Expand Up @@ -73,6 +80,7 @@ void Init()
// load raw vertex data
AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff );
AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff );
AddQuad( bvhvec3( 0, 30, -1 ), 9, 5, 0x1ffffff );
// build bvh (here: 'compressed wide bvh', for efficient GPU rendering)
bvh.Build( tris, triCount );
// create gpu buffers
Expand All @@ -99,12 +107,9 @@ bool UpdateCamera( float delta_time_s, fenster& f )
bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right );
// get camera controls.
bool moved = false;
if (f.keys['A']) rd.eye += right * -1.0f * delta_time_s * 10, moved = true;
if (f.keys['D']) rd.eye += right * delta_time_s * 10, moved = true;
if (f.keys['W']) rd.eye += rd.view * delta_time_s * 10, moved = true;
if (f.keys['S']) rd.eye += rd.view * -1.0f * delta_time_s * 10, moved = true;
if (f.keys['R']) rd.eye += up * delta_time_s * 20, moved = true;
if (f.keys['F']) rd.eye += up * -1.0f * delta_time_s * 20, moved = true;
if (f.keys['A'] || f.keys['D']) rd.eye += right * delta_time_s * (f.keys['D'] ? 10 : -10), moved = true;
if (f.keys['W'] || f.keys['S']) rd.eye += rd.view * delta_time_s * (f.keys['W'] ? 10 : -10), moved = true;
if (f.keys['R'] || f.keys['F']) rd.eye += up * delta_time_s * (f.keys['R'] ? 20 : -20), moved = true;
if (f.keys[20]) rd.view = normalize( rd.view + right * -1.0f * delta_time_s ), moved = true;
if (f.keys[19]) rd.view = normalize( rd.view + right * delta_time_s ), moved = true;
if (f.keys[17]) rd.view = normalize( rd.view + up * -1.0f * delta_time_s ), moved = true;
Expand Down Expand Up @@ -132,7 +137,7 @@ void Tick( float delta_time_s, fenster& f, uint32_t* buf )
init->Run( 1 ); // init atomic counters, set buffer ptrs etc.
generate->SetArguments( raysOut );
generate->Run2D( oclint2( SCRWIDTH, SCRHEIGHT ) );
for (int i = 0; i < 4; i++)
for (int i = 0; i < 2; i++)
{
swap( raysOut, raysIn );
extend->SetArguments( raysIn );
Expand Down
248 changes: 248 additions & 0 deletions traverse.cl
Original file line number Diff line number Diff line change
Expand Up @@ -832,6 +832,254 @@ float4 traverse_cwbvh( global const float4* cwbvhNodes, global const float4* cwb
return hit;
}

bool isoccluded_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, const float3 O, const float3 D, const float3 rD, const float t )
{
// initialize ray
const unsigned threadId = get_global_id( 0 );
#ifdef SIMD_AABBTEST
const float4 O4 = (float4)( O, 1 ); // rayData[threadId].O;
const float4 D4 = (float4)( D, 0 ); // rayData[threadId].D;
const float4 rD4 = (float4)( rD, 1 ); // rayData[threadId].rD;
#endif // otherwise, we'll use the float3 input directly.
// prepare traversal
uint2 stack[STACK_SIZE];
uint stackPtr = 0;
float tmax = t;
#ifdef SIMD_AABBTEST
const uint octinv4 = (7 - ((D4.x < 0 ? 4 : 0) | (D4.y < 0 ? 2 : 0) | (D4.z < 0 ? 1 : 0))) * 0x1010101;
#else
const uint octinv4 = (7 - ((D.x < 0 ? 4 : 0) | (D.y < 0 ? 2 : 0) | (D.z < 0 ? 1 : 0))) * 0x1010101;
#endif
uint2 ngroup = (uint2)(0, 0b10000000000000000000000000000000), tgroup = (uint2)(0);
do
{
if (ngroup.y > 0x00FFFFFF)
{
const unsigned hits = ngroup.y, imask = ngroup.y;
const unsigned child_bit_index = __bfind( hits );
const unsigned child_node_base_index = ngroup.x;
ngroup.y &= ~(1 << child_bit_index);
if (ngroup.y > 0x00FFFFFF) { STACK_PUSH( ngroup ); }
{
const unsigned slot_index = (child_bit_index - 24) ^ (octinv4 & 255);
const unsigned relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) );
const unsigned child_node_index = child_node_base_index + relative_index;
#ifdef USE_VLOAD_VSTORE
const float* p = (float*)&cwbvhNodes[child_node_index * 5 + 0];
float4 n0 = vload4( 0, p ), n1 = vload4( 1, p ), n2 = vload4( 2, p );
float4 n3 = vload4( 3, p ), n4 = vload4( 4, p );
#else
float4 n0 = cwbvhNodes[child_node_index * 5 + 0], n1 = cwbvhNodes[child_node_index * 5 + 1];
float4 n2 = cwbvhNodes[child_node_index * 5 + 2], n3 = cwbvhNodes[child_node_index * 5 + 3];
float4 n4 = cwbvhNodes[child_node_index * 5 + 4];
#endif
const char4 e = as_char4( n0.w );
ngroup.x = as_uint( n1.x ), tgroup = (uint2)(as_uint( n1.y ), 0);
unsigned hitmask = 0;
#ifdef SIMD_AABBTEST
const float4 idir4 = (float4)(
as_float( (e.x + 127) << 23 ) * rD4.x, as_float( (e.y + 127) << 23 ) * rD4.y,
as_float( (e.z + 127) << 23 ) * rD4.z, 1
);
const float4 orig4 = (n0 - O4) * rD4;
#else
const float idirx = as_float( (e.x + 127) << 23 ) * rD.x;
const float idiry = as_float( (e.y + 127) << 23 ) * rD.y;
const float idirz = as_float( (e.z + 127) << 23 ) * rD.z;
const float origx = (n0.x - O.x) * rD.x;
const float origy = (n0.y - O.y) * rD.y;
const float origz = (n0.z - O.z) * rD.z;
#endif
{ // first 4
const unsigned meta4 = as_uint( n1.z ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010;
const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 );
const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F;
const unsigned child_bits4 = (meta4 >> 5) & 0x07070707;
const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.z : n2.x ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.x : n3.z ) );
const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.x : n2.z ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.z : n4.x ) );
const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.z : n3.x ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.x : n4.z ) );
{
#ifdef SIMD_AABBTEST
const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx;
const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy;
const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz;
const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 );
const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax );
const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 );
const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax );
if (cmina <= cmaxa) UPDATE_HITMASK;
if (cminb <= cmaxb) UPDATE_HITMASK1;
#else
float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx );
float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy );
float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz );
float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx );
float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy );
float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz );
n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
if (n0.x <= n0.y) UPDATE_HITMASK;
if (n1.x <= n1.y) UPDATE_HITMASK1;
#endif
#ifdef SIMD_AABBTEST
const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 );
const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax );
const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 );
const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax );
if (cminc <= cmaxc) UPDATE_HITMASK2;
if (cmind <= cmaxd) UPDATE_HITMASK3;
#else
tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx );
tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy );
tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz );
tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx );
tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy );
tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz );
n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
if (n0.x <= n0.y) UPDATE_HITMASK2;
if (n1.x <= n1.y) UPDATE_HITMASK3;
#endif
}
}
{ // second 4
const unsigned meta4 = as_uint( n1.w ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010;
const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 );
const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F;
const unsigned child_bits4 = (meta4 >> 5) & 0x07070707;
const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.w : n2.y ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.y : n3.w ) );
const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.y : n2.w ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.w : n4.y ) );
const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.w : n3.y ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.y : n4.w ) );
{
#ifdef SIMD_AABBTEST
const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx;
const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy;
const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz;
const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 );
const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax );
const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 );
const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax );
if (cmina <= cmaxa) UPDATE_HITMASK0;
if (cminb <= cmaxb) UPDATE_HITMASK1;
#else
float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx );
float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy );
float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz );
float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx );
float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy );
float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz );
n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
if (n0.x <= n0.y) UPDATE_HITMASK0;
if (n1.x <= n1.y) UPDATE_HITMASK1;
#endif
#ifdef SIMD_AABBTEST
const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 );
const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax );
const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 );
const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax );
if (cminc <= cmaxc) UPDATE_HITMASK2;
if (cmind <= cmaxd) UPDATE_HITMASK3;
#else
tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx );
tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy );
tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz );
tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx );
tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy );
tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz );
n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
if (n0.x <= n0.y) UPDATE_HITMASK2;
if (n1.x <= n1.y) UPDATE_HITMASK3;
#endif
}
}
ngroup.y = (hitmask & 0xFF000000) | (as_uint( n0.w ) >> 24), tgroup.y = hitmask & 0x00FFFFFF;
}
}
else tgroup = ngroup, ngroup = (uint2)(0);
while (tgroup.y != 0)
{
#ifdef CWBVH_COMPRESSED_TRIS
// Fast intersection of triangle data for the algorithm in:
// "Fast Ray-Triangle Intersections by Coordinate Transformation"
// Baldwin & Weber, 2016.
const unsigned triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 4;
const float4 T2 = cwbvhTris[triAddr + 2];
const float transS = T2.x * O.x + T2.y * O.y + T2.z * O.z + T2.w;
const float transD = T2.x * D.x + T2.y * D.y + T2.z * D.z;
const float d = -transS / transD;
if (d > 0 && d < tmax)
{
const float4 T0 = cwbvhTris[triAddr + 0];
const float4 T1 = cwbvhTris[triAddr + 1];
#ifdef SIMD_AABBTEST
const float4 I = O4 + d * D4;
#else
const float3 I = O + d * D;
#endif
const float u = T0.x * I.x + T0.y * I.y + T0.z * I.z + T0.w;
const float v = T1.x * I.x + T1.y * I.y + T1.z * I.z + T1.w;
const bool hit = u >= 0 && v >= 0 && u + v < 1;
if (hit) return true;
}
#else
// Möller-Trumbore intersection; triangles are stored as 3x16 bytes,
// with the original primitive index in the (otherwise unused) w
// component of vertex 0.
const int triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 3;
const float3 v0 = cwbvhTris[triAddr].xyz;
const float3 e1 = cwbvhTris[triAddr + 1].xyz - v0;
const float3 e2 = cwbvhTris[triAddr + 2].xyz - v0;
#ifdef SIMD_AABBTEST
const float3 r = cross( D4.xyz, e2 );
#else
const float3 r = cross( D, e2 );
#endif
const float a = dot( e1, r );
if (fabs( a ) > 0.0000001f)
{
const float f = 1 / a;
#ifdef SIMD_AABBTEST
const float3 s = O4.xyz - v0;
#else
const float3 s = O - v0;
#endif
const float u = f * dot( s, r );
if (u >= 0 && u <= 1)
{
const float3 q = cross( s, e1 );
#ifdef SIMD_AABBTEST
const float v = f * dot( D4.xyz, q );
#else
const float v = f * dot( D, q );
#endif
if (v >= 0 && u + v <= 1)
{
const float d = f * dot( e2, q );
if (d > 0.0f && d < tmax) return true;
}
}
}
#endif
tgroup.y -= 1 << triangleIndex;
}
if (ngroup.y <= 0x00FFFFFF)
{
if (stackPtr > 0) { STACK_POP( ngroup ); } else break;
}
} while (true);
return false; // no occlusion found.
}

void kernel batch_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, global struct Ray* rayData )
{
// initialize ray
Expand Down
Loading

0 comments on commit f8098a2

Please sign in to comment.