Skip to content

Commit

Permalink
Added plain C packet traversal.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Nov 4, 2024
1 parent e0789fb commit 6feadfb
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 28 deletions.
102 changes: 102 additions & 0 deletions tiny_bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ class BVH
void BuildAVX( const bvhvec4* vertices, const unsigned int primCount );
void Refit();
int Intersect( Ray& ray ) const;
void Intersect256Rays( Ray* first ) const;
private:
void IntersectTri( Ray& ray, const unsigned int triIdx ) const;
static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax );
Expand Down Expand Up @@ -717,6 +718,107 @@ int BVH::Intersect( Ray& ray ) const
return steps;
}

// Intersect a BVH with a ray packet.
// The 256 rays travel together to better utilize the caches and to amortize the cost
// of memory transfers over the rays in the bundle.
// Note that this basic implementation assumes a specific layout of the rays. Provided
// as 'proof of concept', should not be used in production code.
void BVH::Intersect256Rays( Ray* packet ) const
{
// Corner rays are: 0, 51, 204 and 255
// Construct the bounding planes, with normals pointing outwards
bvhvec3 O = packet[0].O; // same for all rays in this case
bvhvec3 p0 = packet[0].O + packet[0].D; // top-left
bvhvec3 p1 = packet[51].O + packet[51].D; // top-right
bvhvec3 p2 = packet[204].O + packet[204].D; // bottom-left
bvhvec3 p3 = packet[255].O + packet[255].D; // bottom-right
bvhvec3 plane0 = normalize( cross( p0 - O, p0 - p2 ) ); // left plane
bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane
bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane
bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane
float t0 = dot( O, plane0 ), t1 = dot( O, plane1 );
float t2 = dot( O, plane2 ), t3 = dot( O, plane3 );
// Traverse the tree with the packet
int first = 0, last = 255, tmp; // first and last active ray in the packet
BVHNode* node = &bvhNode[0], * stack[64];
unsigned int stackPtr = 0, firstLast[64];
while (1)
{
// 1. Early-in test: if first ray hits the node, the packet visits the node
bool earlyHit = IntersectAABB( packet[first], node->aabbMin, node->aabbMax ) < 1e30f;
// 2. Early-out test: if the node aabb is outside the four planes, we skip the node
if (!earlyHit)
{
const bvhvec3 bmin = node->aabbMin, bmax = node->aabbMax;
bvhvec3 p0( plane0.x < 0 ? bmax.x : bmin.x, plane0.y < 0 ? bmax.y : bmin.y, plane0.z < 0 ? bmax.z : bmin.z );
bvhvec3 p1( plane1.x < 0 ? bmax.x : bmin.x, plane1.y < 0 ? bmax.y : bmin.y, plane1.z < 0 ? bmax.z : bmin.z );
bvhvec3 p2( plane2.x < 0 ? bmax.x : bmin.x, plane2.y < 0 ? bmax.y : bmin.y, plane2.z < 0 ? bmax.z : bmin.z );
bvhvec3 p3( plane3.x < 0 ? bmax.x : bmin.x, plane3.y < 0 ? bmax.y : bmin.y, plane3.z < 0 ? bmax.z : bmin.z );
if (dot( p0, plane0 ) > t0 || dot( p1, plane1 ) > t1 || dot( p2, plane2 ) > t2 || dot( p3, plane3 ) > t3)
{
if (stackPtr == 0) break; else // pop
node = stack[--stackPtr], tmp = firstLast[stackPtr],
first = tmp >> 8, last = tmp & 255;
}
else
{
// 3. Last resort: update first and last, stay in node if first > last
for( ; first <= last; first++ )
if (IntersectAABB( packet[first], node->aabbMin, node->aabbMax ) < 1e30f) break;
for( ; last >= first; last-- )
if (IntersectAABB( packet[last], node->aabbMin, node->aabbMax ) < 1e30f) break;
}
}
// process result
if (last >= first)
{
if (node->isLeaf())
{
for (unsigned int j = 0; j < node->triCount; j++)
{
const unsigned int idx = triIdx[node->leftFirst + j];
const unsigned int vertIdx = idx * 3;
const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx];
const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx];
for( int i = first; i <= last; i++ )
{
// Moeller-Trumbore ray/triangle intersection algorithm
Ray& ray = packet[i];
const bvhvec3 h = cross( ray.D, edge2 );
const float a = dot( edge1, h );
if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle
const float f = 1 / a;
const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] );
const float u = f * dot( s, h );
if (u < 0 || u > 1) continue;
const bvhvec3 q = cross( s, edge1 );
const float v = f * dot( ray.D, q );
if (v < 0 || u + v > 1) continue;
const float t = f * dot( edge2, q );
if (t > 0 && t < ray.hit.t)
{
// register a hit: ray is shortened to t
ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx;
}
}
}
if (stackPtr == 0) break; else // pop
node = stack[--stackPtr], tmp = firstLast[stackPtr],
first = tmp >> 8, last = tmp & 255;
}
else
{
firstLast[stackPtr] = (first << 8) + last;
stack[stackPtr++] = &bvhNode[node->leftFirst + 1];
node = &bvhNode[node->leftFirst];
}
}
else if (stackPtr == 0) break; else // pop
node = stack[--stackPtr], tmp = firstLast[stackPtr],
first = tmp >> 8, last = tmp & 255;
}
}

// IntersectTri
void BVH::IntersectTri( Ray& ray, const unsigned int idx ) const
{
Expand Down
54 changes: 35 additions & 19 deletions tiny_bvh_fenster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,37 +49,53 @@ void Tick( uint32_t* buf )
bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view;
bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up;

// generate primary rays in a buffer
// generate primary rays in a cacheline-aligned buffer - and, for data locality:
// organized in 4x4 pixel tiles, 16 samples per pixel, so 256 rays per tile.
int N = 0;
Ray* rays = new Ray[SCRWIDTH * SCRHEIGHT * 16];
for (int y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++)
Ray* rays = (Ray*)ALIGNED_MALLOC( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) );
for( int ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ )
{
for (int s = 0; s < 16; s++) // 16 samples per pixel
for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++)
{
float u = (float)(x * 4 + (s & 3)) / (SCRWIDTH * 4);
float v = (float)(y * 4 + (s >> 2)) / (SCRHEIGHT * 4);
bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1);
rays[N++] = Ray( eye, normalize( P - eye ) );
int pixel_x = tx * 4 + x;
int pixel_y = ty * 4 + y;
for (int s = 0; s < 16; s++) // 16 samples per pixel
{
float u = (float)(pixel_x * 4 + (s & 3)) / (SCRWIDTH * 4);
float v = (float)(pixel_y * 4 + (s >> 2)) / (SCRHEIGHT * 4);
bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1);
rays[N++] = Ray( eye, normalize( P - eye ) );
}
}
}

// trace primary rays
#if 1
const int packetCount = N / 256;
for (int i = 0; i < packetCount; i++) bvh.Intersect256Rays( rays + i * 256 );
#else
for (int i = 0; i < N; i++) bvh.Intersect( rays[i] );
#endif

// visualize result
for (int i = 0, y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++)
for( int i = 0, ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ )
{
float avg = 0;
for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 1000)
for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++)
{
int primIdx = rays[i].hit.prim;
bvhvec3 v0 = triangles[primIdx * 3 + 0];
bvhvec3 v1 = triangles[primIdx * 3 + 1];
bvhvec3 v2 = triangles[primIdx * 3 + 2];
bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) );
avg += fabs( dot( N, normalize( bvhvec3( 1, 2, 3 ) ) ) );
int pixel_x = tx * 4 + x;
int pixel_y = ty * 4 + y;
float avg = 0;
for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 1000)
{
int primIdx = rays[i].hit.prim;
bvhvec3 v0 = triangles[primIdx * 3 + 0];
bvhvec3 v1 = triangles[primIdx * 3 + 1];
bvhvec3 v2 = triangles[primIdx * 3 + 2];
bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) );
avg += fabs( dot( N, normalize( bvhvec3( 1, 2, 3 ) ) ) );
}
int c = (int)(15.9f * avg);
buf[pixel_x + pixel_y * SCRWIDTH] = c + (c << 8) + (c << 16);
}
int c = (int)(15.9f * avg);
buf[x + y * SCRWIDTH] = c + (c << 8) + (c << 16);
}
}
47 changes: 38 additions & 9 deletions tiny_bvh_speedtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,23 @@ int main()
bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view;
bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up;

// generate primary rays in a cacheline-aligned buffer
// generate primary rays in a cacheline-aligned buffer - and, for data locality:
// organized in 4x4 pixel tiles, 16 samples per pixel, so 256 rays per tile.
int N = 0;
Ray* rays = (Ray*)ALIGNED_MALLOC( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) );
for (int y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++)
for( int ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ )
{
for (int s = 0; s < 16; s++) // 16 samples per pixel
for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++)
{
float u = (float)(x * 4 + (s & 3)) / (SCRWIDTH * 4);
float v = (float)(y * 4 + (s >> 2)) / (SCRHEIGHT * 4);
bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1);
rays[N++] = Ray( eye, normalize( P - eye ) );
int pixel_x = tx * 4 + x;
int pixel_y = ty * 4 + y;
for (int s = 0; s < 16; s++) // 16 samples per pixel
{
float u = (float)(pixel_x * 4 + (s & 3)) / (SCRWIDTH * 4);
float v = (float)(pixel_y * 4 + (s >> 2)) / (SCRHEIGHT * 4);
bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1);
rays[N++] = Ray( eye, normalize( P - eye ) );
}
}
}

Expand All @@ -88,12 +94,15 @@ int main()
printf( "----------------------------------------------------------------\n" );

Timer t;
float mrays;

// measure single-core bvh construction time - warming caches
printf( "BVH construction speed\n" );
printf( "warming caches...\n" );
bvh.Build( (bvhvec4*)triangles, verts / 3 );

#if 1

// measure single-core bvh construction time - reference builder
t.reset();
printf( "- reference builder: " );
Expand Down Expand Up @@ -125,7 +134,7 @@ int main()
for (int pass = 0; pass < 3; pass++)
for (int i = 0; i < N; i++) bvh.Intersect( rays[i] );
float traceTimeST = t.elapsed() / 3.0f;
float mrays = (float)N / traceTimeST;
mrays = (float)N / traceTimeST;
printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeST * 1000, (float)N * 1e-6f, mrays * 1e-6f );

// trace all rays three times to estimate average performance
Expand All @@ -146,7 +155,27 @@ int main()
mrays = (float)N / traceTimeMT;
printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeMT * 1000, (float)N * 1e-6f, mrays * 1e-6f );

// shuffle rays for the next experiment
#endif

// trace all rays three times to estimate average performance
// - coherent distribution, multi-core, packet traversal
t.reset();
printf( "- CPU, coherent, basic 2-way layout, MT, packets: " );
for (int j = 0; j < 3; j++)
{
const int batchCount = N / (30 * 256); // batches of 30 packets of 256 rays
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
{
const int batchStart = batch * 30 * 256;
for (int i = 0; i < 30; i++) bvh.Intersect256Rays( rays + batchStart + i * 256 );
}
}
float traceTimeMTP = t.elapsed() / 3.0f;
mrays = (float)N / traceTimeMTP;
printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeMTP * 1000, (float)N * 1e-6f, mrays * 1e-6f );

// shuffle rays for the next experiment - TODO: replace by random bounce
for( int i = 0; i < N; i++ )
{
int j = (i + 17 * rand()) % N;
Expand Down

0 comments on commit 6feadfb

Please sign in to comment.