From 6feadfb612f015abf05f04b16127c34cf97527c0 Mon Sep 17 00:00:00 2001 From: jbikker Date: Mon, 4 Nov 2024 13:47:11 +0100 Subject: [PATCH] Added plain C packet traversal. --- tiny_bvh.h | 102 +++++++++++++++++++++++++++++++++++++++++ tiny_bvh_fenster.cpp | 54 ++++++++++++++-------- tiny_bvh_speedtest.cpp | 47 +++++++++++++++---- 3 files changed, 175 insertions(+), 28 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index 320547f..16d8a37 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -320,6 +320,7 @@ class BVH void BuildAVX( const bvhvec4* vertices, const unsigned int primCount ); void Refit(); int Intersect( Ray& ray ) const; + void Intersect256Rays( Ray* first ) const; private: void IntersectTri( Ray& ray, const unsigned int triIdx ) const; static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ); @@ -717,6 +718,107 @@ int BVH::Intersect( Ray& ray ) const return steps; } +// Intersect a BVH with a ray packet. +// The 256 rays travel together to better utilize the caches and to amortize the cost +// of memory transfers over the rays in the bundle. +// Note that this basic implementation assumes a specific layout of the rays. Provided +// as 'proof of concept', should not be used in production code. +void BVH::Intersect256Rays( Ray* packet ) const +{ + // Corner rays are: 0, 51, 204 and 255 + // Construct the bounding planes, with normals pointing outwards + bvhvec3 O = packet[0].O; // same for all rays in this case + bvhvec3 p0 = packet[0].O + packet[0].D; // top-left + bvhvec3 p1 = packet[51].O + packet[51].D; // top-right + bvhvec3 p2 = packet[204].O + packet[204].D; // bottom-left + bvhvec3 p3 = packet[255].O + packet[255].D; // bottom-right + bvhvec3 plane0 = normalize( cross( p0 - O, p0 - p2 ) ); // left plane + bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane + bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane + bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane + float t0 = dot( O, plane0 ), t1 = dot( O, plane1 ); + float t2 = dot( O, plane2 ), t3 = dot( O, plane3 ); + // Traverse the tree with the packet + int first = 0, last = 255, tmp; // first and last active ray in the packet + BVHNode* node = &bvhNode[0], * stack[64]; + unsigned int stackPtr = 0, firstLast[64]; + while (1) + { + // 1. Early-in test: if first ray hits the node, the packet visits the node + bool earlyHit = IntersectAABB( packet[first], node->aabbMin, node->aabbMax ) < 1e30f; + // 2. Early-out test: if the node aabb is outside the four planes, we skip the node + if (!earlyHit) + { + const bvhvec3 bmin = node->aabbMin, bmax = node->aabbMax; + bvhvec3 p0( plane0.x < 0 ? bmax.x : bmin.x, plane0.y < 0 ? bmax.y : bmin.y, plane0.z < 0 ? bmax.z : bmin.z ); + bvhvec3 p1( plane1.x < 0 ? bmax.x : bmin.x, plane1.y < 0 ? bmax.y : bmin.y, plane1.z < 0 ? bmax.z : bmin.z ); + bvhvec3 p2( plane2.x < 0 ? bmax.x : bmin.x, plane2.y < 0 ? bmax.y : bmin.y, plane2.z < 0 ? bmax.z : bmin.z ); + bvhvec3 p3( plane3.x < 0 ? bmax.x : bmin.x, plane3.y < 0 ? bmax.y : bmin.y, plane3.z < 0 ? bmax.z : bmin.z ); + if (dot( p0, plane0 ) > t0 || dot( p1, plane1 ) > t1 || dot( p2, plane2 ) > t2 || dot( p3, plane3 ) > t3) + { + if (stackPtr == 0) break; else // pop + node = stack[--stackPtr], tmp = firstLast[stackPtr], + first = tmp >> 8, last = tmp & 255; + } + else + { + // 3. Last resort: update first and last, stay in node if first > last + for( ; first <= last; first++ ) + if (IntersectAABB( packet[first], node->aabbMin, node->aabbMax ) < 1e30f) break; + for( ; last >= first; last-- ) + if (IntersectAABB( packet[last], node->aabbMin, node->aabbMax ) < 1e30f) break; + } + } + // process result + if (last >= first) + { + if (node->isLeaf()) + { + for (unsigned int j = 0; j < node->triCount; j++) + { + const unsigned int idx = triIdx[node->leftFirst + j]; + const unsigned int vertIdx = idx * 3; + const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; + const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; + for( int i = first; i <= last; i++ ) + { + // Moeller-Trumbore ray/triangle intersection algorithm + Ray& ray = packet[i]; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) continue; + const float t = f * dot( edge2, q ); + if (t > 0 && t < ray.hit.t) + { + // register a hit: ray is shortened to t + ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx; + } + } + } + if (stackPtr == 0) break; else // pop + node = stack[--stackPtr], tmp = firstLast[stackPtr], + first = tmp >> 8, last = tmp & 255; + } + else + { + firstLast[stackPtr] = (first << 8) + last; + stack[stackPtr++] = &bvhNode[node->leftFirst + 1]; + node = &bvhNode[node->leftFirst]; + } + } + else if (stackPtr == 0) break; else // pop + node = stack[--stackPtr], tmp = firstLast[stackPtr], + first = tmp >> 8, last = tmp & 255; + } +} + // IntersectTri void BVH::IntersectTri( Ray& ray, const unsigned int idx ) const { diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp index 0b0df62..3078252 100644 --- a/tiny_bvh_fenster.cpp +++ b/tiny_bvh_fenster.cpp @@ -49,37 +49,53 @@ void Tick( uint32_t* buf ) bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view; bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up; - // generate primary rays in a buffer + // generate primary rays in a cacheline-aligned buffer - and, for data locality: + // organized in 4x4 pixel tiles, 16 samples per pixel, so 256 rays per tile. int N = 0; - Ray* rays = new Ray[SCRWIDTH * SCRHEIGHT * 16]; - for (int y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++) + Ray* rays = (Ray*)ALIGNED_MALLOC( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) ); + for( int ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ ) { - for (int s = 0; s < 16; s++) // 16 samples per pixel + for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++) { - float u = (float)(x * 4 + (s & 3)) / (SCRWIDTH * 4); - float v = (float)(y * 4 + (s >> 2)) / (SCRHEIGHT * 4); - bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1); - rays[N++] = Ray( eye, normalize( P - eye ) ); + int pixel_x = tx * 4 + x; + int pixel_y = ty * 4 + y; + for (int s = 0; s < 16; s++) // 16 samples per pixel + { + float u = (float)(pixel_x * 4 + (s & 3)) / (SCRWIDTH * 4); + float v = (float)(pixel_y * 4 + (s >> 2)) / (SCRHEIGHT * 4); + bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1); + rays[N++] = Ray( eye, normalize( P - eye ) ); + } } } // trace primary rays +#if 1 + const int packetCount = N / 256; + for (int i = 0; i < packetCount; i++) bvh.Intersect256Rays( rays + i * 256 ); +#else for (int i = 0; i < N; i++) bvh.Intersect( rays[i] ); +#endif // visualize result - for (int i = 0, y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++) + for( int i = 0, ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ ) { - float avg = 0; - for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 1000) + for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++) { - int primIdx = rays[i].hit.prim; - bvhvec3 v0 = triangles[primIdx * 3 + 0]; - bvhvec3 v1 = triangles[primIdx * 3 + 1]; - bvhvec3 v2 = triangles[primIdx * 3 + 2]; - bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) ); - avg += fabs( dot( N, normalize( bvhvec3( 1, 2, 3 ) ) ) ); + int pixel_x = tx * 4 + x; + int pixel_y = ty * 4 + y; + float avg = 0; + for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 1000) + { + int primIdx = rays[i].hit.prim; + bvhvec3 v0 = triangles[primIdx * 3 + 0]; + bvhvec3 v1 = triangles[primIdx * 3 + 1]; + bvhvec3 v2 = triangles[primIdx * 3 + 2]; + bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) ); + avg += fabs( dot( N, normalize( bvhvec3( 1, 2, 3 ) ) ) ); + } + int c = (int)(15.9f * avg); + buf[pixel_x + pixel_y * SCRWIDTH] = c + (c << 8) + (c << 16); } - int c = (int)(15.9f * avg); - buf[x + y * SCRWIDTH] = c + (c << 8) + (c << 16); } } diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index cd5e75c..f0b569c 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -67,17 +67,23 @@ int main() bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view; bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up; - // generate primary rays in a cacheline-aligned buffer + // generate primary rays in a cacheline-aligned buffer - and, for data locality: + // organized in 4x4 pixel tiles, 16 samples per pixel, so 256 rays per tile. int N = 0; Ray* rays = (Ray*)ALIGNED_MALLOC( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) ); - for (int y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++) + for( int ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ ) { - for (int s = 0; s < 16; s++) // 16 samples per pixel + for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++) { - float u = (float)(x * 4 + (s & 3)) / (SCRWIDTH * 4); - float v = (float)(y * 4 + (s >> 2)) / (SCRHEIGHT * 4); - bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1); - rays[N++] = Ray( eye, normalize( P - eye ) ); + int pixel_x = tx * 4 + x; + int pixel_y = ty * 4 + y; + for (int s = 0; s < 16; s++) // 16 samples per pixel + { + float u = (float)(pixel_x * 4 + (s & 3)) / (SCRWIDTH * 4); + float v = (float)(pixel_y * 4 + (s >> 2)) / (SCRHEIGHT * 4); + bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1); + rays[N++] = Ray( eye, normalize( P - eye ) ); + } } } @@ -88,12 +94,15 @@ int main() printf( "----------------------------------------------------------------\n" ); Timer t; + float mrays; // measure single-core bvh construction time - warming caches printf( "BVH construction speed\n" ); printf( "warming caches...\n" ); bvh.Build( (bvhvec4*)triangles, verts / 3 ); +#if 1 + // measure single-core bvh construction time - reference builder t.reset(); printf( "- reference builder: " ); @@ -125,7 +134,7 @@ int main() for (int pass = 0; pass < 3; pass++) for (int i = 0; i < N; i++) bvh.Intersect( rays[i] ); float traceTimeST = t.elapsed() / 3.0f; - float mrays = (float)N / traceTimeST; + mrays = (float)N / traceTimeST; printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeST * 1000, (float)N * 1e-6f, mrays * 1e-6f ); // trace all rays three times to estimate average performance @@ -146,7 +155,27 @@ int main() mrays = (float)N / traceTimeMT; printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeMT * 1000, (float)N * 1e-6f, mrays * 1e-6f ); - // shuffle rays for the next experiment +#endif + + // trace all rays three times to estimate average performance + // - coherent distribution, multi-core, packet traversal + t.reset(); + printf( "- CPU, coherent, basic 2-way layout, MT, packets: " ); + for (int j = 0; j < 3; j++) + { + const int batchCount = N / (30 * 256); // batches of 30 packets of 256 rays + #pragma omp parallel for schedule(dynamic) + for (int batch = 0; batch < batchCount; batch++) + { + const int batchStart = batch * 30 * 256; + for (int i = 0; i < 30; i++) bvh.Intersect256Rays( rays + batchStart + i * 256 ); + } + } + float traceTimeMTP = t.elapsed() / 3.0f; + mrays = (float)N / traceTimeMTP; + printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeMTP * 1000, (float)N * 1e-6f, mrays * 1e-6f ); + + // shuffle rays for the next experiment - TODO: replace by random bounce for( int i = 0; i < N; i++ ) { int j = (i + 17 * rand()) % N;