diff --git a/tiny_bvh.h b/tiny_bvh.h index 9d94f32..c0272d5 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +// Nov 08, '24: version 0.3.0 // Oct 30, '24: version 0.1.0 : Initial release. // Oct 29, '24: version 0.0.1 : Establishing interface. // @@ -72,7 +73,8 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 0 -#define TINY_BVH_VERSION_MINOR 2 +#define TINY_BVH_VERSION_MINOR 3 +#define TINY_BVH_VERSION_SUB 0 // ============================================================================ // @@ -236,6 +238,15 @@ static bvhvec3 normalize( const bvhvec3& a ) return a * rl; } +// SIMD typedef, helps keeping the interface generic +#ifdef BVH_USEAVX +typedef __m128 SIMDVEC4; +#define SIMD_SETVEC(a,b,c,d) _mm_set_ps( a, b, c, d ) +#else +typedef bvhvec4 SIMDVEC4; +#define SIMD_SETVEC(a,b,c,d) bvhvec4( d, c, b, a ) +#endif + // ============================================================================ // // R A Y T R A C I N G S T R U C T S / C L A S S E S @@ -298,16 +309,14 @@ class BVH bvhvec3 rmax; unsigned int firstTri; // total: 64 bytes bool isLeaf() const { return triCount > 0; } }; -#ifdef BVH_USEAVX struct BVHNodeAlt2 { // Second alternative 64-byte BVH node layout, same as BVHNodeAlt but // with child AABBs stored in SoA order. - __m128 xxxx, yyyy, zzzz; + SIMDVEC4 xxxx, yyyy, zzzz; unsigned int left, right, triCount, firstTri; // total: 64 bytes bool isLeaf() const { return triCount > 0; } }; -#endif struct Fragment { // A fragment stores the bounds of an input primitive. The name 'Fragment' is from @@ -352,11 +361,11 @@ class BVH void Refit(); int Intersect( Ray& ray, BVHLayout layout = WALD_32BYTE ) const; void Intersect256Rays( Ray* first ) const; - void Intersect256RaysSSE( Ray* packet ) const; + void Intersect256RaysSSE( Ray* packet ) const; // requires BVH_USEAVX private: int Intersect_Wald32Byte( Ray& ray ) const; int Intersect_AilaLaine( Ray& ray ) const; - int Intersect_AltSoA( Ray& ray ) const; + int Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX void IntersectTri( Ray& ray, const unsigned int triIdx ) const; static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ); static float SA( const bvhvec3& aabbMin, const bvhvec3& aabbMax ) @@ -373,9 +382,7 @@ class BVH unsigned int newNodePtr = 0; // number of reserved nodes BVHNode* bvhNode = 0; // BVH node pool, Wald 32-byte format. Root is always in node 0. BVHNodeAlt* altNode = 0; // BVH node in Aila & Laine format. -#ifdef BVH_USEAVX BVHNodeAlt2* alt2Node = 0; // BVH node in Aila & Laine (SoA version) format. -#endif }; } // namespace tinybvh @@ -545,7 +552,6 @@ void BVH::Convert( BVHLayout from, BVHLayout to, bool deleteOriginal ) } } } -#ifdef BVH_USEAVX else if (from == WALD_32BYTE && to == ALT_SOA) { // allocate space @@ -570,9 +576,11 @@ void BVH::Convert( BVHLayout from, BVHLayout to, bool deleteOriginal ) { const BVHNode& left = bvhNode[node.leftFirst]; const BVHNode& right = bvhNode[node.leftFirst + 1]; - alt2Node[idx].xxxx = _mm_setr_ps( left.aabbMin.x, left.aabbMax.x, right.aabbMin.x, right.aabbMax.x ); - alt2Node[idx].yyyy = _mm_setr_ps( left.aabbMin.y, left.aabbMax.y, right.aabbMin.y, right.aabbMax.y ); - alt2Node[idx].zzzz = _mm_setr_ps( left.aabbMin.z, left.aabbMax.z, right.aabbMin.z, right.aabbMax.z ); + // This BVH layout requires BVH_USEAVX for traversal, but at least we + // can convert to it without SSE/AVX/NEON support. + alt2Node[idx].xxxx = SIMD_SETVEC( left.aabbMin.x, left.aabbMax.x, right.aabbMin.x, right.aabbMax.x ); + alt2Node[idx].yyyy = SIMD_SETVEC( left.aabbMin.y, left.aabbMax.y, right.aabbMin.y, right.aabbMax.y ); + alt2Node[idx].zzzz = SIMD_SETVEC( left.aabbMin.z, left.aabbMax.z, right.aabbMin.z, right.aabbMax.z ); alt2Node[idx].left = newAlt2Node; // right will be filled when popped stack[stackPtr++] = idx; stack[stackPtr++] = node.leftFirst + 1; @@ -580,7 +588,6 @@ void BVH::Convert( BVHLayout from, BVHLayout to, bool deleteOriginal ) } } } -#endif else { // For now all other conversions are invalid. @@ -588,186 +595,6 @@ void BVH::Convert( BVHLayout from, BVHLayout to, bool deleteOriginal ) } } -#ifdef BVH_USEAVX - -// Ultra-fast single-threaded AVX binned-SAH-builder. -// This code produces BVHs nearly identical to reference, but much faster. -// On a 12th gen laptop i7 CPU, Sponza Crytek (~260k tris) is processed in 51ms. -// The code relies on the availability of AVX instructions. AVX2 is not needed. -#ifdef _MSC_VER -#define LANE(a,b) a.m128_f32[b] -// Not using clang/g++ method under MSCC; compiler may benefit from .m128_i32. -#define ILANE(a,b) a.m128i_i32[b] -#else -#define LANE(a,b) a[b] -// Below method reduces to a single instruction. -#define ILANE(a,b) _mm_cvtsi128_si32(_mm_castps_si128( _mm_shuffle_ps(_mm_castsi128_ps( a ), _mm_castsi128_ps( a ), b))) -#endif -inline float halfArea( const __m128 a /* a contains extent of aabb */ ) -{ - return LANE( a, 0 ) * LANE( a, 1 ) + LANE( a, 1 ) * LANE( a, 2 ) + LANE( a, 2 ) * LANE( a, 3 ); -} -inline float halfArea( const __m256 a /* a contains aabb itself, with min.xyz negated */ ) -{ - const __m128 q = _mm256_castps256_ps128( _mm256_add_ps( _mm256_permute2f128_ps( a, a, 5 ), a ) ); - const __m128 v = _mm_mul_ps( q, _mm_shuffle_ps( q, q, 9 ) ); - return LANE( v, 0 ) + LANE( v, 1 ) + LANE( v, 2 ); -} -#define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \ - ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; if (ANLR < splitCost) \ - splitCost = ANLR, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; } -#if defined(_MSC_VER) -#pragma warning ( push ) -#pragma warning( disable:4701 ) // "potentially uninitialized local variable 'bestLBox' used" -#elif defined(__GNUC__) && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif -void BVH::BuildAVX( const bvhvec4* vertices, const unsigned int primCount ) -{ - int test = BVHBINS; - if (test != 8) assert( false ); // AVX builders require BVHBINS == 8. - // aligned data - ALIGNED( 64 ) __m256 binbox[3 * BVHBINS]; // 768 bytes - ALIGNED( 64 ) __m256 binboxOrig[3 * BVHBINS]; // 768 bytes - ALIGNED( 64 ) unsigned int count[3][BVHBINS]{}; // 96 bytes - ALIGNED( 64 ) __m256 bestLBox, bestRBox; // 64 bytes - // some constants - static const __m128 max4 = _mm_set1_ps( -1e30f ), half4 = _mm_set1_ps( 0.5f ); - static const __m128 two4 = _mm_set1_ps( 2.0f ), min1 = _mm_set1_ps( -1 ); - static const __m256 max8 = _mm256_set1_ps( -1e30f ); - static const __m256 signFlip8 = _mm256_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f ); - static const __m128 signFlip4 = _mm_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f ); - static const __m128 mask3 = _mm_cmpeq_ps( _mm_setr_ps( 0, 0, 0, 1 ), _mm_setzero_ps() ); - static const __m128 binmul3 = _mm_set1_ps( BVHBINS * 0.49999f ); - for (unsigned int i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template - // reset node pool - newNodePtr = 2; - if (!bvhNode) - { - triCount = primCount; - verts = (bvhvec4*)vertices; - triIdx = new unsigned int[triCount]; - bvhNode = (BVHNode*)ALIGNED_MALLOC( triCount * 2 * sizeof( BVHNode ) ); - memset( &bvhNode[1], 0, 32 ); // avoid crash in refit. - fragment = new Fragment[triCount]; - idxCount = triCount; - } - else assert( triCount == primCount ); // don't change triangle count between builds. - struct FragSSE { __m128 bmin4, bmax4; }; - FragSSE* frag4 = (FragSSE*)fragment; - __m256* frag8 = (__m256*)fragment; - const __m128* tris4 = (__m128*)verts; - // assign all triangles to the root node - BVHNode& root = bvhNode[0]; - root.leftFirst = 0, root.triCount = triCount; - // initialize fragments and update root bounds - __m128 rootMin = max4, rootMax = max4; - for (unsigned int i = 0; i < triCount; i++) - { - const __m128 v1 = _mm_xor_ps( signFlip4, _mm_min_ps( _mm_min_ps( tris4[i * 3], tris4[i * 3 + 1] ), tris4[i * 3 + 2] ) ); - const __m128 v2 = _mm_max_ps( _mm_max_ps( tris4[i * 3], tris4[i * 3 + 1] ), tris4[i * 3 + 2] ); - frag4[i].bmin4 = v1, frag4[i].bmax4 = v2, rootMin = _mm_max_ps( rootMin, v1 ), rootMax = _mm_max_ps( rootMax, v2 ), triIdx[i] = i; - } - rootMin = _mm_xor_ps( rootMin, signFlip4 ); - root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax; - // subdivide recursively - ALIGNED( 64 ) unsigned int task[128], taskCount = 0, nodeIdx = 0; - const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-10f; - while (1) - { - while (1) - { - BVHNode& node = bvhNode[nodeIdx]; - __m128* node4 = (__m128*) & bvhNode[nodeIdx]; - // find optimal object split - const __m128 d4 = _mm_blendv_ps( min1, _mm_sub_ps( node4[1], node4[0] ), mask3 ); - const __m128 nmin4 = _mm_mul_ps( _mm_and_ps( node4[0], mask3 ), two4 ); - const __m128 rpd4 = _mm_and_ps( _mm_div_ps( binmul3, d4 ), _mm_cmpneq_ps( d4, _mm_setzero_ps() ) ); - // implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies": - // main loop operates on two fragments to minimize dependencies and maximize ILP. - unsigned int fi = triIdx[node.leftFirst]; - memset( count, 0, sizeof( count ) ); - __m256 r0, r1, r2, f = frag8[fi]; - __m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ); - memcpy( binbox, binboxOrig, sizeof( binbox ) ); - unsigned int i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; - for (unsigned int i = 0; i < node.triCount - 1; i++) - { - const unsigned int fid = *ti++; - const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; - const __m128 fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4; - r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f ); - const __m128i b4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ); - f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++; - binbox[i0] = r0, i0 = ILANE( b4, 0 ); - binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 ); - binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 ); - } - // final business for final fragment - const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; - count[0][i0]++, count[1][i1]++, count[2][i2]++; - r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f ); - binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; - // calculate per-split totals - float splitCost = 1e30f; - unsigned int bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; - const __m256* bb = binbox; - for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) - { - // hardcoded bin processing for BVHBINS == 8, see end of file for generic code. - assert( BVHBINS == 8 ); - const unsigned int lN0 = count[a][0], rN0 = count[a][7]; - const __m256 lb0 = bb[0], rb0 = bb[7]; - const unsigned int lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; - const unsigned int rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; - const __m256 lb1 = _mm256_max_ps( lb0, bb[1] ), rb1 = _mm256_max_ps( rb0, bb[6] ); - const __m256 lb2 = _mm256_max_ps( lb1, bb[2] ), rb2 = _mm256_max_ps( rb1, bb[5] ); - const __m256 lb3 = _mm256_max_ps( lb2, bb[3] ), rb3 = _mm256_max_ps( rb2, bb[4] ); - const unsigned int lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; - const unsigned int rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; - const __m256 lb4 = _mm256_max_ps( lb3, bb[4] ), rb4 = _mm256_max_ps( rb3, bb[3] ); - const __m256 lb5 = _mm256_max_ps( lb4, bb[5] ), rb5 = _mm256_max_ps( rb4, bb[2] ); - const __m256 lb6 = _mm256_max_ps( lb5, bb[6] ), rb6 = _mm256_max_ps( rb5, bb[1] ); - float ANLR3 = 1e30f; PROCESS_PLANE( a, 3, ANLR3, lN3, rN3, lb3, rb3 ); // most likely split - float ANLR2 = 1e30f; PROCESS_PLANE( a, 2, ANLR2, lN2, rN4, lb2, rb4 ); - float ANLR4 = 1e30f; PROCESS_PLANE( a, 4, ANLR4, lN4, rN2, lb4, rb2 ); - float ANLR5 = 1e30f; PROCESS_PLANE( a, 5, ANLR5, lN5, rN1, lb5, rb1 ); - float ANLR1 = 1e30f; PROCESS_PLANE( a, 1, ANLR1, lN1, rN5, lb1, rb5 ); - float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 ); - float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split - } - if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better. - // in-place partition - const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; - unsigned int t, fr = triIdx[src]; - for (unsigned int i = 0; i < node.triCount; i++) - { - const unsigned int bi = (unsigned int)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); - if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t; - } - // create child nodes and recurse - const unsigned int leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; - if (leftCount == 0 || rightCount == 0) break; // should not happen. - *(__m256*)& bvhNode[n] = _mm256_xor_ps( bestLBox, signFlip8 ); - bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount; - node.leftFirst = n++, node.triCount = 0, newNodePtr += 2; - *(__m256*)& bvhNode[n] = _mm256_xor_ps( bestRBox, signFlip8 ); - bvhNode[n].leftFirst = j, bvhNode[n].triCount = rightCount; - task[taskCount++] = n, nodeIdx = n - 1; - } - // fetch subdivision task from stack - if (taskCount == 0) break; else nodeIdx = task[--taskCount]; - } -} -#if defined(_MSC_VER) -#pragma warning ( pop ) // restore 4701 -#elif defined(__GNUC__) && !defined(__clang__) -#pragma GCC diagnostic pop // restore -Wmaybe-uninitialized -#endif - -#endif - // Refitting: For animated meshes, where the topology remains intact. This // includes trees waving in the wind, or subsequent frames for skinned // animations. Repeated refitting tends to lead to deteriorated BVHs and @@ -811,11 +638,9 @@ int BVH::Intersect( Ray& ray, BVHLayout layout ) const case AILA_LAINE: return Intersect_AilaLaine( ray ); break; - #ifdef BVH_USEAVX case ALT_SOA: return Intersect_AltSoA( ray ); break; - #endif default: assert( false ); }; @@ -899,131 +724,46 @@ int BVH::Intersect_AilaLaine( Ray& ray ) const return steps; } -#ifdef BVH_USEAVX - -// Traverse the second alternative BVH layout (ALT_SOA). -int BVH::Intersect_AltSoA( Ray& ray ) const +// Intersect a WALD_32BYTE BVH with a ray packet. +// The 256 rays travel together to better utilize the caches and to amortize the cost +// of memory transfers over the rays in the bundle. +// Note that this basic implementation assumes a specific layout of the rays. Provided +// as 'proof of concept', should not be used in production code. +// Based on Large Ray Packets for Real-time Whitted Ray Tracing, Overbeck et al., 2008, +// extended with sorted traversal and reduced stack traffic. +void BVH::Intersect256Rays( Ray* packet ) const { - assert( alt2Node != 0 ); - BVHNodeAlt2* node = &alt2Node[0], * stack[64]; - unsigned int stackPtr = 0, steps = 0; - const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( ray.rD.x ); - const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y ); - const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z ); + // convenience macro +#define CALC_TMIN_TMAX_WITH_SLABTEST_ON_RAY( r ) const bvhvec3 rD = packet[r].rD, t1 = o1 * rD, t2 = o2 * rD; \ + const float tmin = tinybvh_max( tinybvh_max( tinybvh_min( t1.x, t2.x ), tinybvh_min( t1.y, t2.y ) ), tinybvh_min( t1.z, t2.z ) ); \ + const float tmax = tinybvh_min( tinybvh_min( tinybvh_max( t1.x, t2.x ), tinybvh_max( t1.y, t2.y ) ), tinybvh_max( t1.z, t2.z ) ); + // Corner rays are: 0, 51, 204 and 255 + // Construct the bounding planes, with normals pointing outwards + const bvhvec3 O = packet[0].O; // same for all rays in this case + const bvhvec3 p0 = packet[0].O + packet[0].D; // top-left + const bvhvec3 p1 = packet[51].O + packet[51].D; // top-right + const bvhvec3 p2 = packet[204].O + packet[204].D; // bottom-left + const bvhvec3 p3 = packet[255].O + packet[255].D; // bottom-right + const bvhvec3 plane0 = normalize( cross( p0 - O, p0 - p2 ) ); // left plane + const bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane + const bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane + const bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane + const int sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; + const int sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; + const int sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2; + const int sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2; + const float d0 = dot( O, plane0 ), d1 = dot( O, plane1 ); + const float d2 = dot( O, plane2 ), d3 = dot( O, plane3 ); + // Traverse the tree with the packet + int first = 0, last = 255; // first and last active ray in the packet + const BVHNode* node = &bvhNode[0]; + ALIGNED( 64 ) unsigned int stack[64], stackPtr = 0; while (1) { - steps++; if (node->isLeaf()) { - for (unsigned int i = 0; i < node->triCount; i++) - { - const unsigned int tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; - const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; - const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; - const bvhvec3 h = cross( ray.D, edge2 ); - const float a = dot( edge1, h ); - if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle - const float f = 1 / a; - const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); - const float u = f * dot( s, h ); - if (u < 0 || u > 1) continue; - const bvhvec3 q = cross( s, edge1 ); - const float v = f * dot( ray.D, q ); - if (v < 0 || u + v > 1) continue; - const float t = f * dot( edge2, q ); - if (t < 0 || t > ray.hit.t) continue; - ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = tidx; - } - if (stackPtr == 0) break; else node = stack[--stackPtr]; - continue; - } - __m128 x4 = _mm_mul_ps( _mm_sub_ps( node->xxxx, Ox4 ), rDx4 ); - __m128 y4 = _mm_mul_ps( _mm_sub_ps( node->yyyy, Oy4 ), rDy4 ); - __m128 z4 = _mm_mul_ps( _mm_sub_ps( node->zzzz, Oz4 ), rDz4 ); - // transpose - __m128 t0 = _mm_unpacklo_ps( x4, y4 ), t2 = _mm_unpacklo_ps( z4, z4 ); - __m128 t1 = _mm_unpackhi_ps( x4, y4 ), t3 = _mm_unpackhi_ps( z4, z4 ); - __m128 xyzw1a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - __m128 xyzw2a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); - __m128 xyzw1b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - __m128 xyzw2b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); - // process - __m128 tmina4 = _mm_min_ps( xyzw1a, xyzw2a ), tmaxa4 = _mm_max_ps( xyzw1a, xyzw2a ); - __m128 tminb4 = _mm_min_ps( xyzw1b, xyzw2b ), tmaxb4 = _mm_max_ps( xyzw1b, xyzw2b ); - // transpose back - t0 = _mm_unpacklo_ps( tmina4, tmaxa4 ), t2 = _mm_unpacklo_ps( tminb4, tmaxb4 ); - t1 = _mm_unpackhi_ps( tmina4, tmaxa4 ), t3 = _mm_unpackhi_ps( tminb4, tmaxb4 ); - x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); - z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); - const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() ); - const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) ); - // TODO: use a shuffle here to do the comparison / select with SSE, then extract dist1 and dist2. - const float tmina_0 = LANE( min4, 0 ), tmaxa_1 = LANE( max4, 1 ); - const float tminb_2 = LANE( min4, 2 ), tmaxb_3 = LANE( max4, 3 ); - float dist1 = tmaxa_1 >= tmina_0 ? tmina_0 : 1e30f; - float dist2 = tmaxb_3 >= tminb_2 ? tminb_2 : 1e30f; - unsigned int lidx = node->left, ridx = node->right; - if (dist1 > dist2) - { - float t = dist1; dist1 = dist2; dist2 = t; - unsigned int i = lidx; lidx = ridx; ridx = i; - } - if (dist1 == 1e30f) - { - if (stackPtr == 0) break; else node = stack[--stackPtr]; - } - else - { - node = alt2Node + lidx; - if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx; - } - } - return steps; -} - -#endif - -// Intersect a WALD_32BYTE BVH with a ray packet. -// The 256 rays travel together to better utilize the caches and to amortize the cost -// of memory transfers over the rays in the bundle. -// Note that this basic implementation assumes a specific layout of the rays. Provided -// as 'proof of concept', should not be used in production code. -// Based on Large Ray Packets for Real-time Whitted Ray Tracing, Overbeck et al., 2008, -// extended with sorted traversal and reduced stack traffic. -void BVH::Intersect256Rays( Ray* packet ) const -{ - // convenience macro -#define CALC_TMIN_TMAX_WITH_SLABTEST_ON_RAY( r ) const bvhvec3 rD = packet[r].rD, t1 = o1 * rD, t2 = o2 * rD; \ - const float tmin = tinybvh_max( tinybvh_max( tinybvh_min( t1.x, t2.x ), tinybvh_min( t1.y, t2.y ) ), tinybvh_min( t1.z, t2.z ) ); \ - const float tmax = tinybvh_min( tinybvh_min( tinybvh_max( t1.x, t2.x ), tinybvh_max( t1.y, t2.y ) ), tinybvh_max( t1.z, t2.z ) ); - // Corner rays are: 0, 51, 204 and 255 - // Construct the bounding planes, with normals pointing outwards - const bvhvec3 O = packet[0].O; // same for all rays in this case - const bvhvec3 p0 = packet[0].O + packet[0].D; // top-left - const bvhvec3 p1 = packet[51].O + packet[51].D; // top-right - const bvhvec3 p2 = packet[204].O + packet[204].D; // bottom-left - const bvhvec3 p3 = packet[255].O + packet[255].D; // bottom-right - const bvhvec3 plane0 = normalize( cross( p0 - O, p0 - p2 ) ); // left plane - const bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane - const bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane - const bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane - const int sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2; - const int sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2; - const int sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2; - const int sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2; - const float d0 = dot( O, plane0 ), d1 = dot( O, plane1 ); - const float d2 = dot( O, plane2 ), d3 = dot( O, plane3 ); - // Traverse the tree with the packet - int first = 0, last = 255; // first and last active ray in the packet - const BVHNode* node = &bvhNode[0]; - ALIGNED( 64 ) unsigned int stack[64], stackPtr = 0; - while (1) - { - if (node->isLeaf()) - { - // handle leaf node - for (unsigned int j = 0; j < node->triCount; j++) + // handle leaf node + for (unsigned int j = 0; j < node->triCount; j++) { const unsigned int idx = triIdx[node->leftFirst + j], vid = idx * 3; const bvhvec3 edge1 = verts[vid + 1] - verts[vid], edge2 = verts[vid + 2] - verts[vid]; @@ -1155,8 +895,230 @@ void BVH::Intersect256Rays( Ray* packet ) const } } +// IntersectTri +void BVH::IntersectTri( Ray& ray, const unsigned int idx ) const +{ + // Moeller-Trumbore ray/triangle intersection algorithm + const unsigned int vertIdx = idx * 3; + const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; + const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) return; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) return; + const float t = f * dot( edge2, q ); + if (t > 0 && t < ray.hit.t) + { + // register a hit: ray is shortened to t + ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx; + } +} + +// IntersectAABB +float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ) +{ + // "slab test" ray/AABB intersection + float tx1 = (aabbMin.x - ray.O.x) * ray.rD.x, tx2 = (aabbMax.x - ray.O.x) * ray.rD.x; + float tmin = tinybvh_min( tx1, tx2 ), tmax = tinybvh_max( tx1, tx2 ); + float ty1 = (aabbMin.y - ray.O.y) * ray.rD.y, ty2 = (aabbMax.y - ray.O.y) * ray.rD.y; + tmin = tinybvh_max( tmin, tinybvh_min( ty1, ty2 ) ); + tmax = tinybvh_min( tmax, tinybvh_max( ty1, ty2 ) ); + float tz1 = (aabbMin.z - ray.O.z) * ray.rD.z, tz2 = (aabbMax.z - ray.O.z) * ray.rD.z; + tmin = tinybvh_max( tmin, tinybvh_min( tz1, tz2 ) ); + tmax = tinybvh_min( tmax, tinybvh_max( tz1, tz2 ) ); + if (tmax >= tmin && tmin < ray.hit.t && tmax >= 0) return tmin; else return 1e30f; +} + +// ============================================================================ +// +// I M P L E M E N T A T I O N - S I M D C O D E +// +// ============================================================================ + #ifdef BVH_USEAVX +// Ultra-fast single-threaded AVX binned-SAH-builder. +// This code produces BVHs nearly identical to reference, but much faster. +// On a 12th gen laptop i7 CPU, Sponza Crytek (~260k tris) is processed in 51ms. +// The code relies on the availability of AVX instructions. AVX2 is not needed. +#ifdef _MSC_VER +#define LANE(a,b) a.m128_f32[b] +// Not using clang/g++ method under MSCC; compiler may benefit from .m128_i32. +#define ILANE(a,b) a.m128i_i32[b] +#else +#define LANE(a,b) a[b] +// Below method reduces to a single instruction. +#define ILANE(a,b) _mm_cvtsi128_si32(_mm_castps_si128( _mm_shuffle_ps(_mm_castsi128_ps( a ), _mm_castsi128_ps( a ), b))) +#endif +inline float halfArea( const __m128 a /* a contains extent of aabb */ ) +{ + return LANE( a, 0 ) * LANE( a, 1 ) + LANE( a, 1 ) * LANE( a, 2 ) + LANE( a, 2 ) * LANE( a, 3 ); +} +inline float halfArea( const __m256 a /* a contains aabb itself, with min.xyz negated */ ) +{ + const __m128 q = _mm256_castps256_ps128( _mm256_add_ps( _mm256_permute2f128_ps( a, a, 5 ), a ) ); + const __m128 v = _mm_mul_ps( q, _mm_shuffle_ps( q, q, 9 ) ); + return LANE( v, 0 ) + LANE( v, 1 ) + LANE( v, 2 ); +} +#define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \ + ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; if (ANLR < splitCost) \ + splitCost = ANLR, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; } +#if defined(_MSC_VER) +#pragma warning ( push ) +#pragma warning( disable:4701 ) // "potentially uninitialized local variable 'bestLBox' used" +#elif defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif +void BVH::BuildAVX( const bvhvec4* vertices, const unsigned int primCount ) +{ + int test = BVHBINS; + if (test != 8) assert( false ); // AVX builders require BVHBINS == 8. + // aligned data + ALIGNED( 64 ) __m256 binbox[3 * BVHBINS]; // 768 bytes + ALIGNED( 64 ) __m256 binboxOrig[3 * BVHBINS]; // 768 bytes + ALIGNED( 64 ) unsigned int count[3][BVHBINS]{}; // 96 bytes + ALIGNED( 64 ) __m256 bestLBox, bestRBox; // 64 bytes + // some constants + static const __m128 max4 = _mm_set1_ps( -1e30f ), half4 = _mm_set1_ps( 0.5f ); + static const __m128 two4 = _mm_set1_ps( 2.0f ), min1 = _mm_set1_ps( -1 ); + static const __m256 max8 = _mm256_set1_ps( -1e30f ); + static const __m256 signFlip8 = _mm256_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f ); + static const __m128 signFlip4 = _mm_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f ); + static const __m128 mask3 = _mm_cmpeq_ps( _mm_setr_ps( 0, 0, 0, 1 ), _mm_setzero_ps() ); + static const __m128 binmul3 = _mm_set1_ps( BVHBINS * 0.49999f ); + for (unsigned int i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template + // reset node pool + newNodePtr = 2; + if (!bvhNode) + { + triCount = primCount; + verts = (bvhvec4*)vertices; + triIdx = new unsigned int[triCount]; + bvhNode = (BVHNode*)ALIGNED_MALLOC( triCount * 2 * sizeof( BVHNode ) ); + memset( &bvhNode[1], 0, 32 ); // avoid crash in refit. + fragment = new Fragment[triCount]; + idxCount = triCount; + } + else assert( triCount == primCount ); // don't change triangle count between builds. + struct FragSSE { __m128 bmin4, bmax4; }; + FragSSE* frag4 = (FragSSE*)fragment; + __m256* frag8 = (__m256*)fragment; + const __m128* tris4 = (__m128*)verts; + // assign all triangles to the root node + BVHNode& root = bvhNode[0]; + root.leftFirst = 0, root.triCount = triCount; + // initialize fragments and update root bounds + __m128 rootMin = max4, rootMax = max4; + for (unsigned int i = 0; i < triCount; i++) + { + const __m128 v1 = _mm_xor_ps( signFlip4, _mm_min_ps( _mm_min_ps( tris4[i * 3], tris4[i * 3 + 1] ), tris4[i * 3 + 2] ) ); + const __m128 v2 = _mm_max_ps( _mm_max_ps( tris4[i * 3], tris4[i * 3 + 1] ), tris4[i * 3 + 2] ); + frag4[i].bmin4 = v1, frag4[i].bmax4 = v2, rootMin = _mm_max_ps( rootMin, v1 ), rootMax = _mm_max_ps( rootMax, v2 ), triIdx[i] = i; + } + rootMin = _mm_xor_ps( rootMin, signFlip4 ); + root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax; + // subdivide recursively + ALIGNED( 64 ) unsigned int task[128], taskCount = 0, nodeIdx = 0; + const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-10f; + while (1) + { + while (1) + { + BVHNode& node = bvhNode[nodeIdx]; + __m128* node4 = (__m128*) & bvhNode[nodeIdx]; + // find optimal object split + const __m128 d4 = _mm_blendv_ps( min1, _mm_sub_ps( node4[1], node4[0] ), mask3 ); + const __m128 nmin4 = _mm_mul_ps( _mm_and_ps( node4[0], mask3 ), two4 ); + const __m128 rpd4 = _mm_and_ps( _mm_div_ps( binmul3, d4 ), _mm_cmpneq_ps( d4, _mm_setzero_ps() ) ); + // implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies": + // main loop operates on two fragments to minimize dependencies and maximize ILP. + unsigned int fi = triIdx[node.leftFirst]; + memset( count, 0, sizeof( count ) ); + __m256 r0, r1, r2, f = frag8[fi]; + __m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ); + memcpy( binbox, binboxOrig, sizeof( binbox ) ); + unsigned int i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; + for (unsigned int i = 0; i < node.triCount - 1; i++) + { + const unsigned int fid = *ti++; + const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; + const __m128 fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4; + r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f ); + const __m128i b4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ); + f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++; + binbox[i0] = r0, i0 = ILANE( b4, 0 ); + binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 ); + binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 ); + } + // final business for final fragment + const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; + count[0][i0]++, count[1][i1]++, count[2][i2]++; + r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f ); + binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; + // calculate per-split totals + float splitCost = 1e30f; + unsigned int bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; + const __m256* bb = binbox; + for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + { + // hardcoded bin processing for BVHBINS == 8, see end of file for generic code. + assert( BVHBINS == 8 ); + const unsigned int lN0 = count[a][0], rN0 = count[a][7]; + const __m256 lb0 = bb[0], rb0 = bb[7]; + const unsigned int lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; + const unsigned int rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; + const __m256 lb1 = _mm256_max_ps( lb0, bb[1] ), rb1 = _mm256_max_ps( rb0, bb[6] ); + const __m256 lb2 = _mm256_max_ps( lb1, bb[2] ), rb2 = _mm256_max_ps( rb1, bb[5] ); + const __m256 lb3 = _mm256_max_ps( lb2, bb[3] ), rb3 = _mm256_max_ps( rb2, bb[4] ); + const unsigned int lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; + const unsigned int rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; + const __m256 lb4 = _mm256_max_ps( lb3, bb[4] ), rb4 = _mm256_max_ps( rb3, bb[3] ); + const __m256 lb5 = _mm256_max_ps( lb4, bb[5] ), rb5 = _mm256_max_ps( rb4, bb[2] ); + const __m256 lb6 = _mm256_max_ps( lb5, bb[6] ), rb6 = _mm256_max_ps( rb5, bb[1] ); + float ANLR3 = 1e30f; PROCESS_PLANE( a, 3, ANLR3, lN3, rN3, lb3, rb3 ); // most likely split + float ANLR2 = 1e30f; PROCESS_PLANE( a, 2, ANLR2, lN2, rN4, lb2, rb4 ); + float ANLR4 = 1e30f; PROCESS_PLANE( a, 4, ANLR4, lN4, rN2, lb4, rb2 ); + float ANLR5 = 1e30f; PROCESS_PLANE( a, 5, ANLR5, lN5, rN1, lb5, rb1 ); + float ANLR1 = 1e30f; PROCESS_PLANE( a, 1, ANLR1, lN1, rN5, lb1, rb5 ); + float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 ); + float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split + } + if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better. + // in-place partition + const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; + unsigned int t, fr = triIdx[src]; + for (unsigned int i = 0; i < node.triCount; i++) + { + const unsigned int bi = (unsigned int)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); + if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t; + } + // create child nodes and recurse + const unsigned int leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + if (leftCount == 0 || rightCount == 0) break; // should not happen. + *(__m256*)& bvhNode[n] = _mm256_xor_ps( bestLBox, signFlip8 ); + bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount; + node.leftFirst = n++, node.triCount = 0, newNodePtr += 2; + *(__m256*)& bvhNode[n] = _mm256_xor_ps( bestRBox, signFlip8 ); + bvhNode[n].leftFirst = j, bvhNode[n].triCount = rightCount; + task[taskCount++] = n, nodeIdx = n - 1; + } + // fetch subdivision task from stack + if (taskCount == 0) break; else nodeIdx = task[--taskCount]; + } +} +#if defined(_MSC_VER) +#pragma warning ( pop ) // restore 4701 +#elif defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop // restore -Wmaybe-uninitialized +#endif + // Intersect a BVH with a ray packet, basic SSE-optimized version. // Note: This yields +10% on 10th gen Intel CPUs, but a small loss on // more recent hardware. This function needs a full conversion to work @@ -1352,52 +1314,100 @@ void BVH::Intersect256RaysSSE( Ray* packet ) const } } -#endif - -// IntersectTri -void BVH::IntersectTri( Ray& ray, const unsigned int idx ) const +// Traverse the second alternative BVH layout (ALT_SOA). +int BVH::Intersect_AltSoA( Ray& ray ) const { - // Moeller-Trumbore ray/triangle intersection algorithm - const unsigned int vertIdx = idx * 3; - const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; - const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; - const bvhvec3 h = cross( ray.D, edge2 ); - const float a = dot( edge1, h ); - if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle - const float f = 1 / a; - const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); - const float u = f * dot( s, h ); - if (u < 0 || u > 1) return; - const bvhvec3 q = cross( s, edge1 ); - const float v = f * dot( ray.D, q ); - if (v < 0 || u + v > 1) return; - const float t = f * dot( edge2, q ); - if (t > 0 && t < ray.hit.t) + assert( alt2Node != 0 ); + BVHNodeAlt2* node = &alt2Node[0], * stack[64]; + unsigned int stackPtr = 0, steps = 0; + const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( ray.rD.x ); + const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y ); + const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z ); + while (1) { - // register a hit: ray is shortened to t - ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx; + steps++; + if (node->isLeaf()) + { + for (unsigned int i = 0; i < node->triCount; i++) + { + const unsigned int tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; + const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) continue; + const float t = f * dot( edge2, q ); + if (t < 0 || t > ray.hit.t) continue; + ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = tidx; + } + if (stackPtr == 0) break; else node = stack[--stackPtr]; + continue; + } + __m128 x4 = _mm_mul_ps( _mm_sub_ps( node->xxxx, Ox4 ), rDx4 ); + __m128 y4 = _mm_mul_ps( _mm_sub_ps( node->yyyy, Oy4 ), rDy4 ); + __m128 z4 = _mm_mul_ps( _mm_sub_ps( node->zzzz, Oz4 ), rDz4 ); + // transpose + __m128 t0 = _mm_unpacklo_ps( x4, y4 ), t2 = _mm_unpacklo_ps( z4, z4 ); + __m128 t1 = _mm_unpackhi_ps( x4, y4 ), t3 = _mm_unpackhi_ps( z4, z4 ); + __m128 xyzw1a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + __m128 xyzw2a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + __m128 xyzw1b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + __m128 xyzw2b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + // process + __m128 tmina4 = _mm_min_ps( xyzw1a, xyzw2a ), tmaxa4 = _mm_max_ps( xyzw1a, xyzw2a ); + __m128 tminb4 = _mm_min_ps( xyzw1b, xyzw2b ), tmaxb4 = _mm_max_ps( xyzw1b, xyzw2b ); + // transpose back + t0 = _mm_unpacklo_ps( tmina4, tmaxa4 ), t2 = _mm_unpacklo_ps( tminb4, tmaxb4 ); + t1 = _mm_unpackhi_ps( tmina4, tmaxa4 ), t3 = _mm_unpackhi_ps( tminb4, tmaxb4 ); + x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() ); + const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) ); + // TODO: use a shuffle here to do the comparison / select with SSE, then extract dist1 and dist2. + const float tmina_0 = LANE( min4, 0 ), tmaxa_1 = LANE( max4, 1 ); + const float tminb_2 = LANE( min4, 2 ), tmaxb_3 = LANE( max4, 3 ); + float dist1 = tmaxa_1 >= tmina_0 ? tmina_0 : 1e30f; + float dist2 = tmaxb_3 >= tminb_2 ? tminb_2 : 1e30f; + unsigned int lidx = node->left, ridx = node->right; + if (dist1 > dist2) + { + float t = dist1; dist1 = dist2; dist2 = t; + unsigned int i = lidx; lidx = ridx; ridx = i; + } + if (dist1 == 1e30f) + { + if (stackPtr == 0) break; else node = stack[--stackPtr]; + } + else + { + node = alt2Node + lidx; + if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx; + } } + return steps; } -// IntersectAABB -float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ) +#else + +int BVH::Intersect_AltSoA( Ray& ray ) const { - // "slab test" ray/AABB intersection - float tx1 = (aabbMin.x - ray.O.x) * ray.rD.x, tx2 = (aabbMax.x - ray.O.x) * ray.rD.x; - float tmin = tinybvh_min( tx1, tx2 ), tmax = tinybvh_max( tx1, tx2 ); - float ty1 = (aabbMin.y - ray.O.y) * ray.rD.y, ty2 = (aabbMax.y - ray.O.y) * ray.rD.y; - tmin = tinybvh_max( tmin, tinybvh_min( ty1, ty2 ) ); - tmax = tinybvh_min( tmax, tinybvh_max( ty1, ty2 ) ); - float tz1 = (aabbMin.z - ray.O.z) * ray.rD.z, tz2 = (aabbMax.z - ray.O.z) * ray.rD.z; - tmin = tinybvh_max( tmin, tinybvh_min( tz1, tz2 ) ); - tmax = tinybvh_min( tmax, tinybvh_max( tz1, tz2 ) ); - if (tmax >= tmin && tmin < ray.hit.t && tmax >= 0) return tmin; else return 1e30f; + // not implemented for platforms that do not support SSE/AVX. + assert( false ); + return 0; } -} // namespace tinybvh +#endif // BVH_USEAVX -#endif +} // namespace tinybvh -#endif +#endif // TINYBVH_IMPLEMENTATION -// EOF +#endif // TINY_BVH_H_ \ No newline at end of file diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index df1bb93..71af684 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -120,8 +120,10 @@ int main() // T I N Y _ B V H P E R F O R M A N C E M E A S U R E M E N T S - int minor = TINY_BVH_VERSION_MINOR, major = TINY_BVH_VERSION_MAJOR; - printf( "tiny_bvh version %i.%i performance statistics\n", major, minor ); + int minor = TINY_BVH_VERSION_MINOR; + int major = TINY_BVH_VERSION_MAJOR; + int sub = TINY_BVH_VERSION_SUB; + printf( "tiny_bvh version %i.%i.%i performance statistics\n", major, minor, sub ); printf( "----------------------------------------------------------------\n" ); Timer t;