diff --git a/tiny_bvh.h b/tiny_bvh.h index a323dc3..716d327 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -120,6 +120,9 @@ THE SOFTWARE. // BVH4 triangle format // #define BVH4_GPU_COMPRESSED_TRIS +// We'll use this whenever a layout has no specialized shadow ray query. +#define FALLBACK_SHADOW_QUERY( s ) { Ray r = s; float d = s.hit.t; Intersect( r ); return r.hit.t < d; } + // include fast AVX BVH builder #if defined(__x86_64__) || defined(_M_X64) || defined(__wasm_simd128__) || defined(__wasm_relaxed_simd__) #define BVH_USEAVX @@ -274,17 +277,9 @@ struct bvhaabb bvhvec3 maxBounds; uint32_t dummy2; }; -/** - * Strided slice of @ref bvhvec4 - */ struct bvhvec4slice { bvhvec4slice() = default; - /** - * @param data Pointer to the first element - * @param count Number of @ref bvhvec4 in the slice, not **bytes** - * @param stride Byte stride between each @ref bvhvec4 element - */ bvhvec4slice( const bvhvec4* data, uint32_t count, uint32_t stride = sizeof( bvhvec4 ) ); operator bool() const { return !!data; } const bvhvec4& operator [] ( size_t i ) const; @@ -547,6 +542,7 @@ class BVHBase void CopyBasePropertiesFrom( const BVHBase& original ); // copy flags from one BVH to another protected: void IntersectTri( Ray& ray, const bvhvec4slice& verts, const uint32_t triIdx ) const; + bool TriOccludes( const Ray& ray, const bvhvec4slice& verts, const uint32_t idx ) const; static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ); static void PrecomputeTriangle( const bvhvec4slice& vert, uint32_t triIndex, float* T ); static float SA( const bvhvec3& aabbMin, const bvhvec3& aabbMax ); @@ -688,15 +684,16 @@ class BVH_GPU : public BVHBase bool isLeaf() const { return triCount > 0; } }; BVH_GPU( BVHContext ctx = {} ) { context = ctx; } - BVH_GPU( const BVH& original ) { ConvertFrom( original ); } + BVH_GPU( const BVH& original ) { /* DEPRICATED */ bvh = original; ConvertFrom( bvh ); } ~BVH_GPU() { AlignedFree( bvhNode ); } + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4slice& vertices ); void ConvertFrom( const BVH& original ); int32_t Intersect( Ray& ray ) const; - bool IsOccluded( const Ray& ray ) const; + bool IsOccluded( const Ray& ray ) const { FALLBACK_SHADOW_QUERY( ray ); } // BVH data BVHNode* bvhNode = 0; // BVH node in Aila & Laine format. - bvhvec4slice verts = {}; // pointer to input primitive array: 3x16 bytes per tri. - uint32_t* triIdx = 0; // primitive index array, pointer copied from original. + BVH bvh; // BVH4 is created from BVH and uses its data. }; class BVH_SoA : public BVHBase @@ -711,18 +708,10 @@ class BVH_SoA : public BVHBase bool isLeaf() const { return triCount > 0; } }; BVH_SoA( BVHContext ctx = {} ) { context = ctx; } - BVH_SoA( const BVH& original ) { ConvertFrom( original ); } - BVH_SoA( const bvhvec4* vertices, const uint32_t primCount ) - { - bvh.BuildDefault( vertices, primCount ); - ConvertFrom( bvh ); - } - BVH_SoA( const bvhvec4slice& vertices ) - { - bvh.BuildDefault( vertices ); - ConvertFrom( bvh ); - } + BVH_SoA( const BVH& original ) { /* DEPRICATED */ bvh = original; ConvertFrom( bvh ); } ~BVH_SoA() { AlignedFree( bvhNode ); } + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4slice& vertices ); void ConvertFrom( const BVH& original ); int32_t Intersect( Ray& ray ) const; bool IsOccluded( const Ray& ray ) const; @@ -782,16 +771,18 @@ class BVH4 : public BVHBase uint32_t childCount, dummy1, dummy2, dummy3; // dummies are for alignment. bool isLeaf() const { return triCount > 0; } }; + BVH4( BVH4&& src ) : bvh( src.bvh ), bvh4Node( src.bvh4Node ) {} BVH4( BVHContext ctx = {} ) { context = ctx; } BVH4( const BVH& original ) { ConvertFrom( original ); } ~BVH4() { AlignedFree( bvh4Node ); } + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4slice& vertices ); void ConvertFrom( const BVH& original ); int32_t Intersect( Ray& ray ) const; - bool IsOccluded( const Ray& ray ) const; + bool IsOccluded( const Ray& ray ) const { FALLBACK_SHADOW_QUERY( ray ); } // BVH data - bvhvec4slice verts = {}; // pointer to input primitive array: 3x16 bytes per tri. - uint32_t* triIdx = 0; // primitive index array - pointer copied from original. BVHNode* bvh4Node = 0; // BVH node for 4-wide BVH. + BVH bvh; // BVH4 is created from BVH and uses its data. }; class BVH8 : public BVHBase @@ -809,17 +800,17 @@ class BVH8 : public BVHBase BVH8( BVHContext ctx = {} ) { context = ctx; } BVH8( const BVH& original ) { ConvertFrom( original ); } ~BVH8() { AlignedFree( bvh8Node ); } + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4slice& vertices ); void ConvertFrom( const BVH& original ); int32_t Intersect( Ray& ray ) const; - bool IsOccluded( const Ray& ray ) const; + bool IsOccluded( const Ray& ray ) const { FALLBACK_SHADOW_QUERY( ray ); } // Helpers void SplitBVH8Leaf( const uint32_t nodeIdx, const uint32_t maxPrims ); // BVH8 data public: - bvhvec4slice verts = {}; // pointer to input primitive array: 3x16 bytes per tri. - Fragment* fragment = 0; // input primitive bounding boxes - Copy from original. - uint32_t* triIdx = 0; // primitive index array - pointer copied from original. BVHNode* bvh8Node = 0; // BVH node for 8-wide BVH. + BVH bvh; // BVH8 is created from BVH and uses its data. }; class BVH4_GPU : public BVHBase @@ -847,13 +838,16 @@ class BVH4_GPU : public BVHBase BVH4_GPU( BVHContext ctx = {} ) { context = ctx; } BVH4_GPU( const BVH4& original ) { ConvertFrom( original ); } ~BVH4_GPU() { AlignedFree( bvh4Data ); } + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4slice& vertices ); void ConvertFrom( const BVH4& original ); int32_t Intersect( Ray& ray ) const; - bool IsOccluded( const Ray& ray ) const; + bool IsOccluded( const Ray& ray ) const { FALLBACK_SHADOW_QUERY( ray ); } // BVH data bvhvec4* bvh4Data = 0; // 64-byte 4-wide BVH node for efficient GPU rendering. uint32_t allocatedBlocks = 0; // node data and triangles are stored in 16-byte blocks. uint32_t usedBlocks = 0; // actually used storage. + BVH4 bvh4; // BVH4_CPU is created from BVH4 and uses its data. }; class BVH4_CPU : public BVHBase @@ -870,14 +864,17 @@ class BVH4_CPU : public BVHBase uint32_t triCount[4]; }; BVH4_CPU( BVHContext ctx = {} ) { context = ctx; } - BVH4_CPU( const BVH4& original ) { ConvertFrom( original ); } + BVH4_CPU( const BVH4& original ) { /* DEPRECATED */ ConvertFrom( original ); } ~BVH4_CPU() { AlignedFree( bvh4Node ); AlignedFree( bvh4Tris ); } + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4slice& vertices ); void ConvertFrom( const BVH4& original ); int32_t Intersect( Ray& ray ) const; bool IsOccluded( const Ray& ray ) const; // BVH data BVHNode* bvh4Node = 0; // 128-byte 4-wide BVH node for efficient CPU rendering. bvhvec4* bvh4Tris = 0; // triangle data for BVHNode4Alt2 nodes. + BVH4 bvh4; // BVH4_CPU is created from BVH4 and uses its data. }; class BVH4_WiVe : public BVHBase @@ -898,6 +895,8 @@ class BVH4_WiVe : public BVHBase }; BVH4_WiVe( BVHContext ctx = {} ) { context = ctx; } ~BVH4_WiVe() { AlignedFree( bvh4Node ); } + BVH4_WiVe( const bvhvec4* vertices, const uint32_t primCount ); + BVH4_WiVe( const bvhvec4slice& vertices ); int32_t Intersect( Ray& ray ) const; bool IsOccluded( const Ray& ray ) const; // BVH4 data @@ -912,14 +911,17 @@ class BVH8_CWBVH : public BVHBase BVH8_CWBVH( BVHContext ctx = {} ) { context = ctx; } BVH8_CWBVH( BVH8& original ) { ConvertFrom( original ); } ~BVH8_CWBVH() { AlignedFree( bvh8Data ); AlignedFree( bvh8Tris ); } + void Build( const bvhvec4* vertices, const uint32_t primCount ); + void Build( const bvhvec4slice& vertices ); void ConvertFrom( BVH8& original ); // NOTE: Not const; this may change some nodes in the original. int32_t Intersect( Ray& ray ) const; - bool IsOccluded( const Ray& ray ) const; + bool IsOccluded( const Ray& ray ) const { FALLBACK_SHADOW_QUERY( ray ); } // BVH8 data bvhvec4* bvh8Data = 0; // nodes in CWBVH format. bvhvec4* bvh8Tris = 0; // triangle data for CWBVH nodes. uint32_t allocatedBlocks; // node data is stored in blocks of 16 byte. uint32_t usedBlocks; // actually used blocks. + BVH8 bvh8; // BVH8_CWBVH is created from BVH8 and uses its data. }; // BLASInstance: A TLAS is built over BLAS instances, where a single BLAS can be @@ -929,9 +931,9 @@ class BLASInstance { public: BLASInstance( BVH* bvh ) : blas( bvh ) {} - void Update(); // Update the world bounds based on the current transform. - BVH* blas = 0; // Bottom-level acceleration structure. - bvhaabb worldBounds; // World-space AABB over the transformed blas root node. + void Update(); // Update the world bounds based on the current transform. + BVH* blas = 0; // Bottom-level acceleration structure. + bvhaabb worldBounds; // World-space AABB over the transformed blas root node. float transform[16] = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1 }; // identity bvhvec3 TransformPoint( const bvhvec3& v ) const; bvhvec3 TransformVector( const bvhvec3& v ) const; @@ -1139,7 +1141,7 @@ void BVH::BuildQuick( const bvhvec4* vertices, const uint32_t primCount ) { // build the BVH with a continuous array of bvhvec4 vertices: // in this case, the stride for the slice is 16 bytes. - BuildQuick( bvhvec4slice{ vertices, primCount * 3, 16U } ); + BuildQuick( bvhvec4slice{ vertices, primCount * 3, sizeof( bvhvec4 ) } ); } void BVH::BuildQuick( const bvhvec4slice& vertices ) { @@ -1233,7 +1235,7 @@ void BVH::Build( const bvhvec4* vertices, const uint32_t primCount ) { // build the BVH with a continuous array of bvhvec4 vertices: // in this case, the stride for the slice is 16 bytes. - Build( bvhvec4slice{ vertices, primCount * 3, 16U } ); + Build( bvhvec4slice{ vertices, primCount * 3, sizeof( bvhvec4 ) } ); } void BVH::Build( const bvhvec4slice& vertices ) { @@ -1386,7 +1388,7 @@ void BVH::Build( const bvhvec4slice& vertices ) // primarily useful for static geometry. void BVH::BuildHQ( const bvhvec4* vertices, const uint32_t primCount ) { - BuildHQ( bvhvec4slice{ vertices, primCount * 3, 16U } ); + BuildHQ( bvhvec4slice{ vertices, primCount * 3, sizeof( bvhvec4 ) } ); } void BVH::BuildHQ( const bvhvec4slice& vertices ) { @@ -2185,6 +2187,16 @@ void BVH_Verbose::MergeLeafs() // BVH_GPU implementation // ---------------------------------------------------------------------------- +void BVH_GPU::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} +void BVH_GPU::Build( const bvhvec4slice& vertices ) +{ + bvh.BuildDefault( vertices ); + ConvertFrom( bvh ); +} + void BVH_GPU::ConvertFrom( const BVH& original ) { // allocate space @@ -2197,8 +2209,6 @@ void BVH_GPU::ConvertFrom( const BVH& original ) } memset( bvhNode, 0, sizeof( BVHNode ) * spaceNeeded ); CopyBasePropertiesFrom( original ); - this->verts = original.verts; - this->triIdx = original.triIdx; // recursively convert nodes uint32_t newNodePtr = 0, nodeIdx = 0, stack[128], stackPtr = 0; while (1) @@ -2232,6 +2242,8 @@ void BVH_GPU::ConvertFrom( const BVH& original ) int32_t BVH_GPU::Intersect( Ray& ray ) const { BVHNode* node = &bvhNode[0], * stack[64]; + const bvhvec4slice& verts = bvh.verts; + const uint32_t* triIdx = bvh.triIdx; uint32_t stackPtr = 0, steps = 0; while (1) { @@ -2272,68 +2284,19 @@ int32_t BVH_GPU::Intersect( Ray& ray ) const return steps; } -bool BVH_GPU::IsOccluded( const Ray& ray ) const -{ - BVHNode* node = &bvhNode[0], * stack[64]; - uint32_t stackPtr = 0; - while (1) - { - if (node->isLeaf()) - { - for (uint32_t i = 0; i < node->triCount; i++) - { - const uint32_t vertIdx = triIdx[node->firstTri + i] * 3; - const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; - const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; - const bvhvec3 h = cross( ray.D, edge2 ); - const float a = dot( edge1, h ); - if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle - const float f = 1 / a; - const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); - const float u = f * dot( s, h ); - if (u < 0 || u > 1) continue; - const bvhvec3 q = cross( s, edge1 ); - const float v = f * dot( ray.D, q ); - if (v < 0 || u + v > 1) continue; - const float t = f * dot( edge2, q ); - if (t > 0 && t < ray.hit.t) return true; - } - if (stackPtr == 0) break; else node = stack[--stackPtr]; - continue; - } - const bvhvec3 lmin = node->lmin - ray.O, lmax = node->lmax - ray.O; - const bvhvec3 rmin = node->rmin - ray.O, rmax = node->rmax - ray.O; - float dist1 = BVH_FAR, dist2 = BVH_FAR; - const bvhvec3 t1a = lmin * ray.rD, t2a = lmax * ray.rD; - const bvhvec3 t1b = rmin * ray.rD, t2b = rmax * ray.rD; - const float tmina = tinybvh_max( tinybvh_max( tinybvh_min( t1a.x, t2a.x ), tinybvh_min( t1a.y, t2a.y ) ), tinybvh_min( t1a.z, t2a.z ) ); - const float tmaxa = tinybvh_min( tinybvh_min( tinybvh_max( t1a.x, t2a.x ), tinybvh_max( t1a.y, t2a.y ) ), tinybvh_max( t1a.z, t2a.z ) ); - const float tminb = tinybvh_max( tinybvh_max( tinybvh_min( t1b.x, t2b.x ), tinybvh_min( t1b.y, t2b.y ) ), tinybvh_min( t1b.z, t2b.z ) ); - const float tmaxb = tinybvh_min( tinybvh_min( tinybvh_max( t1b.x, t2b.x ), tinybvh_max( t1b.y, t2b.y ) ), tinybvh_max( t1b.z, t2b.z ) ); - if (tmaxa >= tmina && tmina < ray.hit.t && tmaxa >= 0) dist1 = tmina; - if (tmaxb >= tminb && tminb < ray.hit.t && tmaxb >= 0) dist2 = tminb; - uint32_t lidx = node->left, ridx = node->right; - if (dist1 > dist2) - { - float t = dist1; dist1 = dist2; dist2 = t; - uint32_t i = lidx; lidx = ridx; ridx = i; - } - if (dist1 == BVH_FAR) - { - if (stackPtr == 0) break; else node = stack[--stackPtr]; - } - else - { - node = bvhNode + lidx; - if (dist2 != BVH_FAR) stack[stackPtr++] = bvhNode + ridx; - } - } - return false; -} - // BVH_SoA implementation // ---------------------------------------------------------------------------- +void BVH_SoA::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} +void BVH_SoA::Build( const bvhvec4slice& vertices ) +{ + bvh.BuildDefault( vertices ); + ConvertFrom( bvh ); +} + void BVH_SoA::ConvertFrom( const BVH& original ) { // allocate space @@ -2384,6 +2347,16 @@ void BVH_SoA::ConvertFrom( const BVH& original ) // BVH4 implementation // ---------------------------------------------------------------------------- +void BVH4::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} +void BVH4::Build( const bvhvec4slice& vertices ) +{ + bvh.BuildDefault( vertices ); + ConvertFrom( bvh ); +} + void BVH4::ConvertFrom( const BVH& original ) { // allocate space @@ -2396,8 +2369,6 @@ void BVH4::ConvertFrom( const BVH& original ) } memset( bvh4Node, 0, sizeof( BVHNode ) * spaceNeeded ); CopyBasePropertiesFrom( original ); - this->verts = original.verts; - this->triIdx = original.triIdx; // create an mbvh node for each bvh2 node for (uint32_t i = 0; i < original.usedNodes; i++) if (i != 1) { @@ -2454,7 +2425,7 @@ int32_t BVH4::Intersect( Ray& ray ) const { steps++; if (node->isLeaf()) for (uint32_t i = 0; i < node->triCount; i++) - IntersectTri( ray, verts, triIdx[node->firstTri + i] ); + IntersectTri( ray, bvh.verts, bvh.triIdx[node->firstTri + i] ); else for (uint32_t i = 0; i < node->childCount; i++) { BVHNode* child = bvh4Node + node->child[i]; @@ -2469,32 +2440,42 @@ int32_t BVH4::Intersect( Ray& ray ) const // BVH4_CPU implementation // ---------------------------------------------------------------------------- -void BVH4_CPU::ConvertFrom( const BVH4& original ) +void BVH4_CPU::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} +void BVH4_CPU::Build( const bvhvec4slice& vertices ) +{ + bvh4.Build( vertices ); + ConvertFrom( bvh4 ); +} + +void BVH4_CPU::ConvertFrom( const BVH4& bvh4 ) { // Convert a 4-wide BVH to a format suitable for CPU traversal. // See Faster Incoherent Ray Traversal Using 8-Wide AVX InstructionsLayout, // Atilla T. Áfra, 2013. - uint32_t spaceNeeded = original.usedNodes; + uint32_t spaceNeeded = bvh4.usedNodes; if (allocatedNodes < spaceNeeded) { AlignedFree( bvh4Node ); AlignedFree( bvh4Tris ); bvh4Node = (BVHNode*)AlignedAlloc( spaceNeeded * sizeof( BVHNode ) ); - bvh4Tris = (bvhvec4*)AlignedAlloc( original.idxCount * 4 * sizeof( bvhvec4 ) ); + bvh4Tris = (bvhvec4*)AlignedAlloc( bvh4.idxCount * 4 * sizeof( bvhvec4 ) ); allocatedNodes = spaceNeeded; } memset( bvh4Node, 0, spaceNeeded * sizeof( BVHNode ) ); - CopyBasePropertiesFrom( original ); + CopyBasePropertiesFrom( bvh4 ); // start conversion uint32_t newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0; while (1) { - const BVH4::BVHNode& orig = original.bvh4Node[nodeIdx]; + const BVH4::BVHNode& orig = bvh4.bvh4Node[nodeIdx]; BVHNode& newNode = bvh4Node[newAlt4Ptr++]; int32_t cidx = 0; for (int32_t i = 0; i < 4; i++) if (orig.child[i]) { - const BVH4::BVHNode& child = original.bvh4Node[orig.child[i]]; + const BVH4::BVHNode& child = bvh4.bvh4Node[orig.child[i]]; ((float*)&newNode.xmin4)[cidx] = child.aabbMin.x; ((float*)&newNode.ymin4)[cidx] = child.aabbMin.y; ((float*)&newNode.zmin4)[cidx] = child.aabbMin.z; @@ -2538,8 +2519,8 @@ void BVH4_CPU::ConvertFrom( const BVH4& original ) // assign vertex data for (uint32_t j = 0; j < count; j++) { - uint32_t fi = original.triIdx[first + j]; - PrecomputeTriangle( original.verts, fi * 3, (float*)&bvh4Tris[triPtr] ); + uint32_t fi = bvh4.bvh.triIdx[first + j]; + PrecomputeTriangle( bvh4.bvh.verts, fi * 3, (float*)&bvh4Tris[triPtr] ); bvh4Tris[triPtr + 3] = bvhvec4( 0, 0, 0, *(float*)&fi ); triPtr += 4; } @@ -2554,7 +2535,17 @@ void BVH4_CPU::ConvertFrom( const BVH4& original ) // BVH4_GPU implementation // ---------------------------------------------------------------------------- -void BVH4_GPU::ConvertFrom( const BVH4& original ) +void BVH4_GPU::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} +void BVH4_GPU::Build( const bvhvec4slice& vertices ) +{ + bvh4.Build( vertices ); + ConvertFrom( bvh4 ); +} + +void BVH4_GPU::ConvertFrom( const BVH4& bvh4 ) { // Convert a 4-wide BVH to a format suitable for GPU traversal. Layout: // offs 0: aabbMin (12 bytes), 4x quantized child xmin (4 bytes) @@ -2564,7 +2555,7 @@ void BVH4_GPU::ConvertFrom( const BVH4& original ) // Leaf: 15 bits for tri count, 16 for offset // Interior: 32 bits for position of child node. // Triangle data ('by value') immediately follows each leaf node. - uint32_t blocksNeeded = original.usedNodes * 4; // here, 'block' is 16 bytes. + uint32_t blocksNeeded = bvh4.usedNodes * 4; // here, 'block' is 16 bytes. blocksNeeded += 6 * triCount; // this layout stores tris in the same buffer. if (allocatedBlocks < blocksNeeded) { @@ -2573,12 +2564,12 @@ void BVH4_GPU::ConvertFrom( const BVH4& original ) allocatedBlocks = blocksNeeded; } memset( bvh4Data, 0, 16 * blocksNeeded ); - CopyBasePropertiesFrom( original ); + CopyBasePropertiesFrom( bvh4 ); // start conversion uint32_t nodeIdx = 0, newAlt4Ptr = 0, stack[128], stackPtr = 0, retValPos = 0; while (1) { - const BVH4::BVHNode& orig = original.bvh4Node[nodeIdx]; + const BVH4::BVHNode& orig = bvh4.bvh4Node[nodeIdx]; // convert BVH4 node - must be an interior node. assert( !orig.isLeaf() ); bvhvec4* nodeBase = bvh4Data + newAlt4Ptr; @@ -2587,8 +2578,8 @@ void BVH4_GPU::ConvertFrom( const BVH4& original ) nodeBase[0] = bvhvec4( orig.aabbMin, 0 ); nodeBase[1] = bvhvec4( (orig.aabbMax - orig.aabbMin) * (1.0f / 255.0f), 0 ); BVH4::BVHNode* childNode[4] = { - &original.bvh4Node[orig.child[0]], &original.bvh4Node[orig.child[1]], - &original.bvh4Node[orig.child[2]], &original.bvh4Node[orig.child[3]] + &bvh4.bvh4Node[orig.child[0]], &bvh4.bvh4Node[orig.child[1]], + &bvh4.bvh4Node[orig.child[2]], &bvh4.bvh4Node[orig.child[3]] }; // start with leaf child node conversion uint32_t childInfo[4] = { 0, 0, 0, 0 }; // will store in final fields later @@ -2599,15 +2590,15 @@ void BVH4_GPU::ConvertFrom( const BVH4& original ) childInfo[i] |= 0x80000000; for (uint32_t j = 0; j < childNode[i]->triCount; j++) { - uint32_t t = original.triIdx[childNode[i]->firstTri + j]; + uint32_t t = bvh4.bvh.triIdx[childNode[i]->firstTri + j]; #ifdef BVH4_GPU_COMPRESSED_TRIS PrecomputeTriangle( verts, t * 3, (float*)&bvh4Alt[newAlt4Ptr] ); bvh4Alt[newAlt4Ptr + 3] = bvhvec4( 0, 0, 0, *(float*)&t ); newAlt4Ptr += 4; #else - bvhvec4 v0 = original.verts[t * 3 + 0]; - bvh4Data[newAlt4Ptr + 1] = original.verts[t * 3 + 1] - v0; - bvh4Data[newAlt4Ptr + 2] = original.verts[t * 3 + 2] - v0; + bvhvec4 v0 = bvh4.bvh.verts[t * 3 + 0]; + bvh4Data[newAlt4Ptr + 1] = bvh4.bvh.verts[t * 3 + 1] - v0; + bvh4Data[newAlt4Ptr + 2] = bvh4.bvh.verts[t * 3 + 2] - v0; v0.w = *(float*)&t; // as_float bvh4Data[newAlt4Ptr + 0] = v0; newAlt4Ptr += 3; @@ -2786,6 +2777,16 @@ int32_t BVH4_GPU::Intersect( Ray& ray ) const // BVH8 implementation // ---------------------------------------------------------------------------- +void BVH8::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} +void BVH8::Build( const bvhvec4slice& vertices ) +{ + bvh.BuildDefault( vertices ); + ConvertFrom( bvh ); +} + void BVH8::ConvertFrom( const BVH& original ) { // allocate space @@ -2801,9 +2802,6 @@ void BVH8::ConvertFrom( const BVH& original ) } memset( bvh8Node, 0, sizeof( BVHNode ) * spaceNeeded ); CopyBasePropertiesFrom( original ); - this->verts = original.verts; - this->fragment = original.fragment; - this->triIdx = original.triIdx; // create an mbvh node for each bvh2 node for (uint32_t i = 0; i < original.usedNodes; i++) if (i != 1) { @@ -2857,6 +2855,9 @@ void BVH8::ConvertFrom( const BVH& original ) void BVH8::SplitBVH8Leaf( const uint32_t nodeIdx, const uint32_t maxPrims ) { float fragMinFix = frag_min_flipped ? -1.0f : 1.0f; + const bvhvec4slice& verts = bvh.verts; + const uint32_t* triIdx = bvh.triIdx; + const Fragment* fragment = bvh.fragment; BVHNode& node = bvh8Node[nodeIdx]; if (node.triCount <= maxPrims) return; // also catches interior nodes // place all primitives in a new node and make this the first child of 'node' @@ -2892,6 +2893,8 @@ void BVH8::SplitBVH8Leaf( const uint32_t nodeIdx, const uint32_t maxPrims ) int32_t BVH8::Intersect( Ray& ray ) const { BVHNode* node = &bvh8Node[0], * stack[512]; + const bvhvec4slice& verts = bvh.verts; + const uint32_t* triIdx = bvh.triIdx; uint32_t stackPtr = 0, steps = 0; while (1) { @@ -2912,30 +2915,40 @@ int32_t BVH8::Intersect( Ray& ray ) const // BVH8_CWBVH implementation // ---------------------------------------------------------------------------- -void BVH8_CWBVH::ConvertFrom( BVH8& original ) +void BVH8_CWBVH::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +} +void BVH8_CWBVH::Build( const bvhvec4slice& vertices ) +{ + bvh8.Build( vertices ); + ConvertFrom( bvh8 ); +} + +void BVH8_CWBVH::ConvertFrom( BVH8& bvh8 ) { // Convert a BVH8 to the format specified in: "Efficient Incoherent Ray // Traversal on GPUs Through Compressed Wide BVHs", Ylitie et al. 2017. // Adapted from code by "AlanWBFT". - FATAL_ERROR_IF( original.bvh8Node[0].isLeaf(), "BVH8_CWBVH::ConvertFrom( .. ), converting a single-node bvh." ); + FATAL_ERROR_IF( bvh8.bvh8Node[0].isLeaf(), "BVH8_CWBVH::ConvertFrom( .. ), converting a single-node bvh." ); // allocate memory // Note: This can be far lower (specifically: usedNodes) if we know that // none of the BVH8 leafs has more than three primitives. // Without this guarantee, the only safe upper limit is triCount * 2, since // we will be splitting fat BVH8 leafs to as we go. - uint32_t spaceNeeded = original.triCount * 2 * 5; // CWBVH nodes use 80 bytes each. + uint32_t spaceNeeded = bvh8.triCount * 2 * 5; // CWBVH nodes use 80 bytes each. if (spaceNeeded > allocatedBlocks) { bvh8Data = (bvhvec4*)AlignedAlloc( spaceNeeded * 16 ); - bvh8Tris = (bvhvec4*)AlignedAlloc( original.idxCount * 4 * 16 ); + bvh8Tris = (bvhvec4*)AlignedAlloc( bvh8.idxCount * 4 * 16 ); allocatedBlocks = spaceNeeded; } memset( bvh8Data, 0, spaceNeeded * 16 ); - memset( bvh8Tris, 0, original.idxCount * 3 * 16 ); - CopyBasePropertiesFrom( original ); + memset( bvh8Tris, 0, bvh8.idxCount * 3 * 16 ); + CopyBasePropertiesFrom( bvh8 ); BVH8::BVHNode* stackNodePtr[256]; uint32_t stackNodeAddr[256], stackPtr = 1, nodeDataPtr = 5, triDataPtr = 0; - stackNodePtr[0] = &original.bvh8Node[0], stackNodeAddr[0] = 0; + stackNodePtr[0] = &bvh8.bvh8Node[0], stackNodeAddr[0] = 0; // start conversion while (stackPtr > 0) { @@ -2957,8 +2970,8 @@ void BVH8_CWBVH::ConvertFrom( BVH8& original ) ); for (int32_t i = 0; i < 8; i++) if (orig->child[i] == 0) cost[s][i] = BVH_FAR; else { - BVH8::BVHNode* const child = &original.bvh8Node[orig->child[i]]; - if (child->triCount > 3 /* must be leaf */) original.SplitBVH8Leaf( orig->child[i], 3 ); + BVH8::BVHNode* const child = &bvh8.bvh8Node[orig->child[i]]; + if (child->triCount > 3 /* must be leaf */) bvh8.SplitBVH8Leaf( orig->child[i], 3 ); bvhvec3 childCentroid = (child->aabbMin + child->aabbMax) * 0.5f; cost[s][i] = dot( childCentroid - nodeCentroid, ds ); } @@ -2990,7 +3003,7 @@ void BVH8_CWBVH::ConvertFrom( BVH8& original ) for (int32_t i = 0; i < 8; i++) { if (orig->child[i] == 0) continue; - BVH8::BVHNode* const child = &original.bvh8Node[orig->child[i]]; + BVH8::BVHNode* const child = &bvh8.bvh8Node[orig->child[i]]; const int32_t qlox = (int32_t)floorf( (child->aabbMin.x - nodeLo.x) / powf( 2, (float)ex ) ); const int32_t qloy = (int32_t)floorf( (child->aabbMin.y - nodeLo.y) / powf( 2, (float)ey ) ); const int32_t qloz = (int32_t)floorf( (child->aabbMin.z - nodeLo.z) / powf( 2, (float)ez ) ); @@ -3024,17 +3037,17 @@ void BVH8_CWBVH::ConvertFrom( BVH8& original ) leafChildTriCount += tcount; for (uint32_t j = 0; j < tcount; j++) { - int32_t primitiveIndex = original.triIdx[child->firstTri + j]; + int32_t primitiveIndex = bvh8.bvh.triIdx[child->firstTri + j]; #ifdef CWBVH_COMPRESSED_TRIS PrecomputeTriangle( verts, +primitiveIndex * 3, (float*)&bvh8Tris[triDataPtr] ); bvh8Tris[triDataPtr + 3] = bvhvec4( 0, 0, 0, *(float*)&primitiveIndex ); triDataPtr += 4; #else - bvhvec4 t = original.verts[primitiveIndex * 3 + 0]; + bvhvec4 t = bvh8.bvh.verts[primitiveIndex * 3 + 0]; t.w = *(float*)&primitiveIndex; bvh8Tris[triDataPtr++] = t; - bvh8Tris[triDataPtr++] = original.verts[primitiveIndex * 3 + 1]; - bvh8Tris[triDataPtr++] = original.verts[primitiveIndex * 3 + 2]; + bvh8Tris[triDataPtr++] = bvh8.bvh.verts[primitiveIndex * 3 + 1]; + bvh8Tris[triDataPtr++] = bvh8.bvh.verts[primitiveIndex * 3 + 2]; #endif } } @@ -3101,7 +3114,7 @@ void BVH::BuildAVX( const bvhvec4* vertices, const uint32_t primCount ) { // build the BVH with a continuous array of bvhvec4 vertices: // in this case, the stride for the slice is 16 bytes. - BuildAVX( bvhvec4slice{ vertices, primCount * 3, 16U } ); + BuildAVX( bvhvec4slice{ vertices, primCount * 3, sizeof( bvhvec4 ) } ); } void BVH::BuildAVX( const bvhvec4slice& vertices ) { @@ -4201,7 +4214,7 @@ void BVH::BuildNEON( const bvhvec4* vertices, const uint32_t primCount ) { // build the BVH with a continuous array of bvhvec4 vertices: // in this case, the stride for the slice is 16 bytes. - BuildNEON( bvhvec4slice{ vertices, primCount * 3, 16U } ); + BuildNEON( bvhvec4slice{ vertices, primCount * 3, sizeof( bvhvec4 ) } ); } void BVH::BuildNEON( const bvhvec4slice& vertices ) { @@ -5089,6 +5102,27 @@ void BVHBase::IntersectTri( Ray& ray, const bvhvec4slice& verts, const uint32_t } } +// IntersectTri +bool BVHBase::TriOccludes( const Ray& ray, const bvhvec4slice& verts, const uint32_t idx ) const +{ + // Moeller-Trumbore ray/triangle intersection algorithm + const uint32_t vertIdx = idx * 3; + const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; + const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) return false; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) return false; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) return false; + const float t = f * dot( edge2, q ); + return t > 0 && t < ray.hit.t; +} + // IntersectAABB float BVHBase::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ) { diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp index f970bd3..8522812 100644 --- a/tiny_bvh_fenster.cpp +++ b/tiny_bvh_fenster.cpp @@ -11,7 +11,7 @@ using namespace tinybvh; -BVH_SoA bvh; +BVH bvh; #ifdef LOADSCENE bvhvec4* triangles = 0; @@ -72,7 +72,7 @@ void Init() #endif // build a BVH over the scene - bvh = BVH_SoA( triangles, verts / 3 ); + bvh.Build( triangles, verts / 3 ); // load camera position / direction from file std::fstream t = std::fstream{ "camera.bin", t.binary | t.in }; diff --git a/tiny_bvh_pt.cpp b/tiny_bvh_pt.cpp index b216575..9b533b6 100644 --- a/tiny_bvh_pt.cpp +++ b/tiny_bvh_pt.cpp @@ -21,11 +21,9 @@ using namespace tinybvh; // Application variables #if defined BVH_USEAVX || defined BVH_USENEON -static BVH bvh_build; -static BVH4 bvh4; -static BVH4_CPU bvh_trace; // this is the one we will use for tracing +static BVH4_CPU bvh; #else -static BVH bvh_build, bvh_trace; +static BVH bvh; #endif static bvhvec4* tris = 0; static int triCount = 0, frameIdx = 0, spp = 0; @@ -100,13 +98,7 @@ void Init() AddMesh( "./testdata/suzanne.bin", 0.2f, bvhvec3( -18, 0.95f, -16 ), 0x90ff90 ); AddMesh( "./testdata/head.bin", 0.5f, bvhvec3( 0, 3, 9 ) ); // build bvh - bvh_build.BuildAVX( tris, triCount ); -#if defined BVH_USEAVX || defined BVH_USENEON - bvh4.ConvertFrom( bvh_build ); - bvh_trace.ConvertFrom( bvh4 ); -#else - bvh_trace = bvh; -#endif + bvh.Build( tris, triCount ); // load camera position / direction from file std::fstream t = std::fstream{ "camera.bin", t.binary | t.in }; if (!t.is_open()) return; @@ -142,7 +134,7 @@ bool UpdateCamera( float delta_time_s, fenster& f ) bvhvec3 Trace( Ray ray, unsigned& seed, unsigned depth = 0 ) { // find primary intersection - bvh_trace.Intersect( ray ); + bvh.Intersect( ray ); // shade if (ray.hit.t == 1e30f) return bvhvec3( 0.6f, 0.7f, 1 ); // hit nothing bvhvec3 I = ray.O + ray.hit.t * ray.D; @@ -155,10 +147,10 @@ bvhvec3 Trace( Ray ray, unsigned& seed, unsigned depth = 0 ) bvhvec3 direct = {}, indirect = {}; float NdotL = dot( N, L ), NLdotL = fabs( dot( L, bvhvec3( 0, 1, 0 ) ) ); if (NdotL > 0) - if (!bvh_trace.IsOccluded( Ray( I + L * 0.001f, L, dist ) )) + if (!bvh.IsOccluded( Ray( I + L * 0.001f, L, dist ) )) direct = BRDF * NdotL * NLdotL * bvhvec3( 9, 9, 8 ) * 500 * (1.0f / (dist * dist)); // random bounce - if (depth < 4) + if (depth < 2) { bvhvec3 R = CosWeightedDiffReflection( N, seed ); float pdf = 1.0f / dot( N, R ); diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index 6cf5d8a..905546d 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -15,7 +15,7 @@ // tests to perform // #define BUILD_MIDPOINT #define BUILD_REFERENCE -#define BUILD_DOUBLE +// #define BUILD_DOUBLE #define BUILD_AVX // #define BUILD_NEON // #define BUILD_SBVH @@ -581,7 +581,11 @@ int main() #ifdef TRAVERSE_ALT2WAY_ST // GPU - if (!bvh_gpu) bvh_gpu = new BVH_GPU( *bvh ); + if (!bvh_gpu) + { + bvh_gpu = new BVH_GPU(); + bvh_gpu->Build( triangles, verts / 3 ); + } printf( "- AILA_LAINE - primary: " ); traceTime = TestPrimaryRays( _GPU2, smallBatch, Nsmall, 3 ); ValidateTraceResult( smallBatch, refDist, Nsmall, __LINE__ ); @@ -594,7 +598,11 @@ int main() #if defined TRAVERSE_SOA2WAY_ST && defined BVH_USEAVX // BVH_SoA::IsOccluded is not available for NEON yet. // SOA - if (!bvh_soa) bvh_soa = new BVH_SoA( *bvh ); + if (!bvh_soa) + { + bvh_soa = new BVH_SoA(); + bvh_soa->Build( triangles, verts / 3 ); + } printf( "- ALT_SOA - primary: " ); traceTime = TestPrimaryRays( _SOA, smallBatch, Nsmall, 3 ); ValidateTraceResult( smallBatch, refDist, Nsmall, __LINE__ ); @@ -607,8 +615,11 @@ int main() #ifdef TRAVERSE_4WAY // BVH4_CPU - if (!bvh4) bvh4 = new BVH4( *bvh ); - if (!bvh4_cpu) bvh4_cpu = new BVH4_CPU( *bvh4 ); + if (!bvh4_cpu) + { + bvh4_cpu = new BVH4_CPU(); + bvh4_cpu->Build( triangles, verts / 3 ); + } printf( "- BVH4_AFRA - primary: " ); traceTime = TestPrimaryRays( _CPU4, smallBatch, Nsmall, 3 ); ValidateTraceResult( smallBatch, refDist, Nsmall, __LINE__ ); @@ -631,8 +642,11 @@ int main() #ifdef TRAVERSE_CWBVH // CWBVH - Not efficient on CPU. - if (!bvh8) bvh8 = new BVH8( *bvh ); - if (!cwbvh) cwbvh = new BVH8_CWBVH( *bvh8 ); + if (!bvh8_cwbvh) + { + bvh8_cwbvh = new BVH8_CWBVH(); + bvh8_cwbvh->Build( triangles, verts / 3 ); + } printf( "- BVH8/CWBVH - primary: " ); traceTime = TestPrimaryRays( _CWBVH, smallBatch, Nsmall, 3 ); ValidateTraceResult( smallBatch, refDist, Nsmall, __LINE__ ); @@ -643,7 +657,11 @@ int main() #ifdef TRAVERSE_BVH4 // Basic BVH4 - Basic implementation, not efficient on CPU. - if (!bvh4) bvh4 = new BVH4( *bvh ); + if (!bvh4) + { + bvh4 = new BVH4(); + bvh4->Build( triangles, verts / 3 ); + } printf( "- BASIC_BVH4 - primary: " ); traceTime = TestPrimaryRays( _BVH4, smallBatch, Nsmall, 3 ); ValidateTraceResult( smallBatch, refDist, Nsmall, __LINE__ ); @@ -654,7 +672,11 @@ int main() #ifdef TRAVERSE_BVH8 // Basic BVH8 - Basic implementation, not efficient on CPU. - if (!bvh8) bvh8 = new BVH8( *bvh ); + if (!bvh8) + { + bvh8 = new BVH8(); + bvh8->Build( triangles, verts / 3 ); + } printf( "- BASIC_BVH8 - primary: " ); traceTime = TestPrimaryRays( _BVH8, smallBatch, Nsmall, 3 ); ValidateTraceResult( smallBatch, refDist, Nsmall, __LINE__ ); @@ -676,7 +698,11 @@ int main() #ifdef TRAVERSE_OPTIMIZED_ST // ALT_SOA - if (!bvh_soa) bvh_soa = new BVH_SoA( *bvh ); + if (!bvh_soa) + { + bvh_soa = new BVH_SoA(); + bvh_soa->Build( triangles, verts / 3 ); + } printf( "- ALT_SOA - primary: " ); traceTime = TestPrimaryRays( _SOA, smallBatch, Nsmall, 3 ); ValidateTraceResult( smallBatch, refDist, Nsmall, __LINE__ ); @@ -689,8 +715,11 @@ int main() #ifdef TRAVERSE_4WAY_OPTIMIZED // BVH4_AFRA - if (!bvh4) bvh4 = new BVH4( *bvh ); - if (!bvh4_cpu) bvh4_cpu = new BVH4_CPU( *bvh4 ); + if (!bvh4_cpu) + { + bvh4_cpu = new BVH4_CPU(); + bvh4_cpu->Build( triangles, verts / 3 ); + } printf( "- BVH4_AFRA - primary: " ); traceTime = TestPrimaryRays( _CPU4, smallBatch, Nsmall, 3 ); ValidateTraceResult( smallBatch, refDist, Nsmall, __LINE__ ); @@ -721,10 +750,14 @@ int main() // trace the rays on GPU using OpenCL printf( "- AILA_LAINE - primary: " ); - if (!bvh_gpu) bvh_gpu = new BVH_GPU( *bvh ); + if (!bvh_gpu) + { + bvh_gpu = new BVH_GPU(); + bvh_gpu->Build( triangles, verts / 3 ); + } // create OpenCL buffers for the BVH data calculated by tiny_bvh.h tinyocl::Buffer gpuNodes( bvh_gpu->usedNodes * sizeof( BVH_GPU::BVHNode ), bvh_gpu->bvhNode ); - tinyocl::Buffer idxData( bvh_gpu->idxCount * sizeof( unsigned ), bvh_gpu->triIdx ); + tinyocl::Buffer idxData( bvh_gpu->idxCount * sizeof( unsigned ), bvh_gpu->bvh.triIdx ); tinyocl::Buffer triData( bvh_gpu->triCount * 3 * sizeof( tinybvh::bvhvec4 ), triangles ); // synchronize the host-side data to the gpu side gpuNodes.CopyToDevice(); @@ -763,8 +796,11 @@ int main() // trace the rays on GPU using OpenCL printf( "- BVH4_GPU - primary: " ); - if (!bvh4) bvh4 = new BVH4( *bvh ); - if (!bvh4_gpu) bvh4_gpu = new BVH4_GPU( *bvh4 ); + if (!bvh4_gpu) + { + bvh4_gpu = new BVH4_GPU(); + bvh4_gpu->Build( triangles, verts / 3 ); + } // create OpenCL buffers for the BVH data calculated by tiny_bvh.h tinyocl::Buffer gpu4Nodes( bvh4_gpu->usedBlocks * sizeof( tinybvh::bvhvec4 ), bvh4_gpu->bvh4Data ); // synchronize the host-side data to the gpu side @@ -804,8 +840,11 @@ int main() // trace the rays on GPU using OpenCL printf( "- BVH8/CWBVH - primary: " ); - if (!bvh8) bvh8 = new BVH8( *bvh ); - if (!cwbvh) cwbvh = new BVH8_CWBVH( *bvh8 ); + if (!cwbvh) + { + cwbvh = new BVH8_CWBVH(); + cwbvh->Build( triangles, verts / 3 ); + } // create OpenCL buffers for the BVH data calculated by tiny_bvh.h tinyocl::Buffer cwbvhNodes( cwbvh->usedBlocks * sizeof( tinybvh::bvhvec4 ), cwbvh->bvh8Data ); #ifdef CWBVH_COMPRESSED_TRIS