From e39b23782e62abe58ef28f0ffadbabcfcb33f4e5 Mon Sep 17 00:00:00 2001 From: Jacco Bikker Date: Sun, 24 Nov 2024 21:08:25 +0100 Subject: [PATCH] Fixed serious build bug caused by C_INT / C_TRAV. --- tiny_bvh.h | 172 +++++++++++++++++++++++++++++++++-------- tiny_bvh_fenster.cpp | 29 ++++--- tiny_bvh_speedtest.cpp | 2 +- 3 files changed, 158 insertions(+), 45 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index fb584a0..aacc5ec 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -99,7 +99,7 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 0 #define TINY_BVH_VERSION_MINOR 9 -#define TINY_BVH_VERSION_SUB 6 +#define TINY_BVH_VERSION_SUB 7 // ============================================================================ // @@ -374,9 +374,14 @@ class BVH VERBOSE, // For BVH optimizing. Obtained by converting WALD_32BYTE. BASIC_BVH4, // Input for BVH4_GPU conversion. Obtained by converting WALD_32BYTE. BVH4_GPU, // For fast GPU rendering. Obtained by converting BASIC_BVH4. + BVH4_AFRA, // For fast CPU rendering. Obtained by converting BASIC_BVH4. BASIC_BVH8, // Input for CWBVH. Obtained by converting WALD_32BYTE. CWBVH // Fastest GPU rendering. Obtained by converting BASIC_BVH8. }; + enum TraceDevice { + USE_CPU = 1, + USE_GPU + }; enum BuildFlags { NONE = 0, // Default building behavior (binned, SAH-driven). FULLSPLIT = 1 // Split as far as possible, even when SAH doesn't agree. @@ -390,7 +395,6 @@ class BVH bool isLeaf() const { return triCount > 0; /* empty BVH leaves do not exist */ } float Intersect( const Ray& ray ) const { return BVH::IntersectAABB( ray, aabbMin, aabbMax ); } float SurfaceArea() const { return BVH::SA( aabbMin, aabbMax ); } - float CalculateNodeCost() const { return SurfaceArea() * triCount * C_INT; } }; struct BVHNodeAlt { @@ -449,6 +453,16 @@ class BVH // there is no way we can shave off a full 16 bytes, unless aabbExt is stored // as chars as well, as in CWBVH. }; + struct BVHNode4Alt2 + { + // 4-way BVH node, optimized for CPU rendering. + // Based on: "Faster Incoherent Ray Traversal Using 8-Wide AVX Instructions", + // Áfra, 2013. + SIMDVEC4 xmin4, ymin4, zmin4; + SIMDVEC4 xmax4, ymax4, zmax4; + unsigned childFirst[4]; + unsigned triCount[4]; + }; struct BVHNode8 { // 8-wide (aka 'shallow') BVH layout. @@ -489,7 +503,7 @@ class BVH allocatedAlt2Nodes = 0; allocatedVerbose = 0; allocatedBVH4Nodes = 0; - allocatedAlt4Blocks = 0; + allocatedAlt4aBlocks = 0; allocatedBVH8Nodes = 0; } float SAHCost( const unsigned nodeIdx = 0 ) const @@ -517,14 +531,17 @@ class BVH void BuildNEON( const bvhvec4* vertices, const unsigned primCount ); #endif void Convert( const BVHLayout from, const BVHLayout to, const bool deleteOriginal = false ); - void SplitLeafs(); // operates on VERBOSE layout + void SplitLeafs( const unsigned maxPrims = 1 ); // operates on VERBOSE layout + void SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims = 1 ); // operates on BVH8 layout void MergeLeafs(); // operates on VERBOSE layout void Optimize( const unsigned iterations, const bool convertBack = true ); // operates on VERBOSE void Refit( const BVHLayout layout = WALD_32BYTE, const unsigned nodeIdx = 0 ); int Intersect( Ray& ray, const BVHLayout layout = WALD_32BYTE ) const; bool IsOccluded( const Ray& ray, const BVHLayout layout = WALD_32BYTE ) const; - void BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout = WALD_32BYTE ) const; - void BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout = WALD_32BYTE ) const; + void BatchIntersect( Ray* rayBatch, const unsigned N, + const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const; + void BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, + const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const; void Intersect256Rays( Ray* first ) const; void Intersect256RaysSSE( Ray* packet ) const; // requires BVH_USEAVX private: @@ -564,6 +581,7 @@ class BVH BVHNodeVerbose* verbose = 0; // BVH node with additional info, for BVH optimizer. BVHNode4* bvh4Node = 0; // BVH node for 4-wide BVH. bvhvec4* bvh4Alt = 0; // 64-byte 4-wide BVH node for efficient GPU rendering. + BVHNode4Alt2* bvh4Alt2 = 0; // 64-byte 4-wide BVH node for efficient CPU rendering. BVHNode8* bvh8Node = 0; // BVH node for 8-wide BVH. bvhvec4* bvh8Compact = 0; // Nodes in CWBVH format. bvhvec4* bvh8Tris = 0; // Triangle data for CWBVH nodes. @@ -579,7 +597,8 @@ class BVH unsigned allocatedAlt2Nodes = 0; unsigned allocatedVerbose = 0; unsigned allocatedBVH4Nodes = 0; - unsigned allocatedAlt4Blocks = 0; + unsigned allocatedAlt4aBlocks = 0; + unsigned allocatedAlt4bNodes = 0; unsigned allocatedBVH8Nodes = 0; unsigned allocatedCWBVHBlocks = 0; unsigned usedBVHNodes = 0; @@ -587,7 +606,8 @@ class BVH unsigned usedAlt2Nodes = 0; unsigned usedVerboseNodes = 0; unsigned usedBVH4Nodes = 0; - unsigned usedAlt4Blocks = 0; + unsigned usedAlt4aBlocks = 0; + unsigned usedAlt4bNodes = 0; unsigned usedBVH8Nodes = 0; unsigned usedCWBVHBlocks = 0; }; @@ -686,7 +706,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) binMax[2][bi.z] = tinybvh_max( binMax[2][bi.z], fragment[fi].bmax ), count[2][bi.z]++; } // calculate per-split totals - float splitCost = 1e30f; + float splitCost = 1e30f, rSAV = 1.0f / node.SurfaceArea(); unsigned bestAxis = 0, bestPos = 0; for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a]) { @@ -706,7 +726,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) // evaluate bin totals to find best position for object split for (unsigned i = 0; i < BVHBINS - 1; i++) { - const float C = C_TRAV + C_INT * (ANL[i] + ANR[i]); + const float C = C_TRAV + rSAV * C_INT * (ANL[i] + ANR[i]); if (C < splitCost) { splitCost = C, bestAxis = a, bestPos = i; @@ -714,7 +734,8 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount ) } } } - if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better. + float noSplitCost = (float)node.triCount * C_INT; + if (splitCost >= noSplitCost) break; // not splitting is better. // in-place partition unsigned j = node.leftFirst + node.triCount, src = node.leftFirst; const float rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis]; @@ -824,7 +845,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) binMax[2][bi.z] = tinybvh_max( binMax[2][bi.z], fragment[fi].bmax ), count[2][bi.z]++; } // calculate per-split totals - float splitCost = 1e30f; + float splitCost = 1e30f, rSAV = 1.0f / node.SurfaceArea(); unsigned bestAxis = 0, bestPos = 0; for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { @@ -844,7 +865,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) // evaluate bin totals to find best position for object split for (unsigned i = 0; i < BVHBINS - 1; i++) { - const float C = C_TRAV + C_INT * (ANL[i] + ANR[i]); + const float C = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]); if (C < splitCost) { splitCost = C, bestAxis = a, bestPos = i; @@ -908,7 +929,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) // find best position for spatial split for (unsigned i = 0; i < BVHBINS - 1; i++) { - const float Cspatial = C_TRAV + C_INT * (ANL[i] + ANR[i]); + const float Cspatial = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]); if (Cspatial < splitCost && NL[i] + NR[i] < budget) { spatial = true, splitCost = Cspatial, bestAxis = a, bestPos = i; @@ -919,7 +940,8 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) } } // terminate recursion - if (splitCost >= node.CalculateNodeCost()) break; + float noSplitCost = (float)node.triCount * C_INT; + if (splitCost >= noSplitCost) break; // not splitting is better. // double-buffered partition unsigned A = sliceStart, B = sliceEnd, src = node.leftFirst; if (spatial) @@ -1228,15 +1250,15 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr // Leaf: 15 bits for tri count, 16 for offset // Interior: 32 bits for position of child node. // Triangle data ('by value') immediately follows each leaf node. - unsigned blocksNeeded = usedBVHNodes * 4; // here, 'block' is 16 bytes. + unsigned blocksNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes. blocksNeeded += 6 * triCount; // this layout stores tris in the same buffer. - if (allocatedAlt4Blocks < blocksNeeded) + if (allocatedAlt4aBlocks < blocksNeeded) { AlignedFree( bvh4Alt ); assert( sizeof( BVHNode4Alt ) == 64 ); assert( bvh4Node != 0 ); bvh4Alt = (bvhvec4*)AlignedAlloc( blocksNeeded * 16 ); - allocatedAlt4Blocks = blocksNeeded; + allocatedAlt4aBlocks = blocksNeeded; } memset( bvh4Alt, 0, 16 * blocksNeeded ); // start conversion @@ -1337,7 +1359,57 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr #ifdef __GNUC__ #pragma GCC diagnostic pop #endif - usedAlt4Blocks = newAlt4Ptr; + usedAlt4aBlocks = newAlt4Ptr; + } + else if (from == BASIC_BVH4 && to == BVH4_AFRA) + { + #if 0 // under construction + // Convert a 4-wide BVH to a format suitable for CPU traversal. + // See Faster Incoherent Ray Traversal Using 8-Wide AVX InstructionsLayout, + // Atilla T. Áfra, 2013. + unsigned spaceNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes. + if (allocatedAlt4bNodes < spaceNeeded) + { + AlignedFree( bvh4Alt2 ); + assert( sizeof( BVHNode4Alt2 ) == 64 ); + assert( bvh4Node != 0 ); + bvh4Alt2 = (BVHNode4Alt2*)AlignedAlloc( spaceNeeded * sizeof( BVHNode4Alt2 ) ); + allocatedAlt4bNodes = spaceNeeded; + } + memset( bvh4Alt2, 0, spaceNeeded * sizeof( BVHNode4Alt2 ) ); + // start conversion + unsigned newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0; + while (1) + { + const BVHNode4& orig = bvh4Node[nodeIdx]; + BVHNode4Alt2& newNode = bvh4Alt2[newAlt4Ptr++]; + #ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" + #endif + for (int cidx = 0, i = 0; i < 4; i++) if (orig.child[i]) + { + const BVHNode4& child = bvh4Node[orig.child[i]]; + ((float*)&newNode.xmin4)[cidx] = child.aabbMin.x, ((float*)&newNode.ymin4)[cidx] = child.aabbMin.y; + ((float*)&newNode.zmin4)[cidx] = child.aabbMin.z, ((float*)&newNode.xmax4)[cidx] = child.aabbMax.x; + ((float*)&newNode.ymax4)[cidx] = child.aabbMax.y, ((float*)&newNode.zmax4)[cidx] = child.aabbMax.z; + if (child.isLeaf()) + newNode.childFirst[cidx] = orig.firstTri, + newNode.triCount[cidx] = orig.triCount; + else + stack[stackPtr++] = (unsigned)((float*)&newNode.childFirst[cidx] - (float*)bvh4Alt2), + stack[stackPtr++] = orig.child[i]; + cidx++; + } + // pop next task + if (!stackPtr) break; + nodeIdx = stack[--stackPtr], ((float*)bvh4Alt2)[stack[--stackPtr]] = newAlt4Ptr; + #ifdef __GNUC__ + #pragma GCC diagnostic pop + #endif + } + usedAlt4bNodes = newAlt4Ptr; + #endif } else if (from == WALD_32BYTE && to == BASIC_BVH8) { @@ -1438,6 +1510,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr for (int i = 0; i < 8; i++) if (node->child[i] == 0) cost[s][i] = 1e30f; else { BVHNode8* const child = &bvh8Node[node->child[i]]; + // if (child->triCount > 3 /* must be leaf */) SplitBVH8Leaf( node->child[i], 3 ); bvhvec3 childCentroid = (child->aabbMin + child->aabbMax) * 0.5f; cost[s][i] = dot( childCentroid - nodeCentroid, ds ); } @@ -1719,7 +1792,7 @@ void BVH::Compact( const BVHLayout layout ) // necessary to have a single primitive per leaf, it will yield a slightly better // optimized BVH. The leafs of the optimized BVH should be collapsed ('MergeLeafs') // to obtain the final tree. -void BVH::SplitLeafs() +void BVH::SplitLeafs( const unsigned maxPrims ) { unsigned nodeIdx = 0, stack[64], stackPtr = 0; float fragMinFix = frag_min_flipped ? -1.0f : 1.0f; @@ -1729,7 +1802,7 @@ void BVH::SplitLeafs() if (!node.isLeaf()) nodeIdx = node.left, stack[stackPtr++] = node.right; else { // split this leaf - if (node.triCount > 1) + if (node.triCount > maxPrims) { const unsigned newIdx1 = usedVerboseNodes++; const unsigned newIdx2 = usedVerboseNodes++; @@ -1760,6 +1833,42 @@ void BVH::SplitLeafs() } } +// SplitBVH8Leaf: CWBVH requires that a leaf has no more than 3 primitives, +// but regular BVH construction does not guarantee this. So, here we split +// busy leafs recursively in multiple leaves, until the requirement is met. +void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims ) +{ + float fragMinFix = frag_min_flipped ? -1.0f : 1.0f; + BVHNode8& node = bvh8Node[nodeIdx]; + if (node.triCount <= maxPrims) return; // also catches interior nodes + // place all primitives in a new node and make this the first child of 'node' + BVHNode8& firstChild = bvh8Node[node.child[0] = usedBVH8Nodes++]; + firstChild.triCount = node.triCount; + firstChild.firstTri = node.firstTri; + unsigned nextChild = 1; + // share with new sibling nodes + while (firstChild.triCount > maxPrims && nextChild < 8) + { + BVHNode8& child = bvh8Node[node.child[nextChild] = usedBVH8Nodes++]; + firstChild.triCount -= maxPrims, child.triCount = maxPrims; + child.firstTri = firstChild.firstTri + firstChild.triCount; + } + for ( unsigned i = 0; i < nextChild; i++ ) + { + BVHNode8& child = bvh8Node[node.child[i]]; + child.aabbMin = bvhvec3( 1e30f ), child.aabbMax = bvhvec3( -1e30f ); + for ( unsigned j = 0; j < child.triCount; j++ ) + { + unsigned fi = triIdx[child.firstTri + i]; + child.aabbMin = tinybvh_min( child.aabbMin, fragment[fi].bmin * fragMinFix ); + child.aabbMax = tinybvh_max( child.aabbMax, fragment[fi].bmax ); + } + } + node.triCount = 0; + // recurse; should be rare + if (firstChild.triCount > maxPrims) SplitBVH8Leaf( node.child[0], maxPrims ); +} + // MergeLeafs: After optimizing a BVH, single-primitive leafs should be merged whenever // SAH indicates this is an improvement. void BVH::MergeLeafs() @@ -2001,10 +2110,7 @@ int BVH::Intersect( Ray& ray, const BVHLayout layout ) const return 0; } -// Intersect a buffer of rays with the scene. -// For now this exists only to establish the interface. -// A future implementation will exploit the batch to trace the rays faster. -void BVH::BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout ) const +void BVH::BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout, const TraceDevice device ) const { for (unsigned i = 0; i < N; i++) Intersect( rayBatch[i], layout ); } @@ -2038,7 +2144,7 @@ bool BVH::IsOccluded( const Ray& ray, const BVHLayout layout ) const // A future implementation will exploit the batch to trace the rays faster. // BatchIsOccluded returns the hits as a bit array in result: // Each unsigned integer in this array stores 32 hits. -void BVH::BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout ) const +void BVH::BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout, const TraceDevice device ) const { unsigned words = (N + 31 /* round up */) / 32; memset( result, 0, words * 4 ); @@ -2580,7 +2686,7 @@ inline float halfArea( const __m256& a /* a contains aabb itself, with min.xyz n } #define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \ ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; \ - const float C = C_TRAV + C_INT * ANLR; if (C < splitCost) \ + const float C = C_TRAV + C_INT * rSAV * ANLR; if (C < splitCost) \ splitCost = C, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; } #if defined(_MSC_VER) #pragma warning ( push ) @@ -2684,7 +2790,7 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f ); binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; // calculate per-split totals - float splitCost = 1e30f; + float splitCost = 1e30f, rSAV = 1.0f / node.SurfaceArea(); unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; const __m256* bb = binbox; for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) @@ -2711,7 +2817,8 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount ) float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 ); float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split } - if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better. + float noSplitCost = (float)node.triCount * C_INT; + if (splitCost >= noSplitCost) break; // not splitting is better. // in-place partition const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; unsigned t, fr = triIdx[src]; @@ -3248,7 +3355,7 @@ inline float halfArea( const float32x4x2_t& a /* a contains aabb itself, with mi } #define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \ ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; \ - const float C = C_TRAV + C_INT * ANLR; if (C < splitCost) \ + const float C = C_TRAV + C_INT * rSAV * ANLR; if (C < splitCost) \ splitCost = C, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; } void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) @@ -3350,7 +3457,7 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) r2 = vmaxq_f32x2( b2, f ); binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; // calculate per-split totals - float splitCost = 1e30f; + float splitCost = 1e30f, rSAV = 1.0f / node.SurfaceArea(); unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; const float32x4x2_t* bb = binbox; for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) @@ -3377,7 +3484,8 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 ); float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split } - if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better. + float noSplitCost = (float)node.triCount * C_INT; + if (splitCost >= noSplitCost) break; // not splitting is better. // in-place partition const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; unsigned t, fr = triIdx[src]; diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp index 9f5db15..357ab8f 100644 --- a/tiny_bvh_fenster.cpp +++ b/tiny_bvh_fenster.cpp @@ -1,7 +1,7 @@ #include "external/fenster.h" // https://github.com/zserge/fenster // #define USE_EMBREE // enable to verify correct implementation, win64 only for now. -#define LOADSPONZA +#define LOADSCENE #define TINYBVH_IMPLEMENTATION #include "tiny_bvh.h" @@ -19,8 +19,9 @@ void embreeError( void* userPtr, enum RTCError error, const char* str ) BVH bvh; #endif -#ifdef LOADSPONZA +#ifdef LOADSCENE bvhvec4* triangles = 0; +const char scene[] = "happybuddha.bin"; #include #else ALIGNED( 16 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{}; @@ -51,14 +52,16 @@ void sphere_flake( float x, float y, float z, float s, int d = 0 ) void Init() { -#ifdef LOADSPONZA +#ifdef LOADSCENE // load raw vertex data for Crytek's Sponza - std::string filename{ "../testdata/cryteksponza.bin" }; + std::string filename{ "../testdata/" }; + filename += scene; std::fstream s{ filename, s.binary | s.in }; if (!s.is_open()) { // try again, look in .\testdata - std::string filename{ "./testdata/cryteksponza.bin" }; + std::string filename{ "./testdata/" }; + filename += scene; s = std::fstream{ filename, s.binary | s.in }; assert( s.is_open() ); } @@ -94,8 +97,9 @@ void Init() // build a BVH over the scene #if defined(BVH_USEAVX) - bvh.BuildAVX( triangles, verts / 3 ); - bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE ); + bvh.BuildHQ( triangles, verts / 3 ); + bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH8 ); + bvh.Convert( BVH::BASIC_BVH8, BVH::CWBVH ); #elif defined(BVH_USENEON) bvh.BuildNEON( triangles, verts / 3 ); #else @@ -110,8 +114,9 @@ void Tick( uint32_t* buf ) { // setup view pyramid for a pinhole camera: // eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) -#ifdef LOADSPONZA - bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) ); +#ifdef LOADSCENE + // bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) ); + bvhvec3 eye( 0, 13, 30 ), view = normalize( bvhvec3( 0, 0.01f, -1 ) ); #else bvhvec3 eye( -3.5f, -1.5f, -6.5f ), view = normalize( bvhvec3( 3, 1.5f, 5 ) ); #endif @@ -140,7 +145,9 @@ void Tick( uint32_t* buf ) } // trace primary rays -#if defined(USE_EMBREE) +#if !defined USE_EMBREE + for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::CWBVH ); +#else struct RTCRayHit rayhit; for (int i = 0; i < N; i++) { @@ -152,8 +159,6 @@ void Tick( uint32_t* buf ) rays[i].hit.u = rayhit.hit.u, rays[i].hit.u = rayhit.hit.v; rays[i].hit.prim = rayhit.hit.primID, rays[i].hit.t = rayhit.ray.tfar; } -#else - for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::AILA_LAINE ); #endif // visualize result diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index 8d6bda1..1cc927e 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -395,7 +395,7 @@ int main() bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_GPU ); // create OpenCL buffers for the BVH data calculated by tiny_bvh.h - tinyocl::Buffer gpu4Nodes( bvh.usedAlt4Blocks * sizeof( tinybvh::bvhvec4 ), bvh.bvh4Alt ); + tinyocl::Buffer gpu4Nodes( bvh.usedAlt4aBlocks * sizeof( tinybvh::bvhvec4 ), bvh.bvh4Alt ); // synchronize the host-side data to the gpu side gpu4Nodes.CopyToDevice(); #ifndef GPU_2WAY // otherwise these already exist.