Skip to content

Commit

Permalink
Fixed serious build bug caused by C_INT / C_TRAV.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Nov 24, 2024
1 parent c4e47db commit e39b237
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 45 deletions.
172 changes: 140 additions & 32 deletions tiny_bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ THE SOFTWARE.
// library version
#define TINY_BVH_VERSION_MAJOR 0
#define TINY_BVH_VERSION_MINOR 9
#define TINY_BVH_VERSION_SUB 6
#define TINY_BVH_VERSION_SUB 7

// ============================================================================
//
Expand Down Expand Up @@ -374,9 +374,14 @@ class BVH
VERBOSE, // For BVH optimizing. Obtained by converting WALD_32BYTE.
BASIC_BVH4, // Input for BVH4_GPU conversion. Obtained by converting WALD_32BYTE.
BVH4_GPU, // For fast GPU rendering. Obtained by converting BASIC_BVH4.
BVH4_AFRA, // For fast CPU rendering. Obtained by converting BASIC_BVH4.
BASIC_BVH8, // Input for CWBVH. Obtained by converting WALD_32BYTE.
CWBVH // Fastest GPU rendering. Obtained by converting BASIC_BVH8.
};
enum TraceDevice {
USE_CPU = 1,
USE_GPU
};
enum BuildFlags {
NONE = 0, // Default building behavior (binned, SAH-driven).
FULLSPLIT = 1 // Split as far as possible, even when SAH doesn't agree.
Expand All @@ -390,7 +395,6 @@ class BVH
bool isLeaf() const { return triCount > 0; /* empty BVH leaves do not exist */ }
float Intersect( const Ray& ray ) const { return BVH::IntersectAABB( ray, aabbMin, aabbMax ); }
float SurfaceArea() const { return BVH::SA( aabbMin, aabbMax ); }
float CalculateNodeCost() const { return SurfaceArea() * triCount * C_INT; }
};
struct BVHNodeAlt
{
Expand Down Expand Up @@ -449,6 +453,16 @@ class BVH
// there is no way we can shave off a full 16 bytes, unless aabbExt is stored
// as chars as well, as in CWBVH.
};
struct BVHNode4Alt2
{
// 4-way BVH node, optimized for CPU rendering.
// Based on: "Faster Incoherent Ray Traversal Using 8-Wide AVX Instructions",
// Áfra, 2013.
SIMDVEC4 xmin4, ymin4, zmin4;
SIMDVEC4 xmax4, ymax4, zmax4;
unsigned childFirst[4];
unsigned triCount[4];
};
struct BVHNode8
{
// 8-wide (aka 'shallow') BVH layout.
Expand Down Expand Up @@ -489,7 +503,7 @@ class BVH
allocatedAlt2Nodes = 0;
allocatedVerbose = 0;
allocatedBVH4Nodes = 0;
allocatedAlt4Blocks = 0;
allocatedAlt4aBlocks = 0;
allocatedBVH8Nodes = 0;
}
float SAHCost( const unsigned nodeIdx = 0 ) const
Expand Down Expand Up @@ -517,14 +531,17 @@ class BVH
void BuildNEON( const bvhvec4* vertices, const unsigned primCount );
#endif
void Convert( const BVHLayout from, const BVHLayout to, const bool deleteOriginal = false );
void SplitLeafs(); // operates on VERBOSE layout
void SplitLeafs( const unsigned maxPrims = 1 ); // operates on VERBOSE layout
void SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims = 1 ); // operates on BVH8 layout
void MergeLeafs(); // operates on VERBOSE layout
void Optimize( const unsigned iterations, const bool convertBack = true ); // operates on VERBOSE
void Refit( const BVHLayout layout = WALD_32BYTE, const unsigned nodeIdx = 0 );
int Intersect( Ray& ray, const BVHLayout layout = WALD_32BYTE ) const;
bool IsOccluded( const Ray& ray, const BVHLayout layout = WALD_32BYTE ) const;
void BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout = WALD_32BYTE ) const;
void BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout = WALD_32BYTE ) const;
void BatchIntersect( Ray* rayBatch, const unsigned N,
const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const;
void BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result,
const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const;
void Intersect256Rays( Ray* first ) const;
void Intersect256RaysSSE( Ray* packet ) const; // requires BVH_USEAVX
private:
Expand Down Expand Up @@ -564,6 +581,7 @@ class BVH
BVHNodeVerbose* verbose = 0; // BVH node with additional info, for BVH optimizer.
BVHNode4* bvh4Node = 0; // BVH node for 4-wide BVH.
bvhvec4* bvh4Alt = 0; // 64-byte 4-wide BVH node for efficient GPU rendering.
BVHNode4Alt2* bvh4Alt2 = 0; // 64-byte 4-wide BVH node for efficient CPU rendering.
BVHNode8* bvh8Node = 0; // BVH node for 8-wide BVH.
bvhvec4* bvh8Compact = 0; // Nodes in CWBVH format.
bvhvec4* bvh8Tris = 0; // Triangle data for CWBVH nodes.
Expand All @@ -579,15 +597,17 @@ class BVH
unsigned allocatedAlt2Nodes = 0;
unsigned allocatedVerbose = 0;
unsigned allocatedBVH4Nodes = 0;
unsigned allocatedAlt4Blocks = 0;
unsigned allocatedAlt4aBlocks = 0;
unsigned allocatedAlt4bNodes = 0;
unsigned allocatedBVH8Nodes = 0;
unsigned allocatedCWBVHBlocks = 0;
unsigned usedBVHNodes = 0;
unsigned usedAltNodes = 0;
unsigned usedAlt2Nodes = 0;
unsigned usedVerboseNodes = 0;
unsigned usedBVH4Nodes = 0;
unsigned usedAlt4Blocks = 0;
unsigned usedAlt4aBlocks = 0;
unsigned usedAlt4bNodes = 0;
unsigned usedBVH8Nodes = 0;
unsigned usedCWBVHBlocks = 0;
};
Expand Down Expand Up @@ -686,7 +706,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount )
binMax[2][bi.z] = tinybvh_max( binMax[2][bi.z], fragment[fi].bmax ), count[2][bi.z]++;
}
// calculate per-split totals
float splitCost = 1e30f;
float splitCost = 1e30f, rSAV = 1.0f / node.SurfaceArea();
unsigned bestAxis = 0, bestPos = 0;
for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a])
{
Expand All @@ -706,15 +726,16 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount )
// evaluate bin totals to find best position for object split
for (unsigned i = 0; i < BVHBINS - 1; i++)
{
const float C = C_TRAV + C_INT * (ANL[i] + ANR[i]);
const float C = C_TRAV + rSAV * C_INT * (ANL[i] + ANR[i]);
if (C < splitCost)
{
splitCost = C, bestAxis = a, bestPos = i;
bestLMin = lBMin[i], bestRMin = rBMin[i], bestLMax = lBMax[i], bestRMax = rBMax[i];
}
}
}
if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
float noSplitCost = (float)node.triCount * C_INT;
if (splitCost >= noSplitCost) break; // not splitting is better.
// in-place partition
unsigned j = node.leftFirst + node.triCount, src = node.leftFirst;
const float rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis];
Expand Down Expand Up @@ -824,7 +845,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount )
binMax[2][bi.z] = tinybvh_max( binMax[2][bi.z], fragment[fi].bmax ), count[2][bi.z]++;
}
// calculate per-split totals
float splitCost = 1e30f;
float splitCost = 1e30f, rSAV = 1.0f / node.SurfaceArea();
unsigned bestAxis = 0, bestPos = 0;
for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
{
Expand All @@ -844,7 +865,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount )
// evaluate bin totals to find best position for object split
for (unsigned i = 0; i < BVHBINS - 1; i++)
{
const float C = C_TRAV + C_INT * (ANL[i] + ANR[i]);
const float C = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]);
if (C < splitCost)
{
splitCost = C, bestAxis = a, bestPos = i;
Expand Down Expand Up @@ -908,7 +929,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount )
// find best position for spatial split
for (unsigned i = 0; i < BVHBINS - 1; i++)
{
const float Cspatial = C_TRAV + C_INT * (ANL[i] + ANR[i]);
const float Cspatial = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]);
if (Cspatial < splitCost && NL[i] + NR[i] < budget)
{
spatial = true, splitCost = Cspatial, bestAxis = a, bestPos = i;
Expand All @@ -919,7 +940,8 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount )
}
}
// terminate recursion
if (splitCost >= node.CalculateNodeCost()) break;
float noSplitCost = (float)node.triCount * C_INT;
if (splitCost >= noSplitCost) break; // not splitting is better.
// double-buffered partition
unsigned A = sliceStart, B = sliceEnd, src = node.leftFirst;
if (spatial)
Expand Down Expand Up @@ -1228,15 +1250,15 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
// Leaf: 15 bits for tri count, 16 for offset
// Interior: 32 bits for position of child node.
// Triangle data ('by value') immediately follows each leaf node.
unsigned blocksNeeded = usedBVHNodes * 4; // here, 'block' is 16 bytes.
unsigned blocksNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes.
blocksNeeded += 6 * triCount; // this layout stores tris in the same buffer.
if (allocatedAlt4Blocks < blocksNeeded)
if (allocatedAlt4aBlocks < blocksNeeded)
{
AlignedFree( bvh4Alt );
assert( sizeof( BVHNode4Alt ) == 64 );
assert( bvh4Node != 0 );
bvh4Alt = (bvhvec4*)AlignedAlloc( blocksNeeded * 16 );
allocatedAlt4Blocks = blocksNeeded;
allocatedAlt4aBlocks = blocksNeeded;
}
memset( bvh4Alt, 0, 16 * blocksNeeded );
// start conversion
Expand Down Expand Up @@ -1337,7 +1359,57 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
usedAlt4Blocks = newAlt4Ptr;
usedAlt4aBlocks = newAlt4Ptr;
}
else if (from == BASIC_BVH4 && to == BVH4_AFRA)
{
#if 0 // under construction
// Convert a 4-wide BVH to a format suitable for CPU traversal.
// See Faster Incoherent Ray Traversal Using 8-Wide AVX InstructionsLayout,
// Atilla T. Áfra, 2013.
unsigned spaceNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes.
if (allocatedAlt4bNodes < spaceNeeded)
{
AlignedFree( bvh4Alt2 );
assert( sizeof( BVHNode4Alt2 ) == 64 );
assert( bvh4Node != 0 );
bvh4Alt2 = (BVHNode4Alt2*)AlignedAlloc( spaceNeeded * sizeof( BVHNode4Alt2 ) );
allocatedAlt4bNodes = spaceNeeded;
}
memset( bvh4Alt2, 0, spaceNeeded * sizeof( BVHNode4Alt2 ) );
// start conversion
unsigned newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0;
while (1)
{
const BVHNode4& orig = bvh4Node[nodeIdx];
BVHNode4Alt2& newNode = bvh4Alt2[newAlt4Ptr++];
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif
for (int cidx = 0, i = 0; i < 4; i++) if (orig.child[i])
{
const BVHNode4& child = bvh4Node[orig.child[i]];
((float*)&newNode.xmin4)[cidx] = child.aabbMin.x, ((float*)&newNode.ymin4)[cidx] = child.aabbMin.y;
((float*)&newNode.zmin4)[cidx] = child.aabbMin.z, ((float*)&newNode.xmax4)[cidx] = child.aabbMax.x;
((float*)&newNode.ymax4)[cidx] = child.aabbMax.y, ((float*)&newNode.zmax4)[cidx] = child.aabbMax.z;
if (child.isLeaf())
newNode.childFirst[cidx] = orig.firstTri,
newNode.triCount[cidx] = orig.triCount;
else
stack[stackPtr++] = (unsigned)((float*)&newNode.childFirst[cidx] - (float*)bvh4Alt2),
stack[stackPtr++] = orig.child[i];
cidx++;
}
// pop next task
if (!stackPtr) break;
nodeIdx = stack[--stackPtr], ((float*)bvh4Alt2)[stack[--stackPtr]] = newAlt4Ptr;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
}
usedAlt4bNodes = newAlt4Ptr;
#endif
}
else if (from == WALD_32BYTE && to == BASIC_BVH8)
{
Expand Down Expand Up @@ -1438,6 +1510,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
for (int i = 0; i < 8; i++) if (node->child[i] == 0) cost[s][i] = 1e30f; else
{
BVHNode8* const child = &bvh8Node[node->child[i]];
// if (child->triCount > 3 /* must be leaf */) SplitBVH8Leaf( node->child[i], 3 );
bvhvec3 childCentroid = (child->aabbMin + child->aabbMax) * 0.5f;
cost[s][i] = dot( childCentroid - nodeCentroid, ds );
}
Expand Down Expand Up @@ -1719,7 +1792,7 @@ void BVH::Compact( const BVHLayout layout )
// necessary to have a single primitive per leaf, it will yield a slightly better
// optimized BVH. The leafs of the optimized BVH should be collapsed ('MergeLeafs')
// to obtain the final tree.
void BVH::SplitLeafs()
void BVH::SplitLeafs( const unsigned maxPrims )
{
unsigned nodeIdx = 0, stack[64], stackPtr = 0;
float fragMinFix = frag_min_flipped ? -1.0f : 1.0f;
Expand All @@ -1729,7 +1802,7 @@ void BVH::SplitLeafs()
if (!node.isLeaf()) nodeIdx = node.left, stack[stackPtr++] = node.right; else
{
// split this leaf
if (node.triCount > 1)
if (node.triCount > maxPrims)
{
const unsigned newIdx1 = usedVerboseNodes++;
const unsigned newIdx2 = usedVerboseNodes++;
Expand Down Expand Up @@ -1760,6 +1833,42 @@ void BVH::SplitLeafs()
}
}

// SplitBVH8Leaf: CWBVH requires that a leaf has no more than 3 primitives,
// but regular BVH construction does not guarantee this. So, here we split
// busy leafs recursively in multiple leaves, until the requirement is met.
void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims )
{
float fragMinFix = frag_min_flipped ? -1.0f : 1.0f;
BVHNode8& node = bvh8Node[nodeIdx];
if (node.triCount <= maxPrims) return; // also catches interior nodes
// place all primitives in a new node and make this the first child of 'node'
BVHNode8& firstChild = bvh8Node[node.child[0] = usedBVH8Nodes++];
firstChild.triCount = node.triCount;
firstChild.firstTri = node.firstTri;
unsigned nextChild = 1;
// share with new sibling nodes
while (firstChild.triCount > maxPrims && nextChild < 8)
{
BVHNode8& child = bvh8Node[node.child[nextChild] = usedBVH8Nodes++];
firstChild.triCount -= maxPrims, child.triCount = maxPrims;
child.firstTri = firstChild.firstTri + firstChild.triCount;
}
for ( unsigned i = 0; i < nextChild; i++ )
{
BVHNode8& child = bvh8Node[node.child[i]];
child.aabbMin = bvhvec3( 1e30f ), child.aabbMax = bvhvec3( -1e30f );
for ( unsigned j = 0; j < child.triCount; j++ )
{
unsigned fi = triIdx[child.firstTri + i];
child.aabbMin = tinybvh_min( child.aabbMin, fragment[fi].bmin * fragMinFix );
child.aabbMax = tinybvh_max( child.aabbMax, fragment[fi].bmax );
}
}
node.triCount = 0;
// recurse; should be rare
if (firstChild.triCount > maxPrims) SplitBVH8Leaf( node.child[0], maxPrims );
}

// MergeLeafs: After optimizing a BVH, single-primitive leafs should be merged whenever
// SAH indicates this is an improvement.
void BVH::MergeLeafs()
Expand Down Expand Up @@ -2001,10 +2110,7 @@ int BVH::Intersect( Ray& ray, const BVHLayout layout ) const
return 0;
}

// Intersect a buffer of rays with the scene.
// For now this exists only to establish the interface.
// A future implementation will exploit the batch to trace the rays faster.
void BVH::BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout ) const
void BVH::BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout, const TraceDevice device ) const
{
for (unsigned i = 0; i < N; i++) Intersect( rayBatch[i], layout );
}
Expand Down Expand Up @@ -2038,7 +2144,7 @@ bool BVH::IsOccluded( const Ray& ray, const BVHLayout layout ) const
// A future implementation will exploit the batch to trace the rays faster.
// BatchIsOccluded returns the hits as a bit array in result:
// Each unsigned integer in this array stores 32 hits.
void BVH::BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout ) const
void BVH::BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout, const TraceDevice device ) const
{
unsigned words = (N + 31 /* round up */) / 32;
memset( result, 0, words * 4 );
Expand Down Expand Up @@ -2580,7 +2686,7 @@ inline float halfArea( const __m256& a /* a contains aabb itself, with min.xyz n
}
#define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \
ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; \
const float C = C_TRAV + C_INT * ANLR; if (C < splitCost) \
const float C = C_TRAV + C_INT * rSAV * ANLR; if (C < splitCost) \
splitCost = C, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; }
#if defined(_MSC_VER)
#pragma warning ( push )
Expand Down Expand Up @@ -2684,7 +2790,7 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount )
r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f );
binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2;
// calculate per-split totals
float splitCost = 1e30f;
float splitCost = 1e30f, rSAV = 1.0f / node.SurfaceArea();
unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst;
const __m256* bb = binbox;
for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
Expand All @@ -2711,7 +2817,8 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount )
float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 );
float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split
}
if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
float noSplitCost = (float)node.triCount * C_INT;
if (splitCost >= noSplitCost) break; // not splitting is better.
// in-place partition
const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis];
unsigned t, fr = triIdx[src];
Expand Down Expand Up @@ -3248,7 +3355,7 @@ inline float halfArea( const float32x4x2_t& a /* a contains aabb itself, with mi
}
#define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \
ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; \
const float C = C_TRAV + C_INT * ANLR; if (C < splitCost) \
const float C = C_TRAV + C_INT * rSAV * ANLR; if (C < splitCost) \
splitCost = C, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; }

void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount )
Expand Down Expand Up @@ -3350,7 +3457,7 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount )
r2 = vmaxq_f32x2( b2, f );
binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2;
// calculate per-split totals
float splitCost = 1e30f;
float splitCost = 1e30f, rSAV = 1.0f / node.SurfaceArea();
unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst;
const float32x4x2_t* bb = binbox;
for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
Expand All @@ -3377,7 +3484,8 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount )
float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 );
float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split
}
if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
float noSplitCost = (float)node.triCount * C_INT;
if (splitCost >= noSplitCost) break; // not splitting is better.
// in-place partition
const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis];
unsigned t, fr = triIdx[src];
Expand Down
Loading

0 comments on commit e39b237

Please sign in to comment.