From e39b23782e62abe58ef28f0ffadbabcfcb33f4e5 Mon Sep 17 00:00:00 2001
From: Jacco Bikker <bikker.j@gmail.com>
Date: Sun, 24 Nov 2024 21:08:25 +0100
Subject: [PATCH] Fixed serious build bug caused by C_INT / C_TRAV.

---
 tiny_bvh.h             | 172 +++++++++++++++++++++++++++++++++--------
 tiny_bvh_fenster.cpp   |  29 ++++---
 tiny_bvh_speedtest.cpp |   2 +-
 3 files changed, 158 insertions(+), 45 deletions(-)

diff --git a/tiny_bvh.h b/tiny_bvh.h
index fb584a0..aacc5ec 100644
--- a/tiny_bvh.h
+++ b/tiny_bvh.h
@@ -99,7 +99,7 @@ THE SOFTWARE.
 // library version
 #define TINY_BVH_VERSION_MAJOR	0
 #define TINY_BVH_VERSION_MINOR	9
-#define TINY_BVH_VERSION_SUB	6
+#define TINY_BVH_VERSION_SUB	7
 
 // ============================================================================
 //
@@ -374,9 +374,14 @@ class BVH
 		VERBOSE,			// For BVH optimizing. Obtained by converting WALD_32BYTE.
 		BASIC_BVH4,			// Input for BVH4_GPU conversion. Obtained by converting WALD_32BYTE.
 		BVH4_GPU,			// For fast GPU rendering. Obtained by converting BASIC_BVH4.
+		BVH4_AFRA,			// For fast CPU rendering. Obtained by converting BASIC_BVH4.
 		BASIC_BVH8,			// Input for CWBVH. Obtained by converting WALD_32BYTE.
 		CWBVH				// Fastest GPU rendering. Obtained by converting BASIC_BVH8.
 	};
+	enum TraceDevice {
+		USE_CPU = 1,
+		USE_GPU
+	};
 	enum BuildFlags {
 		NONE = 0,			// Default building behavior (binned, SAH-driven).
 		FULLSPLIT = 1		// Split as far as possible, even when SAH doesn't agree.
@@ -390,7 +395,6 @@ class BVH
 		bool isLeaf() const { return triCount > 0; /* empty BVH leaves do not exist */ }
 		float Intersect( const Ray& ray ) const { return BVH::IntersectAABB( ray, aabbMin, aabbMax ); }
 		float SurfaceArea() const { return BVH::SA( aabbMin, aabbMax ); }
-		float CalculateNodeCost() const { return SurfaceArea() * triCount * C_INT; }
 	};
 	struct BVHNodeAlt
 	{
@@ -449,6 +453,16 @@ class BVH
 		// there is no way we can shave off a full 16 bytes, unless aabbExt is stored
 		// as chars as well, as in CWBVH.
 	};
+	struct BVHNode4Alt2
+	{
+		// 4-way BVH node, optimized for CPU rendering.
+		// Based on: "Faster Incoherent Ray Traversal Using 8-Wide AVX Instructions",
+		// Áfra, 2013.
+		SIMDVEC4 xmin4, ymin4, zmin4;
+		SIMDVEC4 xmax4, ymax4, zmax4;
+		unsigned childFirst[4];
+		unsigned triCount[4];
+	};
 	struct BVHNode8
 	{
 		// 8-wide (aka 'shallow') BVH layout.  
@@ -489,7 +503,7 @@ class BVH
 		allocatedAlt2Nodes = 0;
 		allocatedVerbose = 0;
 		allocatedBVH4Nodes = 0;
-		allocatedAlt4Blocks = 0;
+		allocatedAlt4aBlocks = 0;
 		allocatedBVH8Nodes = 0;
 	}
 	float SAHCost( const unsigned nodeIdx = 0 ) const
@@ -517,14 +531,17 @@ class BVH
 	void BuildNEON( const bvhvec4* vertices, const unsigned primCount );
 #endif
 	void Convert( const BVHLayout from, const BVHLayout to, const bool deleteOriginal = false );
-	void SplitLeafs(); // operates on VERBOSE layout
+	void SplitLeafs( const unsigned maxPrims = 1 ); // operates on VERBOSE layout
+	void SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims = 1 ); // operates on BVH8 layout
 	void MergeLeafs(); // operates on VERBOSE layout
 	void Optimize( const unsigned iterations, const bool convertBack = true ); // operates on VERBOSE
 	void Refit( const BVHLayout layout = WALD_32BYTE, const unsigned nodeIdx = 0 );
 	int Intersect( Ray& ray, const BVHLayout layout = WALD_32BYTE ) const;
 	bool IsOccluded( const Ray& ray, const BVHLayout layout = WALD_32BYTE ) const;
-	void BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout = WALD_32BYTE ) const;
-	void BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout = WALD_32BYTE ) const;
+	void BatchIntersect( Ray* rayBatch, const unsigned N,
+		const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const;
+	void BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result,
+		const BVHLayout layout = WALD_32BYTE, const TraceDevice device = USE_CPU ) const;
 	void Intersect256Rays( Ray* first ) const;
 	void Intersect256RaysSSE( Ray* packet ) const; // requires BVH_USEAVX
 private:
@@ -564,6 +581,7 @@ class BVH
 	BVHNodeVerbose* verbose = 0;	// BVH node with additional info, for BVH optimizer.
 	BVHNode4* bvh4Node = 0;			// BVH node for 4-wide BVH.
 	bvhvec4* bvh4Alt = 0;			// 64-byte 4-wide BVH node for efficient GPU rendering.
+	BVHNode4Alt2* bvh4Alt2 = 0;		// 64-byte 4-wide BVH node for efficient CPU rendering.
 	BVHNode8* bvh8Node = 0;			// BVH node for 8-wide BVH.
 	bvhvec4* bvh8Compact = 0;		// Nodes in CWBVH format.
 	bvhvec4* bvh8Tris = 0;			// Triangle data for CWBVH nodes.
@@ -579,7 +597,8 @@ class BVH
 	unsigned allocatedAlt2Nodes = 0;
 	unsigned allocatedVerbose = 0;
 	unsigned allocatedBVH4Nodes = 0;
-	unsigned allocatedAlt4Blocks = 0;
+	unsigned allocatedAlt4aBlocks = 0;
+	unsigned allocatedAlt4bNodes = 0;
 	unsigned allocatedBVH8Nodes = 0;
 	unsigned allocatedCWBVHBlocks = 0;
 	unsigned usedBVHNodes = 0;
@@ -587,7 +606,8 @@ class BVH
 	unsigned usedAlt2Nodes = 0;
 	unsigned usedVerboseNodes = 0;
 	unsigned usedBVH4Nodes = 0;
-	unsigned usedAlt4Blocks = 0;
+	unsigned usedAlt4aBlocks = 0;
+	unsigned usedAlt4bNodes = 0;
 	unsigned usedBVH8Nodes = 0;
 	unsigned usedCWBVHBlocks = 0;
 };
@@ -686,7 +706,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount )
 				binMax[2][bi.z] = tinybvh_max( binMax[2][bi.z], fragment[fi].bmax ), count[2][bi.z]++;
 			}
 			// calculate per-split totals
-			float splitCost = 1e30f;
+			float splitCost = 1e30f, rSAV =  1.0f / node.SurfaceArea();
 			unsigned bestAxis = 0, bestPos = 0;
 			for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim[a])
 			{
@@ -706,7 +726,7 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount )
 				// evaluate bin totals to find best position for object split
 				for (unsigned i = 0; i < BVHBINS - 1; i++)
 				{
-					const float C = C_TRAV + C_INT * (ANL[i] + ANR[i]);
+					const float C = C_TRAV + rSAV * C_INT * (ANL[i] + ANR[i]);
 					if (C < splitCost)
 					{
 						splitCost = C, bestAxis = a, bestPos = i;
@@ -714,7 +734,8 @@ void BVH::Build( const bvhvec4* vertices, const unsigned primCount )
 					}
 				}
 			}
-			if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
+			float noSplitCost = (float)node.triCount * C_INT;
+			if (splitCost >= noSplitCost) break; // not splitting is better.
 			// in-place partition
 			unsigned j = node.leftFirst + node.triCount, src = node.leftFirst;
 			const float rpd = rpd3.cell[bestAxis], nmin = nmin3.cell[bestAxis];
@@ -824,7 +845,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount )
 				binMax[2][bi.z] = tinybvh_max( binMax[2][bi.z], fragment[fi].bmax ), count[2][bi.z]++;
 			}
 			// calculate per-split totals
-			float splitCost = 1e30f;
+			float splitCost = 1e30f, rSAV =  1.0f / node.SurfaceArea();
 			unsigned bestAxis = 0, bestPos = 0;
 			for (int a = 0; a < 3; a++) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
 			{
@@ -844,7 +865,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount )
 				// evaluate bin totals to find best position for object split
 				for (unsigned i = 0; i < BVHBINS - 1; i++)
 				{
-					const float C = C_TRAV + C_INT * (ANL[i] + ANR[i]);
+					const float C = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]);
 					if (C < splitCost)
 					{
 						splitCost = C, bestAxis = a, bestPos = i;
@@ -908,7 +929,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount )
 					// find best position for spatial split
 					for (unsigned i = 0; i < BVHBINS - 1; i++)
 					{
-						const float Cspatial = C_TRAV + C_INT * (ANL[i] + ANR[i]);
+						const float Cspatial = C_TRAV + C_INT * rSAV * (ANL[i] + ANR[i]);
 						if (Cspatial < splitCost && NL[i] + NR[i] < budget)
 						{
 							spatial = true, splitCost = Cspatial, bestAxis = a, bestPos = i;
@@ -919,7 +940,8 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount )
 				}
 			}
 			// terminate recursion
-			if (splitCost >= node.CalculateNodeCost()) break;
+			float noSplitCost = (float)node.triCount * C_INT;
+			if (splitCost >= noSplitCost) break; // not splitting is better.
 			// double-buffered partition
 			unsigned A = sliceStart, B = sliceEnd, src = node.leftFirst;
 			if (spatial)
@@ -1228,15 +1250,15 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
 		//           Leaf: 15 bits for tri count, 16 for offset
 		//           Interior: 32 bits for position of child node.
 		// Triangle data ('by value') immediately follows each leaf node.
-		unsigned blocksNeeded = usedBVHNodes * 4; // here, 'block' is 16 bytes.
+		unsigned blocksNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes.
 		blocksNeeded += 6 * triCount; // this layout stores tris in the same buffer.
-		if (allocatedAlt4Blocks < blocksNeeded)
+		if (allocatedAlt4aBlocks < blocksNeeded)
 		{
 			AlignedFree( bvh4Alt );
 			assert( sizeof( BVHNode4Alt ) == 64 );
 			assert( bvh4Node != 0 );
 			bvh4Alt = (bvhvec4*)AlignedAlloc( blocksNeeded * 16 );
-			allocatedAlt4Blocks = blocksNeeded;
+			allocatedAlt4aBlocks = blocksNeeded;
 		}
 		memset( bvh4Alt, 0, 16 * blocksNeeded );
 		// start conversion
@@ -1337,7 +1359,57 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
 	#ifdef __GNUC__
 	#pragma GCC diagnostic pop
 	#endif
-		usedAlt4Blocks = newAlt4Ptr;
+		usedAlt4aBlocks = newAlt4Ptr;
+	}
+	else if (from == BASIC_BVH4 && to == BVH4_AFRA)
+	{
+	#if 0 // under construction
+		// Convert a 4-wide BVH to a format suitable for CPU traversal.
+		// See Faster Incoherent Ray Traversal Using 8-Wide AVX InstructionsLayout,
+		// Atilla T. Áfra, 2013.
+		unsigned spaceNeeded = usedBVH4Nodes * 4; // here, 'block' is 16 bytes.
+		if (allocatedAlt4bNodes < spaceNeeded)
+		{
+			AlignedFree( bvh4Alt2 );
+			assert( sizeof( BVHNode4Alt2 ) == 64 );
+			assert( bvh4Node != 0 );
+			bvh4Alt2 = (BVHNode4Alt2*)AlignedAlloc( spaceNeeded * sizeof( BVHNode4Alt2 ) );
+			allocatedAlt4bNodes = spaceNeeded;
+		}
+		memset( bvh4Alt2, 0, spaceNeeded * sizeof( BVHNode4Alt2 ) );
+		// start conversion
+		unsigned newAlt4Ptr = 0, nodeIdx = 0, stack[128], stackPtr = 0;
+		while (1)
+		{
+			const BVHNode4& orig = bvh4Node[nodeIdx];
+			BVHNode4Alt2& newNode = bvh4Alt2[newAlt4Ptr++];
+		#ifdef __GNUC__
+		#pragma GCC diagnostic push
+		#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+		#endif
+			for (int cidx = 0, i = 0; i < 4; i++) if (orig.child[i])
+			{
+				const BVHNode4& child = bvh4Node[orig.child[i]];
+				((float*)&newNode.xmin4)[cidx] = child.aabbMin.x, ((float*)&newNode.ymin4)[cidx] = child.aabbMin.y;
+				((float*)&newNode.zmin4)[cidx] = child.aabbMin.z, ((float*)&newNode.xmax4)[cidx] = child.aabbMax.x;
+				((float*)&newNode.ymax4)[cidx] = child.aabbMax.y, ((float*)&newNode.zmax4)[cidx] = child.aabbMax.z;
+				if (child.isLeaf())
+					newNode.childFirst[cidx] = orig.firstTri,
+					newNode.triCount[cidx] = orig.triCount;
+				else
+					stack[stackPtr++] = (unsigned)((float*)&newNode.childFirst[cidx] - (float*)bvh4Alt2),
+					stack[stackPtr++] = orig.child[i];
+				cidx++;
+			}
+			// pop next task
+			if (!stackPtr) break;
+			nodeIdx = stack[--stackPtr], ((float*)bvh4Alt2)[stack[--stackPtr]] = newAlt4Ptr;
+		#ifdef __GNUC__
+		#pragma GCC diagnostic pop
+		#endif
+		}
+		usedAlt4bNodes = newAlt4Ptr;
+	#endif
 	}
 	else if (from == WALD_32BYTE && to == BASIC_BVH8)
 	{
@@ -1438,6 +1510,7 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
 				for (int i = 0; i < 8; i++) if (node->child[i] == 0) cost[s][i] = 1e30f; else
 				{
 					BVHNode8* const child = &bvh8Node[node->child[i]];
+					// if (child->triCount > 3 /* must be leaf */) SplitBVH8Leaf( node->child[i], 3 ); 
 					bvhvec3 childCentroid = (child->aabbMin + child->aabbMax) * 0.5f;
 					cost[s][i] = dot( childCentroid - nodeCentroid, ds );
 				}
@@ -1719,7 +1792,7 @@ void BVH::Compact( const BVHLayout layout )
 // necessary to have a single primitive per leaf, it will yield a slightly better
 // optimized BVH. The leafs of the optimized BVH should be collapsed ('MergeLeafs')
 // to obtain the final tree.
-void BVH::SplitLeafs()
+void BVH::SplitLeafs( const unsigned maxPrims )
 {
 	unsigned nodeIdx = 0, stack[64], stackPtr = 0;
 	float fragMinFix = frag_min_flipped ? -1.0f : 1.0f;
@@ -1729,7 +1802,7 @@ void BVH::SplitLeafs()
 		if (!node.isLeaf()) nodeIdx = node.left, stack[stackPtr++] = node.right; else
 		{
 			// split this leaf
-			if (node.triCount > 1)
+			if (node.triCount > maxPrims)
 			{
 				const unsigned newIdx1 = usedVerboseNodes++;
 				const unsigned newIdx2 = usedVerboseNodes++;
@@ -1760,6 +1833,42 @@ void BVH::SplitLeafs()
 	}
 }
 
+// SplitBVH8Leaf: CWBVH requires that a leaf has no more than 3 primitives,
+// but regular BVH construction does not guarantee this. So, here we split
+// busy leafs recursively in multiple leaves, until the requirement is met.
+void BVH::SplitBVH8Leaf( const unsigned nodeIdx, const unsigned maxPrims )
+{
+	float fragMinFix = frag_min_flipped ? -1.0f : 1.0f;
+	BVHNode8& node = bvh8Node[nodeIdx];
+	if (node.triCount <= maxPrims) return; // also catches interior nodes
+	// place all primitives in a new node and make this the first child of 'node'
+	BVHNode8& firstChild = bvh8Node[node.child[0] = usedBVH8Nodes++];
+	firstChild.triCount = node.triCount;
+	firstChild.firstTri = node.firstTri;
+	unsigned nextChild = 1;
+	// share with new sibling nodes
+	while (firstChild.triCount > maxPrims && nextChild < 8)
+	{
+		BVHNode8& child = bvh8Node[node.child[nextChild] = usedBVH8Nodes++];
+		firstChild.triCount -= maxPrims, child.triCount = maxPrims;
+		child.firstTri = firstChild.firstTri + firstChild.triCount;
+	}
+	for ( unsigned i = 0; i < nextChild; i++ )
+	{
+		BVHNode8& child = bvh8Node[node.child[i]];
+		child.aabbMin = bvhvec3( 1e30f ), child.aabbMax = bvhvec3( -1e30f );
+		for ( unsigned j = 0; j < child.triCount; j++ )
+		{
+			unsigned fi = triIdx[child.firstTri + i];
+			child.aabbMin = tinybvh_min( child.aabbMin, fragment[fi].bmin * fragMinFix ); 
+			child.aabbMax = tinybvh_max( child.aabbMax, fragment[fi].bmax ); 
+		}
+	}
+	node.triCount = 0;
+	// recurse; should be rare
+	if (firstChild.triCount > maxPrims) SplitBVH8Leaf( node.child[0], maxPrims );
+}
+
 // MergeLeafs: After optimizing a BVH, single-primitive leafs should be merged whenever
 // SAH indicates this is an improvement.
 void BVH::MergeLeafs()
@@ -2001,10 +2110,7 @@ int BVH::Intersect( Ray& ray, const BVHLayout layout ) const
 	return 0;
 }
 
-// Intersect a buffer of rays with the scene.
-// For now this exists only to establish the interface.
-// A future implementation will exploit the batch to trace the rays faster.
-void BVH::BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout ) const
+void BVH::BatchIntersect( Ray* rayBatch, const unsigned N, const BVHLayout layout, const TraceDevice device ) const
 {
 	for (unsigned i = 0; i < N; i++) Intersect( rayBatch[i], layout );
 }
@@ -2038,7 +2144,7 @@ bool BVH::IsOccluded( const Ray& ray, const BVHLayout layout ) const
 // A future implementation will exploit the batch to trace the rays faster.
 // BatchIsOccluded returns the hits as a bit array in result:
 // Each unsigned integer in this array stores 32 hits.
-void BVH::BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout ) const
+void BVH::BatchIsOccluded( Ray* rayBatch, const unsigned N, unsigned* result, const BVHLayout layout, const TraceDevice device ) const
 {
 	unsigned words = (N + 31 /* round up */) / 32;
 	memset( result, 0, words * 4 );
@@ -2580,7 +2686,7 @@ inline float halfArea( const __m256& a /* a contains aabb itself, with min.xyz n
 }
 #define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \
 	ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; \
-	const float C = C_TRAV + C_INT * ANLR; if (C < splitCost) \
+	const float C = C_TRAV + C_INT * rSAV * ANLR; if (C < splitCost) \
 	splitCost = C, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; }
 #if defined(_MSC_VER)
 #pragma warning ( push )
@@ -2684,7 +2790,7 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount )
 			r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f );
 			binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2;
 			// calculate per-split totals
-			float splitCost = 1e30f;
+			float splitCost = 1e30f, rSAV =  1.0f / node.SurfaceArea();
 			unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst;
 			const __m256* bb = binbox;
 			for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
@@ -2711,7 +2817,8 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned primCount )
 				float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 );
 				float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split
 			}
-			if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
+			float noSplitCost = (float)node.triCount * C_INT;
+			if (splitCost >= noSplitCost) break; // not splitting is better.
 			// in-place partition
 			const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis];
 			unsigned t, fr = triIdx[src];
@@ -3248,7 +3355,7 @@ inline float halfArea( const float32x4x2_t& a /* a contains aabb itself, with mi
 }
 #define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \
 	ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; \
-	const float C = C_TRAV + C_INT * ANLR; if (C < splitCost) \
+	const float C = C_TRAV + C_INT * rSAV * ANLR; if (C < splitCost) \
 	splitCost = C, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; }
 
 void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount )
@@ -3350,7 +3457,7 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount )
 			r2 = vmaxq_f32x2( b2, f );
 			binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2;
 			// calculate per-split totals
-			float splitCost = 1e30f;
+			float splitCost = 1e30f, rSAV =  1.0f / node.SurfaceArea();
 			unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst;
 			const float32x4x2_t* bb = binbox;
 			for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
@@ -3377,7 +3484,8 @@ void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount )
 				float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 );
 				float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split
 			}
-			if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
+			float noSplitCost = (float)node.triCount * C_INT;
+			if (splitCost >= noSplitCost) break; // not splitting is better.
 			// in-place partition
 			const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis];
 			unsigned t, fr = triIdx[src];
diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp
index 9f5db15..357ab8f 100644
--- a/tiny_bvh_fenster.cpp
+++ b/tiny_bvh_fenster.cpp
@@ -1,7 +1,7 @@
 #include "external/fenster.h" // https://github.com/zserge/fenster
 
 // #define USE_EMBREE // enable to verify correct implementation, win64 only for now.
-#define LOADSPONZA
+#define LOADSCENE
 
 #define TINYBVH_IMPLEMENTATION
 #include "tiny_bvh.h"
@@ -19,8 +19,9 @@ void embreeError( void* userPtr, enum RTCError error, const char* str )
 BVH bvh;
 #endif
 
-#ifdef LOADSPONZA
+#ifdef LOADSCENE
 bvhvec4* triangles = 0;
+const char scene[] = "happybuddha.bin";
 #include <fstream>
 #else
 ALIGNED( 16 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{};
@@ -51,14 +52,16 @@ void sphere_flake( float x, float y, float z, float s, int d = 0 )
 
 void Init()
 {
-#ifdef LOADSPONZA
+#ifdef LOADSCENE
 	// load raw vertex data for Crytek's Sponza
-	std::string filename{ "../testdata/cryteksponza.bin" };
+	std::string filename{ "../testdata/" };
+	filename += scene;
 	std::fstream s{ filename, s.binary | s.in };
 	if (!s.is_open())
 	{
 		// try again, look in .\testdata
-		std::string filename{ "./testdata/cryteksponza.bin" };
+		std::string filename{ "./testdata/" };
+		filename += scene;
 		s = std::fstream{ filename, s.binary | s.in };
 		assert( s.is_open() );
 	}
@@ -94,8 +97,9 @@ void Init()
 
 	// build a BVH over the scene
 #if defined(BVH_USEAVX)
-	bvh.BuildAVX( triangles, verts / 3 );
-	bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE );
+	bvh.BuildHQ( triangles, verts / 3 ); 
+	bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH8 );
+	bvh.Convert( BVH::BASIC_BVH8, BVH::CWBVH );
 #elif defined(BVH_USENEON)
 	bvh.BuildNEON( triangles, verts / 3 );
 #else
@@ -110,8 +114,9 @@ void Tick( uint32_t* buf )
 {
 	// setup view pyramid for a pinhole camera: 
 	// eye, p1 (top-left), p2 (top-right) and p3 (bottom-left)
-#ifdef LOADSPONZA
-	bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) );
+#ifdef LOADSCENE
+	// bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) );
+	bvhvec3 eye( 0, 13, 30 ), view = normalize( bvhvec3( 0, 0.01f, -1 ) );
 #else
 	bvhvec3 eye( -3.5f, -1.5f, -6.5f ), view = normalize( bvhvec3( 3, 1.5f, 5 ) );
 #endif
@@ -140,7 +145,9 @@ void Tick( uint32_t* buf )
 	}
 
 	// trace primary rays
-#if defined(USE_EMBREE)
+#if !defined USE_EMBREE
+	for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::CWBVH );
+#else
 	struct RTCRayHit rayhit;
 	for (int i = 0; i < N; i++)
 	{
@@ -152,8 +159,6 @@ void Tick( uint32_t* buf )
 		rays[i].hit.u = rayhit.hit.u, rays[i].hit.u = rayhit.hit.v;
 		rays[i].hit.prim = rayhit.hit.primID, rays[i].hit.t = rayhit.ray.tfar;
 	}
-#else
-	for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::AILA_LAINE );
 #endif
 
 	// visualize result
diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp
index 8d6bda1..1cc927e 100644
--- a/tiny_bvh_speedtest.cpp
+++ b/tiny_bvh_speedtest.cpp
@@ -395,7 +395,7 @@ int main()
 	bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 );
 	bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_GPU );
 	// create OpenCL buffers for the BVH data calculated by tiny_bvh.h
-	tinyocl::Buffer gpu4Nodes( bvh.usedAlt4Blocks * sizeof( tinybvh::bvhvec4 ), bvh.bvh4Alt );
+	tinyocl::Buffer gpu4Nodes( bvh.usedAlt4aBlocks * sizeof( tinybvh::bvhvec4 ), bvh.bvh4Alt );
 	// synchronize the host-side data to the gpu side
 	gpu4Nodes.CopyToDevice();
 #ifndef GPU_2WAY // otherwise these already exist.