diff --git a/tiny_bvh.h b/tiny_bvh.h
index 9d94f32..c0272d5 100644
--- a/tiny_bvh.h
+++ b/tiny_bvh.h
@@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
+// Nov 08, '24: version 0.3.0
 // Oct 30, '24: version 0.1.0 : Initial release.
 // Oct 29, '24: version 0.0.1 : Establishing interface.
 //
@@ -72,7 +73,8 @@ THE SOFTWARE.
 
 // library version
 #define TINY_BVH_VERSION_MAJOR	0
-#define TINY_BVH_VERSION_MINOR	2
+#define TINY_BVH_VERSION_MINOR	3
+#define TINY_BVH_VERSION_SUB	0
 
 // ============================================================================
 //
@@ -236,6 +238,15 @@ static bvhvec3 normalize( const bvhvec3& a )
 	return a * rl;
 }
 
+// SIMD typedef, helps keeping the interface generic
+#ifdef BVH_USEAVX
+typedef __m128 SIMDVEC4;
+#define SIMD_SETVEC(a,b,c,d) _mm_set_ps( a, b, c, d )
+#else
+typedef bvhvec4 SIMDVEC4;
+#define SIMD_SETVEC(a,b,c,d) bvhvec4( d, c, b, a )
+#endif
+
 // ============================================================================
 //
 //        R A Y   T R A C I N G   S T R U C T S  /  C L A S S E S
@@ -298,16 +309,14 @@ class BVH
 		bvhvec3 rmax; unsigned int firstTri; // total: 64 bytes
 		bool isLeaf() const { return triCount > 0; }
 	};
-#ifdef BVH_USEAVX
 	struct BVHNodeAlt2
 	{
 		// Second alternative 64-byte BVH node layout, same as BVHNodeAlt but
 		// with child AABBs stored in SoA order.
-		__m128 xxxx, yyyy, zzzz;
+		SIMDVEC4 xxxx, yyyy, zzzz;
 		unsigned int left, right, triCount, firstTri; // total: 64 bytes
 		bool isLeaf() const { return triCount > 0; }
 	};
-#endif
 	struct Fragment
 	{
 		// A fragment stores the bounds of an input primitive. The name 'Fragment' is from
@@ -352,11 +361,11 @@ class BVH
 	void Refit();
 	int Intersect( Ray& ray, BVHLayout layout = WALD_32BYTE ) const;
 	void Intersect256Rays( Ray* first ) const;
-	void Intersect256RaysSSE( Ray* packet ) const;
+	void Intersect256RaysSSE( Ray* packet ) const; // requires BVH_USEAVX
 private:
 	int Intersect_Wald32Byte( Ray& ray ) const;
 	int Intersect_AilaLaine( Ray& ray ) const;
-	int Intersect_AltSoA( Ray& ray ) const;
+	int Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX
 	void IntersectTri( Ray& ray, const unsigned int triIdx ) const;
 	static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax );
 	static float SA( const bvhvec3& aabbMin, const bvhvec3& aabbMax )
@@ -373,9 +382,7 @@ class BVH
 	unsigned int newNodePtr = 0;	// number of reserved nodes
 	BVHNode* bvhNode = 0;			// BVH node pool, Wald 32-byte format. Root is always in node 0.
 	BVHNodeAlt* altNode = 0;		// BVH node in Aila & Laine format.
-#ifdef BVH_USEAVX
 	BVHNodeAlt2* alt2Node = 0;		// BVH node in Aila & Laine (SoA version) format.
-#endif
 };
 
 } // namespace tinybvh
@@ -545,7 +552,6 @@ void BVH::Convert( BVHLayout from, BVHLayout to, bool deleteOriginal )
 			}
 		}
 	}
-#ifdef BVH_USEAVX
 	else if (from == WALD_32BYTE && to == ALT_SOA)
 	{
 		// allocate space
@@ -570,9 +576,11 @@ void BVH::Convert( BVHLayout from, BVHLayout to, bool deleteOriginal )
 			{
 				const BVHNode& left = bvhNode[node.leftFirst];
 				const BVHNode& right = bvhNode[node.leftFirst + 1];
-				alt2Node[idx].xxxx = _mm_setr_ps( left.aabbMin.x, left.aabbMax.x, right.aabbMin.x, right.aabbMax.x );
-				alt2Node[idx].yyyy = _mm_setr_ps( left.aabbMin.y, left.aabbMax.y, right.aabbMin.y, right.aabbMax.y );
-				alt2Node[idx].zzzz = _mm_setr_ps( left.aabbMin.z, left.aabbMax.z, right.aabbMin.z, right.aabbMax.z );
+				// This BVH layout requires BVH_USEAVX for traversal, but at least we
+				// can convert to it without SSE/AVX/NEON support.
+				alt2Node[idx].xxxx = SIMD_SETVEC( left.aabbMin.x, left.aabbMax.x, right.aabbMin.x, right.aabbMax.x );
+				alt2Node[idx].yyyy = SIMD_SETVEC( left.aabbMin.y, left.aabbMax.y, right.aabbMin.y, right.aabbMax.y );
+				alt2Node[idx].zzzz = SIMD_SETVEC( left.aabbMin.z, left.aabbMax.z, right.aabbMin.z, right.aabbMax.z );
 				alt2Node[idx].left = newAlt2Node; // right will be filled when popped
 				stack[stackPtr++] = idx;
 				stack[stackPtr++] = node.leftFirst + 1;
@@ -580,7 +588,6 @@ void BVH::Convert( BVHLayout from, BVHLayout to, bool deleteOriginal )
 			}
 		}
 	}
-#endif
 	else
 	{
 		// For now all other conversions are invalid.
@@ -588,186 +595,6 @@ void BVH::Convert( BVHLayout from, BVHLayout to, bool deleteOriginal )
 	}
 }
 
-#ifdef BVH_USEAVX
-
-// Ultra-fast single-threaded AVX binned-SAH-builder.
-// This code produces BVHs nearly identical to reference, but much faster.
-// On a 12th gen laptop i7 CPU, Sponza Crytek (~260k tris) is processed in 51ms.
-// The code relies on the availability of AVX instructions. AVX2 is not needed.
-#ifdef _MSC_VER
-#define LANE(a,b) a.m128_f32[b]
-// Not using clang/g++ method under MSCC; compiler may benefit from .m128_i32.
-#define ILANE(a,b) a.m128i_i32[b]
-#else
-#define LANE(a,b) a[b]
-// Below method reduces to a single instruction.
-#define ILANE(a,b) _mm_cvtsi128_si32(_mm_castps_si128( _mm_shuffle_ps(_mm_castsi128_ps( a ), _mm_castsi128_ps( a ), b)))
-#endif
-inline float halfArea( const __m128 a /* a contains extent of aabb */ )
-{
-	return LANE( a, 0 ) * LANE( a, 1 ) + LANE( a, 1 ) * LANE( a, 2 ) + LANE( a, 2 ) * LANE( a, 3 );
-}
-inline float halfArea( const __m256 a /* a contains aabb itself, with min.xyz negated */ )
-{
-	const __m128 q = _mm256_castps256_ps128( _mm256_add_ps( _mm256_permute2f128_ps( a, a, 5 ), a ) );
-	const __m128 v = _mm_mul_ps( q, _mm_shuffle_ps( q, q, 9 ) );
-	return LANE( v, 0 ) + LANE( v, 1 ) + LANE( v, 2 );
-}
-#define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \
-	ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; if (ANLR < splitCost) \
-	splitCost = ANLR, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; }
-#if defined(_MSC_VER)
-#pragma warning ( push )
-#pragma warning( disable:4701 ) // "potentially uninitialized local variable 'bestLBox' used"
-#elif defined(__GNUC__) && !defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#endif
-void BVH::BuildAVX( const bvhvec4* vertices, const unsigned int primCount )
-{
-	int test = BVHBINS;
-	if (test != 8) assert( false ); // AVX builders require BVHBINS == 8.
-	// aligned data
-	ALIGNED( 64 ) __m256 binbox[3 * BVHBINS];				// 768 bytes
-	ALIGNED( 64 ) __m256 binboxOrig[3 * BVHBINS];			// 768 bytes
-	ALIGNED( 64 ) unsigned int count[3][BVHBINS]{};			// 96 bytes
-	ALIGNED( 64 ) __m256 bestLBox, bestRBox;				// 64 bytes
-	// some constants
-	static const __m128 max4 = _mm_set1_ps( -1e30f ), half4 = _mm_set1_ps( 0.5f );
-	static const __m128 two4 = _mm_set1_ps( 2.0f ), min1 = _mm_set1_ps( -1 );
-	static const __m256 max8 = _mm256_set1_ps( -1e30f );
-	static const __m256 signFlip8 = _mm256_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f );
-	static const __m128 signFlip4 = _mm_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f );
-	static const __m128 mask3 = _mm_cmpeq_ps( _mm_setr_ps( 0, 0, 0, 1 ), _mm_setzero_ps() );
-	static const __m128 binmul3 = _mm_set1_ps( BVHBINS * 0.49999f );
-	for (unsigned int i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template
-	// reset node pool
-	newNodePtr = 2;
-	if (!bvhNode)
-	{
-		triCount = primCount;
-		verts = (bvhvec4*)vertices;
-		triIdx = new unsigned int[triCount];
-		bvhNode = (BVHNode*)ALIGNED_MALLOC( triCount * 2 * sizeof( BVHNode ) );
-		memset( &bvhNode[1], 0, 32 ); // avoid crash in refit.
-		fragment = new Fragment[triCount];
-		idxCount = triCount;
-	}
-	else assert( triCount == primCount ); // don't change triangle count between builds.
-	struct FragSSE { __m128 bmin4, bmax4; };
-	FragSSE* frag4 = (FragSSE*)fragment;
-	__m256* frag8 = (__m256*)fragment;
-	const __m128* tris4 = (__m128*)verts;
-	// assign all triangles to the root node
-	BVHNode& root = bvhNode[0];
-	root.leftFirst = 0, root.triCount = triCount;
-	// initialize fragments and update root bounds
-	__m128 rootMin = max4, rootMax = max4;
-	for (unsigned int i = 0; i < triCount; i++)
-	{
-		const __m128 v1 = _mm_xor_ps( signFlip4, _mm_min_ps( _mm_min_ps( tris4[i * 3], tris4[i * 3 + 1] ), tris4[i * 3 + 2] ) );
-		const __m128 v2 = _mm_max_ps( _mm_max_ps( tris4[i * 3], tris4[i * 3 + 1] ), tris4[i * 3 + 2] );
-		frag4[i].bmin4 = v1, frag4[i].bmax4 = v2, rootMin = _mm_max_ps( rootMin, v1 ), rootMax = _mm_max_ps( rootMax, v2 ), triIdx[i] = i;
-	}
-	rootMin = _mm_xor_ps( rootMin, signFlip4 );
-	root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax;
-	// subdivide recursively
-	ALIGNED( 64 ) unsigned int task[128], taskCount = 0, nodeIdx = 0;
-	const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-10f;
-	while (1)
-	{
-		while (1)
-		{
-			BVHNode& node = bvhNode[nodeIdx];
-			__m128* node4 = (__m128*) & bvhNode[nodeIdx];
-			// find optimal object split
-			const __m128 d4 = _mm_blendv_ps( min1, _mm_sub_ps( node4[1], node4[0] ), mask3 );
-			const __m128 nmin4 = _mm_mul_ps( _mm_and_ps( node4[0], mask3 ), two4 );
-			const __m128 rpd4 = _mm_and_ps( _mm_div_ps( binmul3, d4 ), _mm_cmpneq_ps( d4, _mm_setzero_ps() ) );
-			// implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies":
-			// main loop operates on two fragments to minimize dependencies and maximize ILP.
-			unsigned int fi = triIdx[node.leftFirst];
-			memset( count, 0, sizeof( count ) );
-			__m256 r0, r1, r2, f = frag8[fi];
-			__m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) );
-			memcpy( binbox, binboxOrig, sizeof( binbox ) );
-			unsigned int i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1;
-			for (unsigned int i = 0; i < node.triCount - 1; i++)
-			{
-				const unsigned int fid = *ti++;
-				const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2];
-				const __m128 fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4;
-				r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f );
-				const __m128i b4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( fmax, fmin ), nmin4 ), rpd4 ), half4 ) );
-				f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++;
-				binbox[i0] = r0, i0 = ILANE( b4, 0 );
-				binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 );
-				binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 );
-			}
-			// final business for final fragment
-			const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2];
-			count[0][i0]++, count[1][i1]++, count[2][i2]++;
-			r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f );
-			binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2;
-			// calculate per-split totals
-			float splitCost = 1e30f;
-			unsigned int bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst;
-			const __m256* bb = binbox;
-			for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
-			{
-				// hardcoded bin processing for BVHBINS == 8, see end of file for generic code.
-				assert( BVHBINS == 8 );
-				const unsigned int lN0 = count[a][0], rN0 = count[a][7];
-				const __m256 lb0 = bb[0], rb0 = bb[7];
-				const unsigned int lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2];
-				const unsigned int rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4];
-				const __m256 lb1 = _mm256_max_ps( lb0, bb[1] ), rb1 = _mm256_max_ps( rb0, bb[6] );
-				const __m256 lb2 = _mm256_max_ps( lb1, bb[2] ), rb2 = _mm256_max_ps( rb1, bb[5] );
-				const __m256 lb3 = _mm256_max_ps( lb2, bb[3] ), rb3 = _mm256_max_ps( rb2, bb[4] );
-				const unsigned int lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5];
-				const unsigned int rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1];
-				const __m256 lb4 = _mm256_max_ps( lb3, bb[4] ), rb4 = _mm256_max_ps( rb3, bb[3] );
-				const __m256 lb5 = _mm256_max_ps( lb4, bb[5] ), rb5 = _mm256_max_ps( rb4, bb[2] );
-				const __m256 lb6 = _mm256_max_ps( lb5, bb[6] ), rb6 = _mm256_max_ps( rb5, bb[1] );
-				float ANLR3 = 1e30f; PROCESS_PLANE( a, 3, ANLR3, lN3, rN3, lb3, rb3 ); // most likely split
-				float ANLR2 = 1e30f; PROCESS_PLANE( a, 2, ANLR2, lN2, rN4, lb2, rb4 );
-				float ANLR4 = 1e30f; PROCESS_PLANE( a, 4, ANLR4, lN4, rN2, lb4, rb2 );
-				float ANLR5 = 1e30f; PROCESS_PLANE( a, 5, ANLR5, lN5, rN1, lb5, rb1 );
-				float ANLR1 = 1e30f; PROCESS_PLANE( a, 1, ANLR1, lN1, rN5, lb1, rb5 );
-				float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 );
-				float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split
-			}
-			if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
-			// in-place partition
-			const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis];
-			unsigned int t, fr = triIdx[src];
-			for (unsigned int i = 0; i < node.triCount; i++)
-			{
-				const unsigned int bi = (unsigned int)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd);
-				if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t;
-			}
-			// create child nodes and recurse
-			const unsigned int leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount;
-			if (leftCount == 0 || rightCount == 0) break; // should not happen.
-			*(__m256*)& bvhNode[n] = _mm256_xor_ps( bestLBox, signFlip8 );
-			bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount;
-			node.leftFirst = n++, node.triCount = 0, newNodePtr += 2;
-			*(__m256*)& bvhNode[n] = _mm256_xor_ps( bestRBox, signFlip8 );
-			bvhNode[n].leftFirst = j, bvhNode[n].triCount = rightCount;
-			task[taskCount++] = n, nodeIdx = n - 1;
-		}
-		// fetch subdivision task from stack
-		if (taskCount == 0) break; else nodeIdx = task[--taskCount];
-	}
-}
-#if defined(_MSC_VER)
-#pragma warning ( pop ) // restore 4701
-#elif defined(__GNUC__) && !defined(__clang__)
-#pragma GCC diagnostic pop // restore -Wmaybe-uninitialized
-#endif
-
-#endif
-
 // Refitting: For animated meshes, where the topology remains intact. This
 // includes trees waving in the wind, or subsequent frames for skinned
 // animations. Repeated refitting tends to lead to deteriorated BVHs and
@@ -811,11 +638,9 @@ int BVH::Intersect( Ray& ray, BVHLayout layout ) const
 	case AILA_LAINE:
 		return Intersect_AilaLaine( ray );
 		break;
-	#ifdef BVH_USEAVX
 	case ALT_SOA:
 		return Intersect_AltSoA( ray );
 		break;
-	#endif
 	default:
 		assert( false );
 	};
@@ -899,131 +724,46 @@ int BVH::Intersect_AilaLaine( Ray& ray ) const
 	return steps;
 }
 
-#ifdef BVH_USEAVX
-
-// Traverse the second alternative BVH layout (ALT_SOA).
-int BVH::Intersect_AltSoA( Ray& ray ) const
+// Intersect a WALD_32BYTE BVH with a ray packet.
+// The 256 rays travel together to better utilize the caches and to amortize the cost 
+// of memory transfers over the rays in the bundle.
+// Note that this basic implementation assumes a specific layout of the rays. Provided
+// as 'proof of concept', should not be used in production code.
+// Based on Large Ray Packets for Real-time Whitted Ray Tracing, Overbeck et al., 2008,
+// extended with sorted traversal and reduced stack traffic.
+void BVH::Intersect256Rays( Ray* packet ) const
 {
-	assert( alt2Node != 0 );
-	BVHNodeAlt2* node = &alt2Node[0], * stack[64];
-	unsigned int stackPtr = 0, steps = 0;
-	const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( ray.rD.x );
-	const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y );
-	const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z );
+	// convenience macro
+#define CALC_TMIN_TMAX_WITH_SLABTEST_ON_RAY( r ) const bvhvec3 rD = packet[r].rD, t1 = o1 * rD, t2 = o2 * rD; \
+	const float tmin = tinybvh_max( tinybvh_max( tinybvh_min( t1.x, t2.x ), tinybvh_min( t1.y, t2.y ) ), tinybvh_min( t1.z, t2.z ) ); \
+	const float tmax = tinybvh_min( tinybvh_min( tinybvh_max( t1.x, t2.x ), tinybvh_max( t1.y, t2.y ) ), tinybvh_max( t1.z, t2.z ) );
+	// Corner rays are: 0, 51, 204 and 255
+	// Construct the bounding planes, with normals pointing outwards
+	const bvhvec3 O = packet[0].O; // same for all rays in this case
+	const bvhvec3 p0 = packet[0].O + packet[0].D; // top-left
+	const bvhvec3 p1 = packet[51].O + packet[51].D; // top-right
+	const bvhvec3 p2 = packet[204].O + packet[204].D; // bottom-left
+	const bvhvec3 p3 = packet[255].O + packet[255].D; // bottom-right
+	const bvhvec3 plane0 = normalize( cross( p0 - O, p0 - p2 ) ); // left plane
+	const bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane
+	const bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane
+	const bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane
+	const int sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2;
+	const int sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2;
+	const int sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2;
+	const int sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2;
+	const float d0 = dot( O, plane0 ), d1 = dot( O, plane1 );
+	const float d2 = dot( O, plane2 ), d3 = dot( O, plane3 );
+	// Traverse the tree with the packet
+	int first = 0, last = 255; // first and last active ray in the packet
+	const BVHNode* node = &bvhNode[0];
+	ALIGNED( 64 ) unsigned int stack[64], stackPtr = 0;
 	while (1)
 	{
-		steps++;
 		if (node->isLeaf())
 		{
-			for (unsigned int i = 0; i < node->triCount; i++)
-			{
-				const unsigned int tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3;
-				const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx];
-				const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx];
-				const bvhvec3 h = cross( ray.D, edge2 );
-				const float a = dot( edge1, h );
-				if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle
-				const float f = 1 / a;
-				const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] );
-				const float u = f * dot( s, h );
-				if (u < 0 || u > 1) continue;
-				const bvhvec3 q = cross( s, edge1 );
-				const float v = f * dot( ray.D, q );
-				if (v < 0 || u + v > 1) continue;
-				const float t = f * dot( edge2, q );
-				if (t < 0 || t > ray.hit.t) continue;
-				ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = tidx;
-			}
-			if (stackPtr == 0) break; else node = stack[--stackPtr];
-			continue;
-		}
-		__m128 x4 = _mm_mul_ps( _mm_sub_ps( node->xxxx, Ox4 ), rDx4 );
-		__m128 y4 = _mm_mul_ps( _mm_sub_ps( node->yyyy, Oy4 ), rDy4 );
-		__m128 z4 = _mm_mul_ps( _mm_sub_ps( node->zzzz, Oz4 ), rDz4 );
-		// transpose
-		__m128 t0 = _mm_unpacklo_ps( x4, y4 ), t2 = _mm_unpacklo_ps( z4, z4 );
-		__m128 t1 = _mm_unpackhi_ps( x4, y4 ), t3 = _mm_unpackhi_ps( z4, z4 );
-		__m128 xyzw1a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) );
-		__m128 xyzw2a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) );
-		__m128 xyzw1b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) );
-		__m128 xyzw2b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) );
-		// process
-		__m128 tmina4 = _mm_min_ps( xyzw1a, xyzw2a ), tmaxa4 = _mm_max_ps( xyzw1a, xyzw2a );
-		__m128 tminb4 = _mm_min_ps( xyzw1b, xyzw2b ), tmaxb4 = _mm_max_ps( xyzw1b, xyzw2b );
-		// transpose back
-		t0 = _mm_unpacklo_ps( tmina4, tmaxa4 ), t2 = _mm_unpacklo_ps( tminb4, tmaxb4 );
-		t1 = _mm_unpackhi_ps( tmina4, tmaxa4 ), t3 = _mm_unpackhi_ps( tminb4, tmaxb4 );
-		x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) );
-		y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) );
-		z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) );
-		const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() );
-		const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) );
-		// TODO: use a shuffle here to do the comparison / select with SSE, then extract dist1 and dist2. 
-		const float tmina_0 = LANE( min4, 0 ), tmaxa_1 = LANE( max4, 1 );
-		const float tminb_2 = LANE( min4, 2 ), tmaxb_3 = LANE( max4, 3 );
-		float dist1 = tmaxa_1 >= tmina_0 ? tmina_0 : 1e30f;
-		float dist2 = tmaxb_3 >= tminb_2 ? tminb_2 : 1e30f;
-		unsigned int lidx = node->left, ridx = node->right;
-		if (dist1 > dist2)
-		{
-			float t = dist1; dist1 = dist2; dist2 = t;
-			unsigned int i = lidx; lidx = ridx; ridx = i;
-		}
-		if (dist1 == 1e30f)
-		{
-			if (stackPtr == 0) break; else node = stack[--stackPtr];
-		}
-		else
-		{
-			node = alt2Node + lidx;
-			if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx;
-		}
-	}
-	return steps;
-}
-
-#endif
-
-// Intersect a WALD_32BYTE BVH with a ray packet.
-// The 256 rays travel together to better utilize the caches and to amortize the cost 
-// of memory transfers over the rays in the bundle.
-// Note that this basic implementation assumes a specific layout of the rays. Provided
-// as 'proof of concept', should not be used in production code.
-// Based on Large Ray Packets for Real-time Whitted Ray Tracing, Overbeck et al., 2008,
-// extended with sorted traversal and reduced stack traffic.
-void BVH::Intersect256Rays( Ray* packet ) const
-{
-	// convenience macro
-#define CALC_TMIN_TMAX_WITH_SLABTEST_ON_RAY( r ) const bvhvec3 rD = packet[r].rD, t1 = o1 * rD, t2 = o2 * rD; \
-	const float tmin = tinybvh_max( tinybvh_max( tinybvh_min( t1.x, t2.x ), tinybvh_min( t1.y, t2.y ) ), tinybvh_min( t1.z, t2.z ) ); \
-	const float tmax = tinybvh_min( tinybvh_min( tinybvh_max( t1.x, t2.x ), tinybvh_max( t1.y, t2.y ) ), tinybvh_max( t1.z, t2.z ) );
-	// Corner rays are: 0, 51, 204 and 255
-	// Construct the bounding planes, with normals pointing outwards
-	const bvhvec3 O = packet[0].O; // same for all rays in this case
-	const bvhvec3 p0 = packet[0].O + packet[0].D; // top-left
-	const bvhvec3 p1 = packet[51].O + packet[51].D; // top-right
-	const bvhvec3 p2 = packet[204].O + packet[204].D; // bottom-left
-	const bvhvec3 p3 = packet[255].O + packet[255].D; // bottom-right
-	const bvhvec3 plane0 = normalize( cross( p0 - O, p0 - p2 ) ); // left plane
-	const bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane
-	const bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane
-	const bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane
-	const int sign0x = plane0.x < 0 ? 4 : 0, sign0y = plane0.y < 0 ? 5 : 1, sign0z = plane0.z < 0 ? 6 : 2;
-	const int sign1x = plane1.x < 0 ? 4 : 0, sign1y = plane1.y < 0 ? 5 : 1, sign1z = plane1.z < 0 ? 6 : 2;
-	const int sign2x = plane2.x < 0 ? 4 : 0, sign2y = plane2.y < 0 ? 5 : 1, sign2z = plane2.z < 0 ? 6 : 2;
-	const int sign3x = plane3.x < 0 ? 4 : 0, sign3y = plane3.y < 0 ? 5 : 1, sign3z = plane3.z < 0 ? 6 : 2;
-	const float d0 = dot( O, plane0 ), d1 = dot( O, plane1 );
-	const float d2 = dot( O, plane2 ), d3 = dot( O, plane3 );
-	// Traverse the tree with the packet
-	int first = 0, last = 255; // first and last active ray in the packet
-	const BVHNode* node = &bvhNode[0];
-	ALIGNED( 64 ) unsigned int stack[64], stackPtr = 0;
-	while (1)
-	{
-		if (node->isLeaf())
-		{
-			// handle leaf node
-			for (unsigned int j = 0; j < node->triCount; j++)
+			// handle leaf node
+			for (unsigned int j = 0; j < node->triCount; j++)
 			{
 				const unsigned int idx = triIdx[node->leftFirst + j], vid = idx * 3;
 				const bvhvec3 edge1 = verts[vid + 1] - verts[vid], edge2 = verts[vid + 2] - verts[vid];
@@ -1155,8 +895,230 @@ void BVH::Intersect256Rays( Ray* packet ) const
 	}
 }
 
+// IntersectTri
+void BVH::IntersectTri( Ray& ray, const unsigned int idx ) const
+{
+	// Moeller-Trumbore ray/triangle intersection algorithm
+	const unsigned int vertIdx = idx * 3;
+	const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx];
+	const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx];
+	const bvhvec3 h = cross( ray.D, edge2 );
+	const float a = dot( edge1, h );
+	if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle
+	const float f = 1 / a;
+	const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] );
+	const float u = f * dot( s, h );
+	if (u < 0 || u > 1) return;
+	const bvhvec3 q = cross( s, edge1 );
+	const float v = f * dot( ray.D, q );
+	if (v < 0 || u + v > 1) return;
+	const float t = f * dot( edge2, q );
+	if (t > 0 && t < ray.hit.t)
+	{
+		// register a hit: ray is shortened to t
+		ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx;
+	}
+}
+
+// IntersectAABB
+float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax )
+{
+	// "slab test" ray/AABB intersection
+	float tx1 = (aabbMin.x - ray.O.x) * ray.rD.x, tx2 = (aabbMax.x - ray.O.x) * ray.rD.x;
+	float tmin = tinybvh_min( tx1, tx2 ), tmax = tinybvh_max( tx1, tx2 );
+	float ty1 = (aabbMin.y - ray.O.y) * ray.rD.y, ty2 = (aabbMax.y - ray.O.y) * ray.rD.y;
+	tmin = tinybvh_max( tmin, tinybvh_min( ty1, ty2 ) );
+	tmax = tinybvh_min( tmax, tinybvh_max( ty1, ty2 ) );
+	float tz1 = (aabbMin.z - ray.O.z) * ray.rD.z, tz2 = (aabbMax.z - ray.O.z) * ray.rD.z;
+	tmin = tinybvh_max( tmin, tinybvh_min( tz1, tz2 ) );
+	tmax = tinybvh_min( tmax, tinybvh_max( tz1, tz2 ) );
+	if (tmax >= tmin && tmin < ray.hit.t && tmax >= 0) return tmin; else return 1e30f;
+}
+
+// ============================================================================
+//
+//        I M P L E M E N T A T I O N  -  S I M D  C O D E
+// 
+// ============================================================================
+
 #ifdef BVH_USEAVX
 
+// Ultra-fast single-threaded AVX binned-SAH-builder.
+// This code produces BVHs nearly identical to reference, but much faster.
+// On a 12th gen laptop i7 CPU, Sponza Crytek (~260k tris) is processed in 51ms.
+// The code relies on the availability of AVX instructions. AVX2 is not needed.
+#ifdef _MSC_VER
+#define LANE(a,b) a.m128_f32[b]
+// Not using clang/g++ method under MSCC; compiler may benefit from .m128_i32.
+#define ILANE(a,b) a.m128i_i32[b]
+#else
+#define LANE(a,b) a[b]
+// Below method reduces to a single instruction.
+#define ILANE(a,b) _mm_cvtsi128_si32(_mm_castps_si128( _mm_shuffle_ps(_mm_castsi128_ps( a ), _mm_castsi128_ps( a ), b)))
+#endif
+inline float halfArea( const __m128 a /* a contains extent of aabb */ )
+{
+	return LANE( a, 0 ) * LANE( a, 1 ) + LANE( a, 1 ) * LANE( a, 2 ) + LANE( a, 2 ) * LANE( a, 3 );
+}
+inline float halfArea( const __m256 a /* a contains aabb itself, with min.xyz negated */ )
+{
+	const __m128 q = _mm256_castps256_ps128( _mm256_add_ps( _mm256_permute2f128_ps( a, a, 5 ), a ) );
+	const __m128 v = _mm_mul_ps( q, _mm_shuffle_ps( q, q, 9 ) );
+	return LANE( v, 0 ) + LANE( v, 1 ) + LANE( v, 2 );
+}
+#define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \
+	ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; if (ANLR < splitCost) \
+	splitCost = ANLR, bestAxis = a, bestPos = pos, bestLBox = lb, bestRBox = rb; }
+#if defined(_MSC_VER)
+#pragma warning ( push )
+#pragma warning( disable:4701 ) // "potentially uninitialized local variable 'bestLBox' used"
+#elif defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+void BVH::BuildAVX( const bvhvec4* vertices, const unsigned int primCount )
+{
+	int test = BVHBINS;
+	if (test != 8) assert( false ); // AVX builders require BVHBINS == 8.
+	// aligned data
+	ALIGNED( 64 ) __m256 binbox[3 * BVHBINS];				// 768 bytes
+	ALIGNED( 64 ) __m256 binboxOrig[3 * BVHBINS];			// 768 bytes
+	ALIGNED( 64 ) unsigned int count[3][BVHBINS]{};			// 96 bytes
+	ALIGNED( 64 ) __m256 bestLBox, bestRBox;				// 64 bytes
+	// some constants
+	static const __m128 max4 = _mm_set1_ps( -1e30f ), half4 = _mm_set1_ps( 0.5f );
+	static const __m128 two4 = _mm_set1_ps( 2.0f ), min1 = _mm_set1_ps( -1 );
+	static const __m256 max8 = _mm256_set1_ps( -1e30f );
+	static const __m256 signFlip8 = _mm256_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f );
+	static const __m128 signFlip4 = _mm_setr_ps( -0.0f, -0.0f, -0.0f, 0.0f );
+	static const __m128 mask3 = _mm_cmpeq_ps( _mm_setr_ps( 0, 0, 0, 1 ), _mm_setzero_ps() );
+	static const __m128 binmul3 = _mm_set1_ps( BVHBINS * 0.49999f );
+	for (unsigned int i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template
+	// reset node pool
+	newNodePtr = 2;
+	if (!bvhNode)
+	{
+		triCount = primCount;
+		verts = (bvhvec4*)vertices;
+		triIdx = new unsigned int[triCount];
+		bvhNode = (BVHNode*)ALIGNED_MALLOC( triCount * 2 * sizeof( BVHNode ) );
+		memset( &bvhNode[1], 0, 32 ); // avoid crash in refit.
+		fragment = new Fragment[triCount];
+		idxCount = triCount;
+	}
+	else assert( triCount == primCount ); // don't change triangle count between builds.
+	struct FragSSE { __m128 bmin4, bmax4; };
+	FragSSE* frag4 = (FragSSE*)fragment;
+	__m256* frag8 = (__m256*)fragment;
+	const __m128* tris4 = (__m128*)verts;
+	// assign all triangles to the root node
+	BVHNode& root = bvhNode[0];
+	root.leftFirst = 0, root.triCount = triCount;
+	// initialize fragments and update root bounds
+	__m128 rootMin = max4, rootMax = max4;
+	for (unsigned int i = 0; i < triCount; i++)
+	{
+		const __m128 v1 = _mm_xor_ps( signFlip4, _mm_min_ps( _mm_min_ps( tris4[i * 3], tris4[i * 3 + 1] ), tris4[i * 3 + 2] ) );
+		const __m128 v2 = _mm_max_ps( _mm_max_ps( tris4[i * 3], tris4[i * 3 + 1] ), tris4[i * 3 + 2] );
+		frag4[i].bmin4 = v1, frag4[i].bmax4 = v2, rootMin = _mm_max_ps( rootMin, v1 ), rootMax = _mm_max_ps( rootMax, v2 ), triIdx[i] = i;
+	}
+	rootMin = _mm_xor_ps( rootMin, signFlip4 );
+	root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax;
+	// subdivide recursively
+	ALIGNED( 64 ) unsigned int task[128], taskCount = 0, nodeIdx = 0;
+	const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-10f;
+	while (1)
+	{
+		while (1)
+		{
+			BVHNode& node = bvhNode[nodeIdx];
+			__m128* node4 = (__m128*) & bvhNode[nodeIdx];
+			// find optimal object split
+			const __m128 d4 = _mm_blendv_ps( min1, _mm_sub_ps( node4[1], node4[0] ), mask3 );
+			const __m128 nmin4 = _mm_mul_ps( _mm_and_ps( node4[0], mask3 ), two4 );
+			const __m128 rpd4 = _mm_and_ps( _mm_div_ps( binmul3, d4 ), _mm_cmpneq_ps( d4, _mm_setzero_ps() ) );
+			// implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies":
+			// main loop operates on two fragments to minimize dependencies and maximize ILP.
+			unsigned int fi = triIdx[node.leftFirst];
+			memset( count, 0, sizeof( count ) );
+			__m256 r0, r1, r2, f = frag8[fi];
+			__m128i bi4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) );
+			memcpy( binbox, binboxOrig, sizeof( binbox ) );
+			unsigned int i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1;
+			for (unsigned int i = 0; i < node.triCount - 1; i++)
+			{
+				const unsigned int fid = *ti++;
+				const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2];
+				const __m128 fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4;
+				r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f );
+				const __m128i b4 = _mm_cvtps_epi32( _mm_sub_ps( _mm_mul_ps( _mm_sub_ps( _mm_sub_ps( fmax, fmin ), nmin4 ), rpd4 ), half4 ) );
+				f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++;
+				binbox[i0] = r0, i0 = ILANE( b4, 0 );
+				binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 );
+				binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 );
+			}
+			// final business for final fragment
+			const __m256 b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2];
+			count[0][i0]++, count[1][i1]++, count[2][i2]++;
+			r0 = _mm256_max_ps( b0, f ), r1 = _mm256_max_ps( b1, f ), r2 = _mm256_max_ps( b2, f );
+			binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2;
+			// calculate per-split totals
+			float splitCost = 1e30f;
+			unsigned int bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst;
+			const __m256* bb = binbox;
+			for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a])
+			{
+				// hardcoded bin processing for BVHBINS == 8, see end of file for generic code.
+				assert( BVHBINS == 8 );
+				const unsigned int lN0 = count[a][0], rN0 = count[a][7];
+				const __m256 lb0 = bb[0], rb0 = bb[7];
+				const unsigned int lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2];
+				const unsigned int rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4];
+				const __m256 lb1 = _mm256_max_ps( lb0, bb[1] ), rb1 = _mm256_max_ps( rb0, bb[6] );
+				const __m256 lb2 = _mm256_max_ps( lb1, bb[2] ), rb2 = _mm256_max_ps( rb1, bb[5] );
+				const __m256 lb3 = _mm256_max_ps( lb2, bb[3] ), rb3 = _mm256_max_ps( rb2, bb[4] );
+				const unsigned int lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5];
+				const unsigned int rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1];
+				const __m256 lb4 = _mm256_max_ps( lb3, bb[4] ), rb4 = _mm256_max_ps( rb3, bb[3] );
+				const __m256 lb5 = _mm256_max_ps( lb4, bb[5] ), rb5 = _mm256_max_ps( rb4, bb[2] );
+				const __m256 lb6 = _mm256_max_ps( lb5, bb[6] ), rb6 = _mm256_max_ps( rb5, bb[1] );
+				float ANLR3 = 1e30f; PROCESS_PLANE( a, 3, ANLR3, lN3, rN3, lb3, rb3 ); // most likely split
+				float ANLR2 = 1e30f; PROCESS_PLANE( a, 2, ANLR2, lN2, rN4, lb2, rb4 );
+				float ANLR4 = 1e30f; PROCESS_PLANE( a, 4, ANLR4, lN4, rN2, lb4, rb2 );
+				float ANLR5 = 1e30f; PROCESS_PLANE( a, 5, ANLR5, lN5, rN1, lb5, rb1 );
+				float ANLR1 = 1e30f; PROCESS_PLANE( a, 1, ANLR1, lN1, rN5, lb1, rb5 );
+				float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 );
+				float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split
+			}
+			if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better.
+			// in-place partition
+			const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis];
+			unsigned int t, fr = triIdx[src];
+			for (unsigned int i = 0; i < node.triCount; i++)
+			{
+				const unsigned int bi = (unsigned int)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd);
+				if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t;
+			}
+			// create child nodes and recurse
+			const unsigned int leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount;
+			if (leftCount == 0 || rightCount == 0) break; // should not happen.
+			*(__m256*)& bvhNode[n] = _mm256_xor_ps( bestLBox, signFlip8 );
+			bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount;
+			node.leftFirst = n++, node.triCount = 0, newNodePtr += 2;
+			*(__m256*)& bvhNode[n] = _mm256_xor_ps( bestRBox, signFlip8 );
+			bvhNode[n].leftFirst = j, bvhNode[n].triCount = rightCount;
+			task[taskCount++] = n, nodeIdx = n - 1;
+		}
+		// fetch subdivision task from stack
+		if (taskCount == 0) break; else nodeIdx = task[--taskCount];
+	}
+}
+#if defined(_MSC_VER)
+#pragma warning ( pop ) // restore 4701
+#elif defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop // restore -Wmaybe-uninitialized
+#endif
+
 // Intersect a BVH with a ray packet, basic SSE-optimized version.
 // Note: This yields +10% on 10th gen Intel CPUs, but a small loss on
 // more recent hardware. This function needs a full conversion to work
@@ -1352,52 +1314,100 @@ void BVH::Intersect256RaysSSE( Ray* packet ) const
 	}
 }
 
-#endif
-
-// IntersectTri
-void BVH::IntersectTri( Ray& ray, const unsigned int idx ) const
+// Traverse the second alternative BVH layout (ALT_SOA).
+int BVH::Intersect_AltSoA( Ray& ray ) const
 {
-	// Moeller-Trumbore ray/triangle intersection algorithm
-	const unsigned int vertIdx = idx * 3;
-	const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx];
-	const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx];
-	const bvhvec3 h = cross( ray.D, edge2 );
-	const float a = dot( edge1, h );
-	if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle
-	const float f = 1 / a;
-	const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] );
-	const float u = f * dot( s, h );
-	if (u < 0 || u > 1) return;
-	const bvhvec3 q = cross( s, edge1 );
-	const float v = f * dot( ray.D, q );
-	if (v < 0 || u + v > 1) return;
-	const float t = f * dot( edge2, q );
-	if (t > 0 && t < ray.hit.t)
+	assert( alt2Node != 0 );
+	BVHNodeAlt2* node = &alt2Node[0], * stack[64];
+	unsigned int stackPtr = 0, steps = 0;
+	const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( ray.rD.x );
+	const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y );
+	const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z );
+	while (1)
 	{
-		// register a hit: ray is shortened to t
-		ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx;
+		steps++;
+		if (node->isLeaf())
+		{
+			for (unsigned int i = 0; i < node->triCount; i++)
+			{
+				const unsigned int tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3;
+				const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx];
+				const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx];
+				const bvhvec3 h = cross( ray.D, edge2 );
+				const float a = dot( edge1, h );
+				if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle
+				const float f = 1 / a;
+				const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] );
+				const float u = f * dot( s, h );
+				if (u < 0 || u > 1) continue;
+				const bvhvec3 q = cross( s, edge1 );
+				const float v = f * dot( ray.D, q );
+				if (v < 0 || u + v > 1) continue;
+				const float t = f * dot( edge2, q );
+				if (t < 0 || t > ray.hit.t) continue;
+				ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = tidx;
+			}
+			if (stackPtr == 0) break; else node = stack[--stackPtr];
+			continue;
+		}
+		__m128 x4 = _mm_mul_ps( _mm_sub_ps( node->xxxx, Ox4 ), rDx4 );
+		__m128 y4 = _mm_mul_ps( _mm_sub_ps( node->yyyy, Oy4 ), rDy4 );
+		__m128 z4 = _mm_mul_ps( _mm_sub_ps( node->zzzz, Oz4 ), rDz4 );
+		// transpose
+		__m128 t0 = _mm_unpacklo_ps( x4, y4 ), t2 = _mm_unpacklo_ps( z4, z4 );
+		__m128 t1 = _mm_unpackhi_ps( x4, y4 ), t3 = _mm_unpackhi_ps( z4, z4 );
+		__m128 xyzw1a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) );
+		__m128 xyzw2a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) );
+		__m128 xyzw1b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) );
+		__m128 xyzw2b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) );
+		// process
+		__m128 tmina4 = _mm_min_ps( xyzw1a, xyzw2a ), tmaxa4 = _mm_max_ps( xyzw1a, xyzw2a );
+		__m128 tminb4 = _mm_min_ps( xyzw1b, xyzw2b ), tmaxb4 = _mm_max_ps( xyzw1b, xyzw2b );
+		// transpose back
+		t0 = _mm_unpacklo_ps( tmina4, tmaxa4 ), t2 = _mm_unpacklo_ps( tminb4, tmaxb4 );
+		t1 = _mm_unpackhi_ps( tmina4, tmaxa4 ), t3 = _mm_unpackhi_ps( tminb4, tmaxb4 );
+		x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) );
+		y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) );
+		z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) );
+		const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() );
+		const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) );
+		// TODO: use a shuffle here to do the comparison / select with SSE, then extract dist1 and dist2. 
+		const float tmina_0 = LANE( min4, 0 ), tmaxa_1 = LANE( max4, 1 );
+		const float tminb_2 = LANE( min4, 2 ), tmaxb_3 = LANE( max4, 3 );
+		float dist1 = tmaxa_1 >= tmina_0 ? tmina_0 : 1e30f;
+		float dist2 = tmaxb_3 >= tminb_2 ? tminb_2 : 1e30f;
+		unsigned int lidx = node->left, ridx = node->right;
+		if (dist1 > dist2)
+		{
+			float t = dist1; dist1 = dist2; dist2 = t;
+			unsigned int i = lidx; lidx = ridx; ridx = i;
+		}
+		if (dist1 == 1e30f)
+		{
+			if (stackPtr == 0) break; else node = stack[--stackPtr];
+		}
+		else
+		{
+			node = alt2Node + lidx;
+			if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx;
+		}
 	}
+	return steps;
 }
 
-// IntersectAABB
-float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax )
+#else
+
+int BVH::Intersect_AltSoA( Ray& ray ) const
 {
-	// "slab test" ray/AABB intersection
-	float tx1 = (aabbMin.x - ray.O.x) * ray.rD.x, tx2 = (aabbMax.x - ray.O.x) * ray.rD.x;
-	float tmin = tinybvh_min( tx1, tx2 ), tmax = tinybvh_max( tx1, tx2 );
-	float ty1 = (aabbMin.y - ray.O.y) * ray.rD.y, ty2 = (aabbMax.y - ray.O.y) * ray.rD.y;
-	tmin = tinybvh_max( tmin, tinybvh_min( ty1, ty2 ) );
-	tmax = tinybvh_min( tmax, tinybvh_max( ty1, ty2 ) );
-	float tz1 = (aabbMin.z - ray.O.z) * ray.rD.z, tz2 = (aabbMax.z - ray.O.z) * ray.rD.z;
-	tmin = tinybvh_max( tmin, tinybvh_min( tz1, tz2 ) );
-	tmax = tinybvh_min( tmax, tinybvh_max( tz1, tz2 ) );
-	if (tmax >= tmin && tmin < ray.hit.t && tmax >= 0) return tmin; else return 1e30f;
+	// not implemented for platforms that do not support SSE/AVX.
+	assert( false );
+	return 0;
 }
 
-} // namespace tinybvh
+#endif // BVH_USEAVX
 
-#endif
+} // namespace tinybvh
 
-#endif
+#endif // TINYBVH_IMPLEMENTATION
 
-// EOF
+#endif // TINY_BVH_H_
\ No newline at end of file
diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp
index df1bb93..71af684 100644
--- a/tiny_bvh_speedtest.cpp
+++ b/tiny_bvh_speedtest.cpp
@@ -120,8 +120,10 @@ int main()
 
 	//  T I N Y _ B V H   P E R F O R M A N C E   M E A S U R E M E N T S
 
-	int minor = TINY_BVH_VERSION_MINOR, major = TINY_BVH_VERSION_MAJOR;
-	printf( "tiny_bvh version %i.%i performance statistics\n", major, minor );
+	int minor = TINY_BVH_VERSION_MINOR;
+	int major = TINY_BVH_VERSION_MAJOR;
+	int sub = TINY_BVH_VERSION_SUB;
+	printf( "tiny_bvh version %i.%i.%i performance statistics\n", major, minor, sub );
 	printf( "----------------------------------------------------------------\n" );
 
 	Timer t;