From 6feadfb612f015abf05f04b16127c34cf97527c0 Mon Sep 17 00:00:00 2001
From: jbikker <bikker.j@gmail.com>
Date: Mon, 4 Nov 2024 13:47:11 +0100
Subject: [PATCH] Added plain C packet traversal.

---
 tiny_bvh.h             | 102 +++++++++++++++++++++++++++++++++++++++++
 tiny_bvh_fenster.cpp   |  54 ++++++++++++++--------
 tiny_bvh_speedtest.cpp |  47 +++++++++++++++----
 3 files changed, 175 insertions(+), 28 deletions(-)

diff --git a/tiny_bvh.h b/tiny_bvh.h
index 320547f..16d8a37 100644
--- a/tiny_bvh.h
+++ b/tiny_bvh.h
@@ -320,6 +320,7 @@ class BVH
 	void BuildAVX( const bvhvec4* vertices, const unsigned int primCount );
 	void Refit();
 	int Intersect( Ray& ray ) const;
+	void Intersect256Rays( Ray* first ) const;
 private:
 	void IntersectTri( Ray& ray, const unsigned int triIdx ) const;
 	static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax );
@@ -717,6 +718,107 @@ int BVH::Intersect( Ray& ray ) const
 	return steps;
 }
 
+// Intersect a BVH with a ray packet.
+// The 256 rays travel together to better utilize the caches and to amortize the cost 
+// of memory transfers over the rays in the bundle.
+// Note that this basic implementation assumes a specific layout of the rays. Provided
+// as 'proof of concept', should not be used in production code.
+void BVH::Intersect256Rays( Ray* packet ) const
+{
+	// Corner rays are: 0, 51, 204 and 255
+	// Construct the bounding planes, with normals pointing outwards
+	bvhvec3 O = packet[0].O; // same for all rays in this case
+	bvhvec3 p0 = packet[0].O + packet[0].D; // top-left
+	bvhvec3 p1 = packet[51].O + packet[51].D; // top-right
+	bvhvec3 p2 = packet[204].O + packet[204].D; // bottom-left
+	bvhvec3 p3 = packet[255].O + packet[255].D; // bottom-right
+	bvhvec3 plane0 = normalize( cross( p0 - O, p0 - p2 ) ); // left plane
+	bvhvec3 plane1 = normalize( cross( p3 - O, p3 - p1 ) ); // right plane
+	bvhvec3 plane2 = normalize( cross( p1 - O, p1 - p0 ) ); // top plane
+	bvhvec3 plane3 = normalize( cross( p2 - O, p2 - p3 ) ); // bottom plane
+	float t0 = dot( O, plane0 ), t1 = dot( O, plane1 );
+	float t2 = dot( O, plane2 ), t3 = dot( O, plane3 );
+	// Traverse the tree with the packet
+	int first = 0, last = 255, tmp; // first and last active ray in the packet
+	BVHNode* node = &bvhNode[0], * stack[64];
+	unsigned int stackPtr = 0, firstLast[64];
+	while (1)
+	{
+		// 1. Early-in test: if first ray hits the node, the packet visits the node
+		bool earlyHit = IntersectAABB( packet[first], node->aabbMin, node->aabbMax ) < 1e30f;
+		// 2. Early-out test: if the node aabb is outside the four planes, we skip the node
+		if (!earlyHit)
+		{
+			const bvhvec3 bmin = node->aabbMin, bmax = node->aabbMax;
+			bvhvec3 p0( plane0.x < 0 ? bmax.x : bmin.x, plane0.y < 0 ? bmax.y : bmin.y, plane0.z < 0 ? bmax.z : bmin.z );
+			bvhvec3 p1( plane1.x < 0 ? bmax.x : bmin.x, plane1.y < 0 ? bmax.y : bmin.y, plane1.z < 0 ? bmax.z : bmin.z );
+			bvhvec3 p2( plane2.x < 0 ? bmax.x : bmin.x, plane2.y < 0 ? bmax.y : bmin.y, plane2.z < 0 ? bmax.z : bmin.z );
+			bvhvec3 p3( plane3.x < 0 ? bmax.x : bmin.x, plane3.y < 0 ? bmax.y : bmin.y, plane3.z < 0 ? bmax.z : bmin.z );
+			if (dot( p0, plane0 ) > t0 || dot( p1, plane1 ) > t1 || dot( p2, plane2 ) > t2 || dot( p3, plane3 ) > t3)
+			{
+				if (stackPtr == 0) break; else // pop
+					node = stack[--stackPtr], tmp = firstLast[stackPtr], 
+					first = tmp >> 8, last = tmp & 255;
+			}
+			else
+			{
+				// 3. Last resort: update first and last, stay in node if first > last
+				for( ; first <= last; first++ )
+					if (IntersectAABB( packet[first], node->aabbMin, node->aabbMax ) < 1e30f) break;
+				for( ; last >= first; last-- )
+					if (IntersectAABB( packet[last], node->aabbMin, node->aabbMax ) < 1e30f) break;
+			}
+		}
+		// process result
+		if (last >= first)
+		{
+			if (node->isLeaf())
+			{
+				for (unsigned int j = 0; j < node->triCount; j++) 
+				{
+					const unsigned int idx = triIdx[node->leftFirst + j];
+					const unsigned int vertIdx = idx * 3;
+					const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx];
+					const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx];
+					for( int i = first; i <= last; i++ )
+					{
+						// Moeller-Trumbore ray/triangle intersection algorithm
+						Ray& ray = packet[i];
+						const bvhvec3 h = cross( ray.D, edge2 );
+						const float a = dot( edge1, h );
+						if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle
+						const float f = 1 / a;
+						const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] );
+						const float u = f * dot( s, h );
+						if (u < 0 || u > 1) continue;
+						const bvhvec3 q = cross( s, edge1 );
+						const float v = f * dot( ray.D, q );
+						if (v < 0 || u + v > 1) continue;
+						const float t = f * dot( edge2, q );
+						if (t > 0 && t < ray.hit.t)
+						{
+							// register a hit: ray is shortened to t
+							ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx;
+						}
+					}
+				}
+				if (stackPtr == 0) break; else // pop
+					node = stack[--stackPtr], tmp = firstLast[stackPtr], 
+					first = tmp >> 8, last = tmp & 255;
+			}
+			else
+			{
+				firstLast[stackPtr] = (first << 8) + last;
+				stack[stackPtr++] = &bvhNode[node->leftFirst + 1];
+				node = &bvhNode[node->leftFirst];
+			}
+		}
+		else if (stackPtr == 0) break; else // pop
+			node = stack[--stackPtr], tmp = firstLast[stackPtr], 
+			first = tmp >> 8, last = tmp & 255;
+	}
+}
+
 // IntersectTri
 void BVH::IntersectTri( Ray& ray, const unsigned int idx ) const
 {
diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp
index 0b0df62..3078252 100644
--- a/tiny_bvh_fenster.cpp
+++ b/tiny_bvh_fenster.cpp
@@ -49,37 +49,53 @@ void Tick( uint32_t* buf )
 	bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view;
 	bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up;
 
-	// generate primary rays in a buffer
+	// generate primary rays in a cacheline-aligned buffer - and, for data locality:
+	// organized in 4x4 pixel tiles, 16 samples per pixel, so 256 rays per tile.
 	int N = 0;
-	Ray* rays = new Ray[SCRWIDTH * SCRHEIGHT * 16];
-	for (int y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++)
+	Ray* rays = (Ray*)ALIGNED_MALLOC( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) );
+	for( int ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ )
 	{
-		for (int s = 0; s < 16; s++) // 16 samples per pixel
+		for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++)
 		{
-			float u = (float)(x * 4 + (s & 3)) / (SCRWIDTH * 4);
-			float v = (float)(y * 4 + (s >> 2)) / (SCRHEIGHT * 4);
-			bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1);
-			rays[N++] = Ray( eye, normalize( P - eye ) );
+			int pixel_x = tx * 4 + x;
+			int pixel_y = ty * 4 + y;
+			for (int s = 0; s < 16; s++) // 16 samples per pixel
+			{
+				float u = (float)(pixel_x * 4 + (s & 3)) / (SCRWIDTH * 4);
+				float v = (float)(pixel_y * 4 + (s >> 2)) / (SCRHEIGHT * 4);
+				bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1);
+				rays[N++] = Ray( eye, normalize( P - eye ) );
+			}
 		}
 	}
 	
 	// trace primary rays
+#if 1
+	const int packetCount = N / 256;
+	for (int i = 0; i < packetCount; i++) bvh.Intersect256Rays( rays + i * 256 );
+#else
 	for (int i = 0; i < N; i++) bvh.Intersect( rays[i] );
+#endif
 	
 	// visualize result
-	for (int i = 0, y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++)
+	for( int i = 0, ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ )
 	{
-		float avg = 0;
-		for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 1000)
+		for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++)
 		{
-			int primIdx = rays[i].hit.prim;
-			bvhvec3 v0 = triangles[primIdx * 3 + 0];
-			bvhvec3 v1 = triangles[primIdx * 3 + 1];
-			bvhvec3 v2 = triangles[primIdx * 3 + 2];
-			bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) );
-			avg += fabs( dot( N, normalize( bvhvec3( 1, 2, 3 ) ) ) );
+			int pixel_x = tx * 4 + x;
+			int pixel_y = ty * 4 + y;
+			float avg = 0;
+			for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 1000)
+			{
+				int primIdx = rays[i].hit.prim;
+				bvhvec3 v0 = triangles[primIdx * 3 + 0];
+				bvhvec3 v1 = triangles[primIdx * 3 + 1];
+				bvhvec3 v2 = triangles[primIdx * 3 + 2];
+				bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) );
+				avg += fabs( dot( N, normalize( bvhvec3( 1, 2, 3 ) ) ) );
+			}
+			int c = (int)(15.9f * avg);
+			buf[pixel_x + pixel_y * SCRWIDTH] = c + (c << 8) + (c << 16);
 		}
-		int c = (int)(15.9f * avg);
-		buf[x + y * SCRWIDTH] = c + (c << 8) + (c << 16);
 	}
 }
diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp
index cd5e75c..f0b569c 100644
--- a/tiny_bvh_speedtest.cpp
+++ b/tiny_bvh_speedtest.cpp
@@ -67,17 +67,23 @@ int main()
 	bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view;
 	bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up;
 
-	// generate primary rays in a cacheline-aligned buffer
+	// generate primary rays in a cacheline-aligned buffer - and, for data locality:
+	// organized in 4x4 pixel tiles, 16 samples per pixel, so 256 rays per tile.
 	int N = 0;
 	Ray* rays = (Ray*)ALIGNED_MALLOC( SCRWIDTH * SCRHEIGHT * 16 * sizeof( Ray ) );
-	for (int y = 0; y < SCRHEIGHT; y++) for (int x = 0; x < SCRWIDTH; x++)
+	for( int ty = 0; ty < SCRHEIGHT / 4; ty++ ) for( int tx = 0; tx < SCRWIDTH / 4; tx++ )
 	{
-		for (int s = 0; s < 16; s++) // 16 samples per pixel
+		for (int y = 0; y < 4; y++) for (int x = 0; x < 4; x++)
 		{
-			float u = (float)(x * 4 + (s & 3)) / (SCRWIDTH * 4);
-			float v = (float)(y * 4 + (s >> 2)) / (SCRHEIGHT * 4);
-			bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1);
-			rays[N++] = Ray( eye, normalize( P - eye ) );
+			int pixel_x = tx * 4 + x;
+			int pixel_y = ty * 4 + y;
+			for (int s = 0; s < 16; s++) // 16 samples per pixel
+			{
+				float u = (float)(pixel_x * 4 + (s & 3)) / (SCRWIDTH * 4);
+				float v = (float)(pixel_y * 4 + (s >> 2)) / (SCRHEIGHT * 4);
+				bvhvec3 P = p1 + u * (p2 - p1) + v * (p3 - p1);
+				rays[N++] = Ray( eye, normalize( P - eye ) );
+			}
 		}
 	}
 
@@ -88,12 +94,15 @@ int main()
 	printf( "----------------------------------------------------------------\n" );
 
 	Timer t;
+	float mrays;
 
 	// measure single-core bvh construction time - warming caches
 	printf( "BVH construction speed\n" );
 	printf( "warming caches...\n" );
 	bvh.Build( (bvhvec4*)triangles, verts / 3 );
 
+#if 1
+
 	// measure single-core bvh construction time - reference builder
 	t.reset();
 	printf( "- reference builder: " );
@@ -125,7 +134,7 @@ int main()
 	for (int pass = 0; pass < 3; pass++)
 		for (int i = 0; i < N; i++) bvh.Intersect( rays[i] );
 	float traceTimeST = t.elapsed() / 3.0f;
-	float mrays = (float)N / traceTimeST;
+	mrays = (float)N / traceTimeST;
 	printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeST * 1000, (float)N * 1e-6f, mrays * 1e-6f );
 
 	// trace all rays three times to estimate average performance
@@ -146,7 +155,27 @@ int main()
 	mrays = (float)N / traceTimeMT;
 	printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeMT * 1000, (float)N * 1e-6f, mrays * 1e-6f );
 
-	// shuffle rays for the next experiment
+#endif
+
+	// trace all rays three times to estimate average performance
+	// - coherent distribution, multi-core, packet traversal
+	t.reset();
+	printf( "- CPU, coherent, basic 2-way layout, MT, packets:  " );
+	for (int j = 0; j < 3; j++)
+	{
+		const int batchCount = N / (30 * 256); // batches of 30 packets of 256 rays
+	#pragma omp parallel for schedule(dynamic)
+		for (int batch = 0; batch < batchCount; batch++)
+		{
+			const int batchStart = batch * 30 * 256;
+			for (int i = 0; i < 30; i++) bvh.Intersect256Rays( rays + batchStart + i * 256 );
+		}
+	}
+	float traceTimeMTP = t.elapsed() / 3.0f;
+	mrays = (float)N / traceTimeMTP;
+	printf( "%.2fms for %.2fM rays (%.2fMRays/s)\n", traceTimeMTP * 1000, (float)N * 1e-6f, mrays * 1e-6f );
+
+	// shuffle rays for the next experiment - TODO: replace by random bounce
 	for( int i = 0; i < N; i++ )
 	{
 		int j = (i + 17 * rand()) % N;